commit e2a661e405bc1db5fd811b9151797c3b5f238f8c Author: ModelHub XC Date: Thu May 7 04:58:32 2026 +0800 初始化项目,由ModelHub XC社区提供模型 Model: sampluralis/llama-sft-proj Source: Original Platform diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..52373fe --- /dev/null +++ b/.gitattributes @@ -0,0 +1,36 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text +tokenizer.json filter=lfs diff=lfs merge=lfs -text diff --git a/README.md b/README.md new file mode 100644 index 0000000..36dea69 --- /dev/null +++ b/README.md @@ -0,0 +1,57 @@ +--- +library_name: transformers +model_name: llama-sft-proj +tags: +- generated_from_trainer +- sft +- trl +- alignment-handbook +licence: license +--- + +# Model Card for llama-sft-proj + +This model is a fine-tuned version of [None](https://huggingface.co/None). +It has been trained using [TRL](https://github.com/huggingface/trl). + +## Quick start + +```python +from transformers import pipeline + +question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?" +generator = pipeline("text-generation", model="sampluralis/llama-sft-proj", device="cuda") +output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0] +print(output["generated_text"]) +``` + +## Training procedure + +[Visualize in Weights & Biases](https://wandb.ai/ajanthan-pluralis-research/huggingface/runs/yvxwr9l0) + + +This model was trained with SFT. + +### Framework versions + +- TRL: 0.28.0 +- Transformers: 4.57.6 +- Pytorch: 2.6.0+cu126 +- Datasets: 4.6.0 +- Tokenizers: 0.22.2 + +## Citations + + + +Cite TRL as: + +```bibtex +@software{vonwerra2020trl, + title = {{TRL: Transformers Reinforcement Learning}}, + author = {von Werra, Leandro and Belkada, Younes and Tunstall, Lewis and Beeching, Edward and Thrush, Tristan and Lambert, Nathan and Huang, Shengyi and Rasul, Kashif and Gallouédec, Quentin}, + license = {Apache-2.0}, + url = {https://github.com/huggingface/trl}, + year = {2020} +} +``` \ No newline at end of file diff --git a/all_results.json b/all_results.json new file mode 100644 index 0000000..e01f2ce --- /dev/null +++ b/all_results.json @@ -0,0 +1,8 @@ +{ + "total_flos": 5.519232355835209e+19, + "train_loss": 0.0698729722998863, + "train_runtime": 8612.5861, + "train_samples": 1444084, + "train_samples_per_second": 17.173, + "train_steps_per_second": 2.147 +} \ No newline at end of file diff --git a/chat_template.jinja b/chat_template.jinja new file mode 100644 index 0000000..b481759 --- /dev/null +++ b/chat_template.jinja @@ -0,0 +1,96 @@ +{# ───── defaults ───── #} +{%- if enable_thinking is not defined -%} +{%- set enable_thinking = true -%} +{%- endif -%} + +{# ───── reasoning mode ───── #} +{%- if enable_thinking -%} + {%- set reasoning_mode = "/think" -%} +{%- else -%} + {%- set reasoning_mode = "/no_think" -%} +{%- endif -%} + +{# ───── header (system message) ───── #} +{{- "<|im_start|>system\n" -}} + +{%- if messages[0].role == "system" -%} + {%- set system_message = messages[0].content -%} + {%- if "/no_think" in system_message -%} + {%- set reasoning_mode = "/no_think" -%} + {%- elif "/think" in system_message -%} + {%- set reasoning_mode = "/think" -%} + {%- endif -%} + {%- set custom_instructions = system_message.replace("/no_think", "").replace("/think", "").rstrip() -%} +{%- endif -%} + +{%- if "/system_override" in system_message -%} + {{- custom_instructions.replace("/system_override", "").rstrip() -}} + {{- "<|im_end|>\n" -}} +{%- else -%} + {{- "## Metadata\n\n" -}} + {{- "Knowledge Cutoff Date: June 2025\n" -}} + {%- set today = strftime_now("%d %B %Y") -%} + {{- "Today Date: " ~ today ~ "\n" -}} + {{- "Reasoning Mode: " + reasoning_mode + "\n\n" -}} + + {{- "## Custom Instructions\n\n" -}} + {%- if custom_instructions -%} + {{- custom_instructions + "\n\n" -}} + {%- elif reasoning_mode == "/think" -%} + {{- "You are a helpful AI assistant named SmolLM, trained by Hugging Face. Your role as an assistant involves thoroughly exploring questions through a systematic thinking process before providing the final precise and accurate solutions. This requires engaging in a comprehensive cycle of analysis, summarizing, exploration, reassessment, reflection, backtracking, and iteration to develop well-considered thinking process. Please structure your response into two main sections: Thought and Solution using the specified format: Thought section Solution section. In the Thought section, detail your reasoning process in steps. Each step should include detailed considerations such as analysing questions, summarizing relevant findings, brainstorming new ideas, verifying the accuracy of the current steps, refining any errors, and revisiting previous steps. In the Solution section, based on various attempts, explorations, and reflections from the Thought section, systematically present the final solution that you deem correct. The Solution section should be logical, accurate, and concise and detail necessary steps needed to reach the conclusion.\n\n" -}} + {%- else -%} + {{- "You are a helpful AI assistant named SmolLM, trained by Hugging Face.\n\n" -}} + {%- endif -%} + + {{- "## Tools\n\n" -}} + {{- "### XML Tools\n\n" -}} + {%- if tools -%} + {%- set ns = namespace(xml_tool_string="You may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n\n\n") -%} + {%- for tool in tools -%} + {%- set ns.xml_tool_string = ns.xml_tool_string ~ (tool | tojson) ~ "\n" -%} + {%- endfor -%} + {%- set xml_tools = ns.xml_tool_string + "\n\nFor each function call, return a json object with function name and arguments within XML tags." -%} + {%- endif -%} + {%- if xml_tools -%} + {{- xml_tools -}} + {%- else -%} + {{- "None" -}} + {%- endif -%} + {{- "\n\n" -}} + {{- "### Python Tools\n\n" -}} + {%- if python_tools -%} + {{- python_tools -}} + {%- else -%} + {{- "None" -}} + {%- endif -%} + {{- "\n\n" -}} + {{- "<|im_end|>\n" -}} +{%- endif -%} + +{# ───── main loop ───── #} +{%- for message in messages -%} + {%- set content = message.content if message.content is string else "" -%} + {%- if message.role == "user" -%} + {{ "<|im_start|>" + message.role + "\n" + content + "<|im_end|>\n" }} + {%- elif message.role == "assistant" -%} + {% generation %} + {%- if reasoning_mode == "/think" -%} + {{ "<|im_start|>assistant\n" + content.lstrip("\n") + "<|im_end|>\n" }} + {%- else -%} + {{ "<|im_start|>assistant\n" + "\n\n\n" + content.lstrip("\n") + "<|im_end|>\n" }} + {%- endif -%} + {% endgeneration %} + + {%- elif message.role == "tool" -%} + {{ "<|im_start|>" + "user\n" + content + "<|im_end|>\n" }} + {%- endif -%} +{%- endfor -%} + +{# ───── generation prompt ───── #} +{%- if add_generation_prompt -%} + {%- if reasoning_mode == "/think" -%} + {{ "<|im_start|>assistant\n" }} + {%- else -%} + {{ "<|im_start|>assistant\n" + "\n\n\n" }} + {%- endif -%} +{%- endif -%} \ No newline at end of file diff --git a/config.json b/config.json new file mode 100644 index 0000000..1517fca --- /dev/null +++ b/config.json @@ -0,0 +1,36 @@ +{ + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "dtype": "bfloat16", + "eos_token_id": 128012, + "head_dim": 64, + "hidden_act": "silu", + "hidden_size": 2048, + "initializer_range": 0.02, + "intermediate_size": 8192, + "max_position_embeddings": 131072, + "mlp_bias": false, + "model_type": "llama", + "num_attention_heads": 32, + "num_hidden_layers": 16, + "num_key_value_heads": 8, + "pad_token_id": 128012, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 32.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + }, + "rope_theta": 500000.0, + "tie_word_embeddings": true, + "transformers_version": "4.57.6", + "use_cache": true, + "vocab_size": 128256 +} diff --git a/generation_config.json b/generation_config.json new file mode 100644 index 0000000..6bbb3c8 --- /dev/null +++ b/generation_config.json @@ -0,0 +1,10 @@ +{ + "_from_model_config": true, + "bos_token_id": 128000, + "do_sample": true, + "eos_token_id": 128012, + "pad_token_id": 128012, + "temperature": 0.6, + "top_p": 0.9, + "transformers_version": "4.57.6" +} diff --git a/model.safetensors b/model.safetensors new file mode 100644 index 0000000..87625d3 --- /dev/null +++ b/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ef9a56095d11bf5f8b49fb6dc6d2af7c1537249fa69d2cd1813f8315b5554218 +size 2996982344 diff --git a/special_tokens_map.json b/special_tokens_map.json new file mode 100644 index 0000000..eb8ad06 --- /dev/null +++ b/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": "<|im_end|>", + "pad_token": { + "content": "<|im_end|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/tokenizer.json b/tokenizer.json new file mode 100644 index 0000000..f342589 --- /dev/null +++ b/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4e7c979daf2c715603b21e094ce7e032280b007311a070cdf98ed708c492d614 +size 17209792 diff --git a/tokenizer_config.json b/tokenizer_config.json new file mode 100644 index 0000000..636c7ef --- /dev/null +++ b/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|python_tag|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|im_start|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|im_end|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "128014": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "128015": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "128016": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "128017": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "128018": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "128019": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "clean_up_tokenization_spaces": true, + "eos_token": "<|im_end|>", + "extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|im_end|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/train_results.json b/train_results.json new file mode 100644 index 0000000..e01f2ce --- /dev/null +++ b/train_results.json @@ -0,0 +1,8 @@ +{ + "total_flos": 5.519232355835209e+19, + "train_loss": 0.0698729722998863, + "train_runtime": 8612.5861, + "train_samples": 1444084, + "train_samples_per_second": 17.173, + "train_steps_per_second": 2.147 +} \ No newline at end of file diff --git a/trainer_state.json b/trainer_state.json new file mode 100644 index 0000000..728a62a --- /dev/null +++ b/trainer_state.json @@ -0,0 +1,166453 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 5.0, + "eval_steps": 500, + "global_step": 18490, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00027041644131963225, + "grad_norm": 3.8694989681243896, + "learning_rate": 0.0, + "loss": 1.3757, + "mean_token_accuracy": 0.6467338800430298, + "num_tokens": 508614.0, + "step": 1 + }, + { + "epoch": 0.0005408328826392645, + "grad_norm": 4.610819339752197, + "learning_rate": 3.603603603603604e-08, + "loss": 1.4469, + "mean_token_accuracy": 0.6426355838775635, + "num_tokens": 1000933.0, + "step": 2 + }, + { + "epoch": 0.0008112493239588967, + "grad_norm": 2.489088773727417, + "learning_rate": 7.207207207207208e-08, + "loss": 1.2335, + "mean_token_accuracy": 0.6757146716117859, + "num_tokens": 1525119.0, + "step": 3 + }, + { + "epoch": 0.001081665765278529, + "grad_norm": 5.70018196105957, + "learning_rate": 1.0810810810810812e-07, + "loss": 1.3594, + "mean_token_accuracy": 0.6718229055404663, + "num_tokens": 2040588.0, + "step": 4 + }, + { + "epoch": 0.0013520822065981612, + "grad_norm": 4.569960594177246, + "learning_rate": 1.4414414414414417e-07, + "loss": 1.3788, + "mean_token_accuracy": 0.6444813013076782, + "num_tokens": 2518672.0, + "step": 5 + }, + { + "epoch": 0.0016224986479177934, + "grad_norm": 3.296457290649414, + "learning_rate": 1.801801801801802e-07, + "loss": 1.3781, + "mean_token_accuracy": 0.6521331071853638, + "num_tokens": 3042747.0, + "step": 6 + }, + { + "epoch": 0.0018929150892374256, + "grad_norm": 3.446957588195801, + "learning_rate": 2.1621621621621625e-07, + "loss": 1.4219, + "mean_token_accuracy": 0.6443007588386536, + "num_tokens": 3566859.0, + "step": 7 + }, + { + "epoch": 0.002163331530557058, + "grad_norm": 3.344780206680298, + "learning_rate": 2.5225225225225225e-07, + "loss": 1.3359, + "mean_token_accuracy": 0.6691887378692627, + "num_tokens": 4091138.0, + "step": 8 + }, + { + "epoch": 0.0024337479718766902, + "grad_norm": 3.836761951446533, + "learning_rate": 2.8828828828828833e-07, + "loss": 1.3925, + "mean_token_accuracy": 0.6502911448478699, + "num_tokens": 4615417.0, + "step": 9 + }, + { + "epoch": 0.0027041644131963224, + "grad_norm": 3.356236696243286, + "learning_rate": 3.2432432432432436e-07, + "loss": 1.3638, + "mean_token_accuracy": 0.6510025262832642, + "num_tokens": 5139696.0, + "step": 10 + }, + { + "epoch": 0.0029745808545159546, + "grad_norm": 129.86680603027344, + "learning_rate": 3.603603603603604e-07, + "loss": 10.818, + "mean_token_accuracy": 0.010664675384759903, + "num_tokens": 5601795.0, + "step": 11 + }, + { + "epoch": 0.003244997295835587, + "grad_norm": 388.60003662109375, + "learning_rate": 3.963963963963964e-07, + "loss": 11.0565, + "mean_token_accuracy": 0.010503498837351799, + "num_tokens": 6070287.0, + "step": 12 + }, + { + "epoch": 0.003515413737155219, + "grad_norm": 121.23548126220703, + "learning_rate": 4.324324324324325e-07, + "loss": 10.7779, + "mean_token_accuracy": 0.009555651806294918, + "num_tokens": 6594549.0, + "step": 13 + }, + { + "epoch": 0.0037858301784748512, + "grad_norm": 203.7624053955078, + "learning_rate": 4.684684684684685e-07, + "loss": 10.6838, + "mean_token_accuracy": 0.010214762762188911, + "num_tokens": 7118826.0, + "step": 14 + }, + { + "epoch": 0.004056246619794483, + "grad_norm": 193.425537109375, + "learning_rate": 5.045045045045045e-07, + "loss": 10.7208, + "mean_token_accuracy": 0.009682649746537209, + "num_tokens": 7642988.0, + "step": 15 + }, + { + "epoch": 0.004326663061114116, + "grad_norm": 105.63664245605469, + "learning_rate": 5.405405405405406e-07, + "loss": 10.9236, + "mean_token_accuracy": 0.009651056490838528, + "num_tokens": 8167205.0, + "step": 16 + }, + { + "epoch": 0.004597079502433748, + "grad_norm": 190.54000854492188, + "learning_rate": 5.765765765765767e-07, + "loss": 10.7224, + "mean_token_accuracy": 0.010925070382654667, + "num_tokens": 8691425.0, + "step": 17 + }, + { + "epoch": 0.0048674959437533805, + "grad_norm": 191.1670379638672, + "learning_rate": 6.126126126126126e-07, + "loss": 10.7396, + "mean_token_accuracy": 0.010759414173662663, + "num_tokens": 9215644.0, + "step": 18 + }, + { + "epoch": 0.005137912385073012, + "grad_norm": 114.18309783935547, + "learning_rate": 6.486486486486487e-07, + "loss": 10.5453, + "mean_token_accuracy": 0.01090940646827221, + "num_tokens": 9727459.0, + "step": 19 + }, + { + "epoch": 0.005408328826392645, + "grad_norm": 229.23069763183594, + "learning_rate": 6.846846846846847e-07, + "loss": 10.7776, + "mean_token_accuracy": 0.010439380072057247, + "num_tokens": 10251737.0, + "step": 20 + }, + { + "epoch": 0.005678745267712277, + "grad_norm": 3.441542625427246, + "learning_rate": 7.207207207207208e-07, + "loss": 1.3177, + "mean_token_accuracy": 0.6685017347335815, + "num_tokens": 10775964.0, + "step": 21 + }, + { + "epoch": 0.005949161709031909, + "grad_norm": 146.6537322998047, + "learning_rate": 7.567567567567569e-07, + "loss": 10.5732, + "mean_token_accuracy": 0.011354492045938969, + "num_tokens": 11300165.0, + "step": 22 + }, + { + "epoch": 0.006219578150351541, + "grad_norm": 251.84051513671875, + "learning_rate": 7.927927927927928e-07, + "loss": 10.73, + "mean_token_accuracy": 0.010129565373063087, + "num_tokens": 11824394.0, + "step": 23 + }, + { + "epoch": 0.006489994591671174, + "grad_norm": 103.12158966064453, + "learning_rate": 8.288288288288289e-07, + "loss": 10.2844, + "mean_token_accuracy": 0.01195601187646389, + "num_tokens": 12300420.0, + "step": 24 + }, + { + "epoch": 0.006760411032990805, + "grad_norm": 129.74795532226562, + "learning_rate": 8.64864864864865e-07, + "loss": 10.0109, + "mean_token_accuracy": 0.011580554768443108, + "num_tokens": 12820475.0, + "step": 25 + }, + { + "epoch": 0.007030827474310438, + "grad_norm": 117.20321655273438, + "learning_rate": 9.00900900900901e-07, + "loss": 10.0571, + "mean_token_accuracy": 0.01201096922159195, + "num_tokens": 13344739.0, + "step": 26 + }, + { + "epoch": 0.007301243915630071, + "grad_norm": 89.86516571044922, + "learning_rate": 9.36936936936937e-07, + "loss": 9.9758, + "mean_token_accuracy": 0.013772892765700817, + "num_tokens": 13857369.0, + "step": 27 + }, + { + "epoch": 0.0075716603569497025, + "grad_norm": 86.30531311035156, + "learning_rate": 9.72972972972973e-07, + "loss": 9.8981, + "mean_token_accuracy": 0.013154538348317146, + "num_tokens": 14381642.0, + "step": 28 + }, + { + "epoch": 0.007842076798269334, + "grad_norm": 80.77513885498047, + "learning_rate": 1.009009009009009e-06, + "loss": 9.8995, + "mean_token_accuracy": 0.013261919841170311, + "num_tokens": 14905801.0, + "step": 29 + }, + { + "epoch": 0.008112493239588967, + "grad_norm": 96.3874282836914, + "learning_rate": 1.0450450450450452e-06, + "loss": 9.4232, + "mean_token_accuracy": 0.018942994996905327, + "num_tokens": 15351015.0, + "step": 30 + }, + { + "epoch": 0.0083829096809086, + "grad_norm": 61.652793884277344, + "learning_rate": 1.0810810810810812e-06, + "loss": 9.4877, + "mean_token_accuracy": 0.01758813112974167, + "num_tokens": 15839774.0, + "step": 31 + }, + { + "epoch": 0.008653326122228232, + "grad_norm": 56.73585891723633, + "learning_rate": 1.1171171171171171e-06, + "loss": 9.3242, + "mean_token_accuracy": 0.01686837524175644, + "num_tokens": 16364041.0, + "step": 32 + }, + { + "epoch": 0.008923742563547863, + "grad_norm": 48.058780670166016, + "learning_rate": 1.1531531531531533e-06, + "loss": 9.4584, + "mean_token_accuracy": 0.01651872508227825, + "num_tokens": 16888240.0, + "step": 33 + }, + { + "epoch": 0.009194159004867496, + "grad_norm": 42.81907653808594, + "learning_rate": 1.1891891891891893e-06, + "loss": 9.3727, + "mean_token_accuracy": 0.016442518681287766, + "num_tokens": 17412330.0, + "step": 34 + }, + { + "epoch": 0.009464575446187128, + "grad_norm": 49.29421615600586, + "learning_rate": 1.2252252252252253e-06, + "loss": 8.9256, + "mean_token_accuracy": 0.01840662769973278, + "num_tokens": 17936498.0, + "step": 35 + }, + { + "epoch": 0.009734991887506761, + "grad_norm": 41.52471160888672, + "learning_rate": 1.2612612612612613e-06, + "loss": 9.3099, + "mean_token_accuracy": 0.016707822680473328, + "num_tokens": 18460615.0, + "step": 36 + }, + { + "epoch": 0.010005408328826392, + "grad_norm": 41.7969856262207, + "learning_rate": 1.2972972972972974e-06, + "loss": 9.0135, + "mean_token_accuracy": 0.019075367599725723, + "num_tokens": 18924415.0, + "step": 37 + }, + { + "epoch": 0.010275824770146024, + "grad_norm": 40.73152160644531, + "learning_rate": 1.3333333333333334e-06, + "loss": 9.0953, + "mean_token_accuracy": 0.01932756043970585, + "num_tokens": 19448668.0, + "step": 38 + }, + { + "epoch": 0.010546241211465657, + "grad_norm": 44.895748138427734, + "learning_rate": 1.3693693693693694e-06, + "loss": 8.9109, + "mean_token_accuracy": 0.020414330065250397, + "num_tokens": 19911783.0, + "step": 39 + }, + { + "epoch": 0.01081665765278529, + "grad_norm": 42.104637145996094, + "learning_rate": 1.4054054054054056e-06, + "loss": 8.7359, + "mean_token_accuracy": 0.01844700053334236, + "num_tokens": 20435913.0, + "step": 40 + }, + { + "epoch": 0.011087074094104922, + "grad_norm": 1.5165314674377441, + "learning_rate": 1.4414414414414416e-06, + "loss": 1.2588, + "mean_token_accuracy": 0.6719189882278442, + "num_tokens": 20960135.0, + "step": 41 + }, + { + "epoch": 0.011357490535424553, + "grad_norm": 28.99015235900879, + "learning_rate": 1.4774774774774775e-06, + "loss": 8.3399, + "mean_token_accuracy": 0.028626924380660057, + "num_tokens": 21419709.0, + "step": 42 + }, + { + "epoch": 0.011627906976744186, + "grad_norm": 20.746410369873047, + "learning_rate": 1.5135135135135137e-06, + "loss": 8.5188, + "mean_token_accuracy": 0.023226981982588768, + "num_tokens": 21943872.0, + "step": 43 + }, + { + "epoch": 0.011898323418063819, + "grad_norm": 17.279563903808594, + "learning_rate": 1.5495495495495497e-06, + "loss": 8.3892, + "mean_token_accuracy": 0.02788066491484642, + "num_tokens": 22468031.0, + "step": 44 + }, + { + "epoch": 0.012168739859383451, + "grad_norm": 17.50021743774414, + "learning_rate": 1.5855855855855857e-06, + "loss": 8.2174, + "mean_token_accuracy": 0.029250985011458397, + "num_tokens": 22992131.0, + "step": 45 + }, + { + "epoch": 0.012439156300703082, + "grad_norm": 16.623798370361328, + "learning_rate": 1.6216216216216219e-06, + "loss": 8.5392, + "mean_token_accuracy": 0.03216709941625595, + "num_tokens": 23461592.0, + "step": 46 + }, + { + "epoch": 0.012709572742022715, + "grad_norm": 17.092300415039062, + "learning_rate": 1.6576576576576578e-06, + "loss": 8.5551, + "mean_token_accuracy": 0.028837447986006737, + "num_tokens": 23975510.0, + "step": 47 + }, + { + "epoch": 0.012979989183342347, + "grad_norm": 15.909649848937988, + "learning_rate": 1.6936936936936938e-06, + "loss": 7.8719, + "mean_token_accuracy": 0.03491545096039772, + "num_tokens": 24499651.0, + "step": 48 + }, + { + "epoch": 0.01325040562466198, + "grad_norm": 14.549489974975586, + "learning_rate": 1.72972972972973e-06, + "loss": 7.9347, + "mean_token_accuracy": 0.03177408501505852, + "num_tokens": 24980867.0, + "step": 49 + }, + { + "epoch": 0.01352082206598161, + "grad_norm": 13.921740531921387, + "learning_rate": 1.765765765765766e-06, + "loss": 7.9164, + "mean_token_accuracy": 0.03855527192354202, + "num_tokens": 25483413.0, + "step": 50 + }, + { + "epoch": 0.013791238507301243, + "grad_norm": 13.993817329406738, + "learning_rate": 1.801801801801802e-06, + "loss": 7.7834, + "mean_token_accuracy": 0.0320182703435421, + "num_tokens": 25996420.0, + "step": 51 + }, + { + "epoch": 0.014061654948620876, + "grad_norm": 14.105549812316895, + "learning_rate": 1.8378378378378381e-06, + "loss": 7.7039, + "mean_token_accuracy": 0.03672121465206146, + "num_tokens": 26520681.0, + "step": 52 + }, + { + "epoch": 0.014332071389940509, + "grad_norm": 15.845444679260254, + "learning_rate": 1.873873873873874e-06, + "loss": 8.1677, + "mean_token_accuracy": 0.037314511835575104, + "num_tokens": 27044943.0, + "step": 53 + }, + { + "epoch": 0.014602487831260141, + "grad_norm": 13.222240447998047, + "learning_rate": 1.90990990990991e-06, + "loss": 7.791, + "mean_token_accuracy": 0.045362912118434906, + "num_tokens": 27507212.0, + "step": 54 + }, + { + "epoch": 0.014872904272579772, + "grad_norm": 12.593511581420898, + "learning_rate": 1.945945945945946e-06, + "loss": 7.5032, + "mean_token_accuracy": 0.039387647062540054, + "num_tokens": 28031393.0, + "step": 55 + }, + { + "epoch": 0.015143320713899405, + "grad_norm": 12.898920059204102, + "learning_rate": 1.9819819819819822e-06, + "loss": 7.6923, + "mean_token_accuracy": 0.044823214411735535, + "num_tokens": 28504329.0, + "step": 56 + }, + { + "epoch": 0.015413737155219038, + "grad_norm": 11.665436744689941, + "learning_rate": 2.018018018018018e-06, + "loss": 7.4482, + "mean_token_accuracy": 0.04911056160926819, + "num_tokens": 28996545.0, + "step": 57 + }, + { + "epoch": 0.01568415359653867, + "grad_norm": 11.842251777648926, + "learning_rate": 2.054054054054054e-06, + "loss": 7.4505, + "mean_token_accuracy": 0.047099899500608444, + "num_tokens": 29520677.0, + "step": 58 + }, + { + "epoch": 0.0159545700378583, + "grad_norm": 13.723803520202637, + "learning_rate": 2.0900900900900904e-06, + "loss": 7.1328, + "mean_token_accuracy": 0.04962838813662529, + "num_tokens": 30044764.0, + "step": 59 + }, + { + "epoch": 0.016224986479177934, + "grad_norm": 11.495928764343262, + "learning_rate": 2.126126126126126e-06, + "loss": 7.3654, + "mean_token_accuracy": 0.05260945111513138, + "num_tokens": 30568995.0, + "step": 60 + }, + { + "epoch": 0.016495402920497566, + "grad_norm": 1.8734186887741089, + "learning_rate": 2.1621621621621623e-06, + "loss": 1.2913, + "mean_token_accuracy": 0.6717120409011841, + "num_tokens": 31093146.0, + "step": 61 + }, + { + "epoch": 0.0167658193618172, + "grad_norm": 12.070652961730957, + "learning_rate": 2.1981981981981985e-06, + "loss": 7.4905, + "mean_token_accuracy": 0.048743538558483124, + "num_tokens": 31617423.0, + "step": 62 + }, + { + "epoch": 0.01703623580313683, + "grad_norm": 11.704329490661621, + "learning_rate": 2.2342342342342343e-06, + "loss": 7.6218, + "mean_token_accuracy": 0.04926250874996185, + "num_tokens": 32092291.0, + "step": 63 + }, + { + "epoch": 0.017306652244456464, + "grad_norm": 11.992820739746094, + "learning_rate": 2.2702702702702705e-06, + "loss": 6.8248, + "mean_token_accuracy": 0.061367154121398926, + "num_tokens": 32599776.0, + "step": 64 + }, + { + "epoch": 0.017577068685776097, + "grad_norm": 10.11640453338623, + "learning_rate": 2.3063063063063067e-06, + "loss": 6.974, + "mean_token_accuracy": 0.06149953603744507, + "num_tokens": 33100260.0, + "step": 65 + }, + { + "epoch": 0.017847485127095726, + "grad_norm": 9.30036735534668, + "learning_rate": 2.3423423423423424e-06, + "loss": 7.0793, + "mean_token_accuracy": 0.06604214757680893, + "num_tokens": 33624511.0, + "step": 66 + }, + { + "epoch": 0.01811790156841536, + "grad_norm": 8.331859588623047, + "learning_rate": 2.3783783783783786e-06, + "loss": 6.7584, + "mean_token_accuracy": 0.06418460607528687, + "num_tokens": 34148775.0, + "step": 67 + }, + { + "epoch": 0.01838831800973499, + "grad_norm": 8.70333194732666, + "learning_rate": 2.414414414414415e-06, + "loss": 6.9811, + "mean_token_accuracy": 0.06359325349330902, + "num_tokens": 34673009.0, + "step": 68 + }, + { + "epoch": 0.018658734451054624, + "grad_norm": 16.929073333740234, + "learning_rate": 2.4504504504504506e-06, + "loss": 6.6751, + "mean_token_accuracy": 0.07599572837352753, + "num_tokens": 35139952.0, + "step": 69 + }, + { + "epoch": 0.018929150892374257, + "grad_norm": 8.974466323852539, + "learning_rate": 2.4864864864864867e-06, + "loss": 6.5823, + "mean_token_accuracy": 0.07645224034786224, + "num_tokens": 35664048.0, + "step": 70 + }, + { + "epoch": 0.01919956733369389, + "grad_norm": 7.492007732391357, + "learning_rate": 2.5225225225225225e-06, + "loss": 6.5988, + "mean_token_accuracy": 0.07163664698600769, + "num_tokens": 36188240.0, + "step": 71 + }, + { + "epoch": 0.019469983775013522, + "grad_norm": 7.641438007354736, + "learning_rate": 2.5585585585585587e-06, + "loss": 6.3914, + "mean_token_accuracy": 0.0761638730764389, + "num_tokens": 36712521.0, + "step": 72 + }, + { + "epoch": 0.019740400216333154, + "grad_norm": 6.957399845123291, + "learning_rate": 2.594594594594595e-06, + "loss": 6.7998, + "mean_token_accuracy": 0.0757390707731247, + "num_tokens": 37236741.0, + "step": 73 + }, + { + "epoch": 0.020010816657652784, + "grad_norm": 6.492644786834717, + "learning_rate": 2.6306306306306306e-06, + "loss": 6.4277, + "mean_token_accuracy": 0.08260145038366318, + "num_tokens": 37760978.0, + "step": 74 + }, + { + "epoch": 0.020281233098972416, + "grad_norm": 7.5145111083984375, + "learning_rate": 2.666666666666667e-06, + "loss": 6.1091, + "mean_token_accuracy": 0.10193414986133575, + "num_tokens": 38285152.0, + "step": 75 + }, + { + "epoch": 0.02055164954029205, + "grad_norm": 6.10105562210083, + "learning_rate": 2.702702702702703e-06, + "loss": 6.3546, + "mean_token_accuracy": 0.08659171313047409, + "num_tokens": 38796738.0, + "step": 76 + }, + { + "epoch": 0.02082206598161168, + "grad_norm": 6.765620708465576, + "learning_rate": 2.7387387387387388e-06, + "loss": 6.634, + "mean_token_accuracy": 0.08368251472711563, + "num_tokens": 39320779.0, + "step": 77 + }, + { + "epoch": 0.021092482422931314, + "grad_norm": 6.491325378417969, + "learning_rate": 2.774774774774775e-06, + "loss": 6.4932, + "mean_token_accuracy": 0.08990268409252167, + "num_tokens": 39845036.0, + "step": 78 + }, + { + "epoch": 0.021362898864250947, + "grad_norm": 5.808135509490967, + "learning_rate": 2.810810810810811e-06, + "loss": 6.5303, + "mean_token_accuracy": 0.08632342517375946, + "num_tokens": 40369258.0, + "step": 79 + }, + { + "epoch": 0.02163331530557058, + "grad_norm": 5.252036094665527, + "learning_rate": 2.846846846846847e-06, + "loss": 5.8402, + "mean_token_accuracy": 0.10216771811246872, + "num_tokens": 40846724.0, + "step": 80 + }, + { + "epoch": 0.021903731746890212, + "grad_norm": 1.4088271856307983, + "learning_rate": 2.882882882882883e-06, + "loss": 1.286, + "mean_token_accuracy": 0.6741145849227905, + "num_tokens": 41370998.0, + "step": 81 + }, + { + "epoch": 0.022174148188209845, + "grad_norm": 5.531675815582275, + "learning_rate": 2.9189189189189193e-06, + "loss": 6.1935, + "mean_token_accuracy": 0.09727020561695099, + "num_tokens": 41865680.0, + "step": 82 + }, + { + "epoch": 0.022444564629529474, + "grad_norm": 5.269561290740967, + "learning_rate": 2.954954954954955e-06, + "loss": 5.7161, + "mean_token_accuracy": 0.09602780640125275, + "num_tokens": 42389849.0, + "step": 83 + }, + { + "epoch": 0.022714981070849107, + "grad_norm": 4.595400333404541, + "learning_rate": 2.9909909909909912e-06, + "loss": 5.9473, + "mean_token_accuracy": 0.10308583825826645, + "num_tokens": 42914056.0, + "step": 84 + }, + { + "epoch": 0.02298539751216874, + "grad_norm": 6.478364944458008, + "learning_rate": 3.0270270270270274e-06, + "loss": 5.3002, + "mean_token_accuracy": 0.11156164109706879, + "num_tokens": 43438299.0, + "step": 85 + }, + { + "epoch": 0.023255813953488372, + "grad_norm": 4.670969009399414, + "learning_rate": 3.063063063063063e-06, + "loss": 5.8729, + "mean_token_accuracy": 0.10344249755144119, + "num_tokens": 43950415.0, + "step": 86 + }, + { + "epoch": 0.023526230394808004, + "grad_norm": 4.840254783630371, + "learning_rate": 3.0990990990990994e-06, + "loss": 6.1775, + "mean_token_accuracy": 0.10229188203811646, + "num_tokens": 44474597.0, + "step": 87 + }, + { + "epoch": 0.023796646836127637, + "grad_norm": 3.9525628089904785, + "learning_rate": 3.1351351351351356e-06, + "loss": 5.8994, + "mean_token_accuracy": 0.1084158644080162, + "num_tokens": 44998811.0, + "step": 88 + }, + { + "epoch": 0.02406706327744727, + "grad_norm": 4.048078536987305, + "learning_rate": 3.1711711711711713e-06, + "loss": 5.7499, + "mean_token_accuracy": 0.11752287298440933, + "num_tokens": 45514485.0, + "step": 89 + }, + { + "epoch": 0.024337479718766902, + "grad_norm": 5.005536079406738, + "learning_rate": 3.2072072072072075e-06, + "loss": 5.4233, + "mean_token_accuracy": 0.11038537323474884, + "num_tokens": 46038748.0, + "step": 90 + }, + { + "epoch": 0.024607896160086535, + "grad_norm": 4.723410606384277, + "learning_rate": 3.2432432432432437e-06, + "loss": 5.3313, + "mean_token_accuracy": 0.12489961087703705, + "num_tokens": 46562941.0, + "step": 91 + }, + { + "epoch": 0.024878312601406164, + "grad_norm": 3.764437198638916, + "learning_rate": 3.2792792792792795e-06, + "loss": 6.1545, + "mean_token_accuracy": 0.10276611149311066, + "num_tokens": 47051029.0, + "step": 92 + }, + { + "epoch": 0.025148729042725797, + "grad_norm": 3.63820219039917, + "learning_rate": 3.3153153153153157e-06, + "loss": 5.8, + "mean_token_accuracy": 0.11051676422357559, + "num_tokens": 47527666.0, + "step": 93 + }, + { + "epoch": 0.02541914548404543, + "grad_norm": 3.444727659225464, + "learning_rate": 3.351351351351352e-06, + "loss": 5.8331, + "mean_token_accuracy": 0.11773425340652466, + "num_tokens": 48051797.0, + "step": 94 + }, + { + "epoch": 0.025689561925365062, + "grad_norm": 3.1768810749053955, + "learning_rate": 3.3873873873873876e-06, + "loss": 5.6776, + "mean_token_accuracy": 0.11750930547714233, + "num_tokens": 48576075.0, + "step": 95 + }, + { + "epoch": 0.025959978366684695, + "grad_norm": 3.2758629322052, + "learning_rate": 3.423423423423424e-06, + "loss": 5.807, + "mean_token_accuracy": 0.11338154971599579, + "num_tokens": 49100324.0, + "step": 96 + }, + { + "epoch": 0.026230394808004327, + "grad_norm": 3.4549410343170166, + "learning_rate": 3.45945945945946e-06, + "loss": 5.7524, + "mean_token_accuracy": 0.1197580024600029, + "num_tokens": 49624600.0, + "step": 97 + }, + { + "epoch": 0.02650081124932396, + "grad_norm": 3.2633373737335205, + "learning_rate": 3.4954954954954957e-06, + "loss": 5.7667, + "mean_token_accuracy": 0.1178801879286766, + "num_tokens": 50148780.0, + "step": 98 + }, + { + "epoch": 0.026771227690643593, + "grad_norm": 3.755636215209961, + "learning_rate": 3.531531531531532e-06, + "loss": 5.1332, + "mean_token_accuracy": 0.1332007646560669, + "num_tokens": 50614598.0, + "step": 99 + }, + { + "epoch": 0.02704164413196322, + "grad_norm": 3.2400457859039307, + "learning_rate": 3.567567567567568e-06, + "loss": 5.7224, + "mean_token_accuracy": 0.12048989534378052, + "num_tokens": 51138863.0, + "step": 100 + }, + { + "epoch": 0.027312060573282854, + "grad_norm": 1.3437622785568237, + "learning_rate": 3.603603603603604e-06, + "loss": 1.2848, + "mean_token_accuracy": 0.6807279586791992, + "num_tokens": 51663135.0, + "step": 101 + }, + { + "epoch": 0.027582477014602487, + "grad_norm": 3.4100759029388428, + "learning_rate": 3.63963963963964e-06, + "loss": 5.7753, + "mean_token_accuracy": 0.11792796105146408, + "num_tokens": 52187304.0, + "step": 102 + }, + { + "epoch": 0.02785289345592212, + "grad_norm": 3.10269832611084, + "learning_rate": 3.6756756756756763e-06, + "loss": 5.4379, + "mean_token_accuracy": 0.12999847531318665, + "num_tokens": 52711492.0, + "step": 103 + }, + { + "epoch": 0.028123309897241752, + "grad_norm": 2.9715847969055176, + "learning_rate": 3.711711711711712e-06, + "loss": 5.5547, + "mean_token_accuracy": 0.13372859358787537, + "num_tokens": 53235740.0, + "step": 104 + }, + { + "epoch": 0.028393726338561385, + "grad_norm": 2.9919090270996094, + "learning_rate": 3.747747747747748e-06, + "loss": 5.2499, + "mean_token_accuracy": 0.13427743315696716, + "num_tokens": 53760018.0, + "step": 105 + }, + { + "epoch": 0.028664142779881017, + "grad_norm": 22.06209373474121, + "learning_rate": 3.7837837837837844e-06, + "loss": 5.1553, + "mean_token_accuracy": 0.1663743257522583, + "num_tokens": 54198401.0, + "step": 106 + }, + { + "epoch": 0.02893455922120065, + "grad_norm": 3.5161774158477783, + "learning_rate": 3.81981981981982e-06, + "loss": 4.9112, + "mean_token_accuracy": 0.14834269881248474, + "num_tokens": 54722669.0, + "step": 107 + }, + { + "epoch": 0.029204975662520283, + "grad_norm": 2.732100248336792, + "learning_rate": 3.855855855855856e-06, + "loss": 5.4745, + "mean_token_accuracy": 0.12091224640607834, + "num_tokens": 55246946.0, + "step": 108 + }, + { + "epoch": 0.029475392103839912, + "grad_norm": 4.38313102722168, + "learning_rate": 3.891891891891892e-06, + "loss": 4.9137, + "mean_token_accuracy": 0.1445319950580597, + "num_tokens": 55771163.0, + "step": 109 + }, + { + "epoch": 0.029745808545159545, + "grad_norm": 2.6379497051239014, + "learning_rate": 3.927927927927928e-06, + "loss": 5.5056, + "mean_token_accuracy": 0.1379345953464508, + "num_tokens": 56267267.0, + "step": 110 + }, + { + "epoch": 0.030016224986479177, + "grad_norm": 2.454843521118164, + "learning_rate": 3.9639639639639645e-06, + "loss": 5.2255, + "mean_token_accuracy": 0.14557784795761108, + "num_tokens": 56791546.0, + "step": 111 + }, + { + "epoch": 0.03028664142779881, + "grad_norm": 4.825044631958008, + "learning_rate": 4.000000000000001e-06, + "loss": 5.4646, + "mean_token_accuracy": 0.12278036028146744, + "num_tokens": 57315828.0, + "step": 112 + }, + { + "epoch": 0.030557057869118442, + "grad_norm": 3.060637950897217, + "learning_rate": 4.036036036036036e-06, + "loss": 5.5561, + "mean_token_accuracy": 0.11992418020963669, + "num_tokens": 57749330.0, + "step": 113 + }, + { + "epoch": 0.030827474310438075, + "grad_norm": 2.4679269790649414, + "learning_rate": 4.072072072072072e-06, + "loss": 5.3508, + "mean_token_accuracy": 0.14250420033931732, + "num_tokens": 58273587.0, + "step": 114 + }, + { + "epoch": 0.031097890751757708, + "grad_norm": 2.409852981567383, + "learning_rate": 4.108108108108108e-06, + "loss": 5.2599, + "mean_token_accuracy": 0.1445225477218628, + "num_tokens": 58749554.0, + "step": 115 + }, + { + "epoch": 0.03136830719307734, + "grad_norm": 2.8032186031341553, + "learning_rate": 4.1441441441441446e-06, + "loss": 4.997, + "mean_token_accuracy": 0.14522181451320648, + "num_tokens": 59273753.0, + "step": 116 + }, + { + "epoch": 0.03163872363439697, + "grad_norm": 2.863590717315674, + "learning_rate": 4.180180180180181e-06, + "loss": 4.8778, + "mean_token_accuracy": 0.15371133387088776, + "num_tokens": 59714097.0, + "step": 117 + }, + { + "epoch": 0.0319091400757166, + "grad_norm": 2.2405266761779785, + "learning_rate": 4.216216216216217e-06, + "loss": 5.2229, + "mean_token_accuracy": 0.14449745416641235, + "num_tokens": 60238357.0, + "step": 118 + }, + { + "epoch": 0.03217955651703624, + "grad_norm": 2.3844103813171387, + "learning_rate": 4.252252252252252e-06, + "loss": 4.9763, + "mean_token_accuracy": 0.154159277677536, + "num_tokens": 60762628.0, + "step": 119 + }, + { + "epoch": 0.03244997295835587, + "grad_norm": 3.0145556926727295, + "learning_rate": 4.2882882882882885e-06, + "loss": 4.4452, + "mean_token_accuracy": 0.1537996381521225, + "num_tokens": 61286559.0, + "step": 120 + }, + { + "epoch": 0.032720389399675504, + "grad_norm": 2.0808990001678467, + "learning_rate": 4.324324324324325e-06, + "loss": 1.0854, + "mean_token_accuracy": 0.7108750939369202, + "num_tokens": 61807315.0, + "step": 121 + }, + { + "epoch": 0.03299080584099513, + "grad_norm": 2.6406822204589844, + "learning_rate": 4.360360360360361e-06, + "loss": 5.0396, + "mean_token_accuracy": 0.15723320841789246, + "num_tokens": 62331480.0, + "step": 122 + }, + { + "epoch": 0.03326122228231476, + "grad_norm": 2.480863571166992, + "learning_rate": 4.396396396396397e-06, + "loss": 5.0096, + "mean_token_accuracy": 0.15774178504943848, + "num_tokens": 62855742.0, + "step": 123 + }, + { + "epoch": 0.0335316387236344, + "grad_norm": 2.4514150619506836, + "learning_rate": 4.432432432432433e-06, + "loss": 4.7074, + "mean_token_accuracy": 0.16730228066444397, + "num_tokens": 63380025.0, + "step": 124 + }, + { + "epoch": 0.03380205516495403, + "grad_norm": 2.3448753356933594, + "learning_rate": 4.4684684684684686e-06, + "loss": 5.0302, + "mean_token_accuracy": 0.15915068984031677, + "num_tokens": 63864011.0, + "step": 125 + }, + { + "epoch": 0.03407247160627366, + "grad_norm": 2.378044366836548, + "learning_rate": 4.504504504504505e-06, + "loss": 4.7397, + "mean_token_accuracy": 0.16515691578388214, + "num_tokens": 64384918.0, + "step": 126 + }, + { + "epoch": 0.03434288804759329, + "grad_norm": 2.153578042984009, + "learning_rate": 4.540540540540541e-06, + "loss": 5.0153, + "mean_token_accuracy": 0.1632569134235382, + "num_tokens": 64873812.0, + "step": 127 + }, + { + "epoch": 0.03461330448891293, + "grad_norm": 2.1364810466766357, + "learning_rate": 4.576576576576577e-06, + "loss": 4.9994, + "mean_token_accuracy": 0.1618492603302002, + "num_tokens": 65398077.0, + "step": 128 + }, + { + "epoch": 0.03488372093023256, + "grad_norm": 2.2606475353240967, + "learning_rate": 4.612612612612613e-06, + "loss": 4.995, + "mean_token_accuracy": 0.15390387177467346, + "num_tokens": 65922300.0, + "step": 129 + }, + { + "epoch": 0.035154137371552194, + "grad_norm": 2.1687324047088623, + "learning_rate": 4.6486486486486495e-06, + "loss": 5.012, + "mean_token_accuracy": 0.16798943281173706, + "num_tokens": 66393234.0, + "step": 130 + }, + { + "epoch": 0.03542455381287182, + "grad_norm": 2.4378159046173096, + "learning_rate": 4.684684684684685e-06, + "loss": 5.2264, + "mean_token_accuracy": 0.15141433477401733, + "num_tokens": 66854580.0, + "step": 131 + }, + { + "epoch": 0.03569497025419145, + "grad_norm": 2.454800605773926, + "learning_rate": 4.720720720720721e-06, + "loss": 4.6411, + "mean_token_accuracy": 0.17747315764427185, + "num_tokens": 67358258.0, + "step": 132 + }, + { + "epoch": 0.03596538669551109, + "grad_norm": 21.578289031982422, + "learning_rate": 4.756756756756757e-06, + "loss": 4.7522, + "mean_token_accuracy": 0.18686263263225555, + "num_tokens": 67882486.0, + "step": 133 + }, + { + "epoch": 0.03623580313683072, + "grad_norm": 2.2157809734344482, + "learning_rate": 4.792792792792793e-06, + "loss": 4.8598, + "mean_token_accuracy": 0.16749192774295807, + "num_tokens": 68406650.0, + "step": 134 + }, + { + "epoch": 0.03650621957815035, + "grad_norm": 2.100811719894409, + "learning_rate": 4.82882882882883e-06, + "loss": 4.7499, + "mean_token_accuracy": 0.18018946051597595, + "num_tokens": 68930907.0, + "step": 135 + }, + { + "epoch": 0.03677663601946998, + "grad_norm": 1.9875565767288208, + "learning_rate": 4.864864864864866e-06, + "loss": 4.7498, + "mean_token_accuracy": 0.1765982210636139, + "num_tokens": 69455139.0, + "step": 136 + }, + { + "epoch": 0.03704705246078962, + "grad_norm": 2.047515392303467, + "learning_rate": 4.900900900900901e-06, + "loss": 4.7839, + "mean_token_accuracy": 0.1715896725654602, + "num_tokens": 69979379.0, + "step": 137 + }, + { + "epoch": 0.03731746890210925, + "grad_norm": 1.9234440326690674, + "learning_rate": 4.936936936936937e-06, + "loss": 4.6717, + "mean_token_accuracy": 0.16705673933029175, + "num_tokens": 70503618.0, + "step": 138 + }, + { + "epoch": 0.03758788534342888, + "grad_norm": 2.229527473449707, + "learning_rate": 4.9729729729729735e-06, + "loss": 4.5063, + "mean_token_accuracy": 0.18379707634449005, + "num_tokens": 71027714.0, + "step": 139 + }, + { + "epoch": 0.03785830178474851, + "grad_norm": 2.088179111480713, + "learning_rate": 5.00900900900901e-06, + "loss": 4.8677, + "mean_token_accuracy": 0.17725175619125366, + "num_tokens": 71551965.0, + "step": 140 + }, + { + "epoch": 0.03812871822606814, + "grad_norm": 2.172349691390991, + "learning_rate": 5.045045045045045e-06, + "loss": 1.3235, + "mean_token_accuracy": 0.6576178073883057, + "num_tokens": 72076105.0, + "step": 141 + }, + { + "epoch": 0.03839913466738778, + "grad_norm": 2.4332239627838135, + "learning_rate": 5.081081081081082e-06, + "loss": 4.7237, + "mean_token_accuracy": 0.18650084733963013, + "num_tokens": 72600282.0, + "step": 142 + }, + { + "epoch": 0.03866955110870741, + "grad_norm": 2.237287998199463, + "learning_rate": 5.117117117117117e-06, + "loss": 4.63, + "mean_token_accuracy": 0.19443677365779877, + "num_tokens": 73093014.0, + "step": 143 + }, + { + "epoch": 0.038939967550027044, + "grad_norm": 2.1664462089538574, + "learning_rate": 5.153153153153153e-06, + "loss": 4.5041, + "mean_token_accuracy": 0.19265884160995483, + "num_tokens": 73617196.0, + "step": 144 + }, + { + "epoch": 0.03921038399134667, + "grad_norm": 2.3487634658813477, + "learning_rate": 5.18918918918919e-06, + "loss": 4.6281, + "mean_token_accuracy": 0.17950382828712463, + "num_tokens": 74141271.0, + "step": 145 + }, + { + "epoch": 0.03948080043266631, + "grad_norm": 2.13069486618042, + "learning_rate": 5.225225225225226e-06, + "loss": 4.5557, + "mean_token_accuracy": 0.19432082772254944, + "num_tokens": 74665363.0, + "step": 146 + }, + { + "epoch": 0.03975121687398594, + "grad_norm": 2.5387444496154785, + "learning_rate": 5.261261261261261e-06, + "loss": 4.3886, + "mean_token_accuracy": 0.20109973847866058, + "num_tokens": 75189459.0, + "step": 147 + }, + { + "epoch": 0.04002163331530557, + "grad_norm": 2.305185556411743, + "learning_rate": 5.297297297297298e-06, + "loss": 4.5507, + "mean_token_accuracy": 0.1722959578037262, + "num_tokens": 75713670.0, + "step": 148 + }, + { + "epoch": 0.0402920497566252, + "grad_norm": 2.9469149112701416, + "learning_rate": 5.333333333333334e-06, + "loss": 4.0728, + "mean_token_accuracy": 0.2165079265832901, + "num_tokens": 76237893.0, + "step": 149 + }, + { + "epoch": 0.04056246619794483, + "grad_norm": 3.158095121383667, + "learning_rate": 5.369369369369369e-06, + "loss": 4.3097, + "mean_token_accuracy": 0.19757291674613953, + "num_tokens": 76728232.0, + "step": 150 + }, + { + "epoch": 0.04083288263926447, + "grad_norm": 3.089010715484619, + "learning_rate": 5.405405405405406e-06, + "loss": 4.712, + "mean_token_accuracy": 0.2197898030281067, + "num_tokens": 77188771.0, + "step": 151 + }, + { + "epoch": 0.0411032990805841, + "grad_norm": 2.493816375732422, + "learning_rate": 5.441441441441442e-06, + "loss": 4.5873, + "mean_token_accuracy": 0.1998491883277893, + "num_tokens": 77713017.0, + "step": 152 + }, + { + "epoch": 0.041373715521903734, + "grad_norm": 2.397770881652832, + "learning_rate": 5.4774774774774776e-06, + "loss": 4.6272, + "mean_token_accuracy": 0.2059347778558731, + "num_tokens": 78237297.0, + "step": 153 + }, + { + "epoch": 0.04164413196322336, + "grad_norm": 2.581677198410034, + "learning_rate": 5.513513513513515e-06, + "loss": 4.3205, + "mean_token_accuracy": 0.20662535727024078, + "num_tokens": 78761524.0, + "step": 154 + }, + { + "epoch": 0.041914548404543, + "grad_norm": 2.48514723777771, + "learning_rate": 5.54954954954955e-06, + "loss": 4.4645, + "mean_token_accuracy": 0.2086922526359558, + "num_tokens": 79285793.0, + "step": 155 + }, + { + "epoch": 0.04218496484586263, + "grad_norm": 2.4872188568115234, + "learning_rate": 5.585585585585585e-06, + "loss": 4.4263, + "mean_token_accuracy": 0.2064133584499359, + "num_tokens": 79810071.0, + "step": 156 + }, + { + "epoch": 0.04245538128718226, + "grad_norm": 2.394547700881958, + "learning_rate": 5.621621621621622e-06, + "loss": 4.6055, + "mean_token_accuracy": 0.1945187896490097, + "num_tokens": 80334336.0, + "step": 157 + }, + { + "epoch": 0.042725797728501894, + "grad_norm": 2.868225336074829, + "learning_rate": 5.657657657657658e-06, + "loss": 4.1086, + "mean_token_accuracy": 0.23565508425235748, + "num_tokens": 80858424.0, + "step": 158 + }, + { + "epoch": 0.04299621416982152, + "grad_norm": 2.4846811294555664, + "learning_rate": 5.693693693693694e-06, + "loss": 4.3368, + "mean_token_accuracy": 0.21436303853988647, + "num_tokens": 81382655.0, + "step": 159 + }, + { + "epoch": 0.04326663061114116, + "grad_norm": 2.783130168914795, + "learning_rate": 5.729729729729731e-06, + "loss": 4.2236, + "mean_token_accuracy": 0.22539010643959045, + "num_tokens": 81906932.0, + "step": 160 + }, + { + "epoch": 0.04353704705246079, + "grad_norm": 1.47299325466156, + "learning_rate": 5.765765765765766e-06, + "loss": 1.3086, + "mean_token_accuracy": 0.6746399998664856, + "num_tokens": 82403697.0, + "step": 161 + }, + { + "epoch": 0.043807463493780424, + "grad_norm": 2.82441782951355, + "learning_rate": 5.8018018018018015e-06, + "loss": 4.0285, + "mean_token_accuracy": 0.2422506958246231, + "num_tokens": 82876620.0, + "step": 162 + }, + { + "epoch": 0.04407787993510005, + "grad_norm": 2.811727523803711, + "learning_rate": 5.837837837837839e-06, + "loss": 3.685, + "mean_token_accuracy": 0.25972098112106323, + "num_tokens": 83400690.0, + "step": 163 + }, + { + "epoch": 0.04434829637641969, + "grad_norm": 2.1819703578948975, + "learning_rate": 5.873873873873874e-06, + "loss": 4.048, + "mean_token_accuracy": 0.24875719845294952, + "num_tokens": 83924837.0, + "step": 164 + }, + { + "epoch": 0.04461871281773932, + "grad_norm": 2.197937488555908, + "learning_rate": 5.90990990990991e-06, + "loss": 4.302, + "mean_token_accuracy": 0.2230222225189209, + "num_tokens": 84425549.0, + "step": 165 + }, + { + "epoch": 0.04488912925905895, + "grad_norm": 3.0393381118774414, + "learning_rate": 5.945945945945947e-06, + "loss": 4.0323, + "mean_token_accuracy": 0.2525957524776459, + "num_tokens": 84915537.0, + "step": 166 + }, + { + "epoch": 0.045159545700378584, + "grad_norm": 2.480259656906128, + "learning_rate": 5.9819819819819825e-06, + "loss": 4.2719, + "mean_token_accuracy": 0.22320294380187988, + "num_tokens": 85439744.0, + "step": 167 + }, + { + "epoch": 0.04542996214169821, + "grad_norm": 2.0676891803741455, + "learning_rate": 6.018018018018018e-06, + "loss": 4.1093, + "mean_token_accuracy": 0.24905642867088318, + "num_tokens": 85963725.0, + "step": 168 + }, + { + "epoch": 0.04570037858301785, + "grad_norm": 2.6401360034942627, + "learning_rate": 6.054054054054055e-06, + "loss": 4.4145, + "mean_token_accuracy": 0.23497997224330902, + "num_tokens": 86487758.0, + "step": 169 + }, + { + "epoch": 0.04597079502433748, + "grad_norm": 2.558845043182373, + "learning_rate": 6.09009009009009e-06, + "loss": 3.8738, + "mean_token_accuracy": 0.26503172516822815, + "num_tokens": 86953986.0, + "step": 170 + }, + { + "epoch": 0.046241211465657114, + "grad_norm": 2.051849842071533, + "learning_rate": 6.126126126126126e-06, + "loss": 4.3016, + "mean_token_accuracy": 0.23530739545822144, + "num_tokens": 87478231.0, + "step": 171 + }, + { + "epoch": 0.046511627906976744, + "grad_norm": 2.091862201690674, + "learning_rate": 6.162162162162163e-06, + "loss": 4.4223, + "mean_token_accuracy": 0.2359175980091095, + "num_tokens": 87968693.0, + "step": 172 + }, + { + "epoch": 0.04678204434829638, + "grad_norm": 2.2487549781799316, + "learning_rate": 6.198198198198199e-06, + "loss": 4.3342, + "mean_token_accuracy": 0.24496331810951233, + "num_tokens": 88482764.0, + "step": 173 + }, + { + "epoch": 0.04705246078961601, + "grad_norm": 2.191338539123535, + "learning_rate": 6.234234234234234e-06, + "loss": 4.4004, + "mean_token_accuracy": 0.24128121137619019, + "num_tokens": 89007037.0, + "step": 174 + }, + { + "epoch": 0.04732287723093564, + "grad_norm": 1.9973392486572266, + "learning_rate": 6.270270270270271e-06, + "loss": 4.2245, + "mean_token_accuracy": 0.23723304271697998, + "num_tokens": 89531232.0, + "step": 175 + }, + { + "epoch": 0.047593293672255274, + "grad_norm": 2.111359119415283, + "learning_rate": 6.3063063063063065e-06, + "loss": 4.0485, + "mean_token_accuracy": 0.2586632966995239, + "num_tokens": 90055378.0, + "step": 176 + }, + { + "epoch": 0.0478637101135749, + "grad_norm": 2.5840904712677, + "learning_rate": 6.342342342342343e-06, + "loss": 4.2487, + "mean_token_accuracy": 0.24450016021728516, + "num_tokens": 90574969.0, + "step": 177 + }, + { + "epoch": 0.04813412655489454, + "grad_norm": 2.7463278770446777, + "learning_rate": 6.378378378378379e-06, + "loss": 4.0165, + "mean_token_accuracy": 0.24192669987678528, + "num_tokens": 91099222.0, + "step": 178 + }, + { + "epoch": 0.04840454299621417, + "grad_norm": 1.8976584672927856, + "learning_rate": 6.414414414414415e-06, + "loss": 4.0452, + "mean_token_accuracy": 0.2660364508628845, + "num_tokens": 91623494.0, + "step": 179 + }, + { + "epoch": 0.048674959437533805, + "grad_norm": 1.8318763971328735, + "learning_rate": 6.45045045045045e-06, + "loss": 3.8635, + "mean_token_accuracy": 0.28067725896835327, + "num_tokens": 92147679.0, + "step": 180 + }, + { + "epoch": 0.048945375878853434, + "grad_norm": 1.003243088722229, + "learning_rate": 6.486486486486487e-06, + "loss": 1.2466, + "mean_token_accuracy": 0.6728487014770508, + "num_tokens": 92671914.0, + "step": 181 + }, + { + "epoch": 0.04921579232017307, + "grad_norm": 2.0354251861572266, + "learning_rate": 6.522522522522523e-06, + "loss": 4.0459, + "mean_token_accuracy": 0.24701449275016785, + "num_tokens": 93196088.0, + "step": 182 + }, + { + "epoch": 0.0494862087614927, + "grad_norm": 2.2083842754364014, + "learning_rate": 6.558558558558559e-06, + "loss": 4.0331, + "mean_token_accuracy": 0.27538102865219116, + "num_tokens": 93713460.0, + "step": 183 + }, + { + "epoch": 0.04975662520281233, + "grad_norm": 1.809564232826233, + "learning_rate": 6.594594594594595e-06, + "loss": 3.8185, + "mean_token_accuracy": 0.2887369692325592, + "num_tokens": 94237643.0, + "step": 184 + }, + { + "epoch": 0.050027041644131964, + "grad_norm": 1.979149341583252, + "learning_rate": 6.630630630630631e-06, + "loss": 4.2175, + "mean_token_accuracy": 0.2544465661048889, + "num_tokens": 94747241.0, + "step": 185 + }, + { + "epoch": 0.05029745808545159, + "grad_norm": 2.2949252128601074, + "learning_rate": 6.666666666666667e-06, + "loss": 3.6352, + "mean_token_accuracy": 0.3041189908981323, + "num_tokens": 95226907.0, + "step": 186 + }, + { + "epoch": 0.05056787452677123, + "grad_norm": 1.896005630493164, + "learning_rate": 6.702702702702704e-06, + "loss": 3.9307, + "mean_token_accuracy": 0.284877210855484, + "num_tokens": 95751078.0, + "step": 187 + }, + { + "epoch": 0.05083829096809086, + "grad_norm": 2.220487594604492, + "learning_rate": 6.738738738738739e-06, + "loss": 4.092, + "mean_token_accuracy": 0.26995670795440674, + "num_tokens": 96275169.0, + "step": 188 + }, + { + "epoch": 0.051108707409410495, + "grad_norm": 2.2115349769592285, + "learning_rate": 6.774774774774775e-06, + "loss": 3.995, + "mean_token_accuracy": 0.29615262150764465, + "num_tokens": 96734378.0, + "step": 189 + }, + { + "epoch": 0.051379123850730124, + "grad_norm": 1.7264485359191895, + "learning_rate": 6.810810810810811e-06, + "loss": 4.0145, + "mean_token_accuracy": 0.2822348177433014, + "num_tokens": 97228434.0, + "step": 190 + }, + { + "epoch": 0.05164954029204975, + "grad_norm": 1.5867559909820557, + "learning_rate": 6.846846846846848e-06, + "loss": 4.1115, + "mean_token_accuracy": 0.278208464384079, + "num_tokens": 97699149.0, + "step": 191 + }, + { + "epoch": 0.05191995673336939, + "grad_norm": 1.5068320035934448, + "learning_rate": 6.882882882882883e-06, + "loss": 4.0803, + "mean_token_accuracy": 0.28738221526145935, + "num_tokens": 98196837.0, + "step": 192 + }, + { + "epoch": 0.05219037317468902, + "grad_norm": 1.7868934869766235, + "learning_rate": 6.91891891891892e-06, + "loss": 3.9997, + "mean_token_accuracy": 0.269906610250473, + "num_tokens": 98720929.0, + "step": 193 + }, + { + "epoch": 0.052460789616008655, + "grad_norm": 2.2013773918151855, + "learning_rate": 6.954954954954955e-06, + "loss": 3.8113, + "mean_token_accuracy": 0.2927558124065399, + "num_tokens": 99245210.0, + "step": 194 + }, + { + "epoch": 0.052731206057328284, + "grad_norm": 1.5861568450927734, + "learning_rate": 6.9909909909909915e-06, + "loss": 4.0568, + "mean_token_accuracy": 0.2914646565914154, + "num_tokens": 99723807.0, + "step": 195 + }, + { + "epoch": 0.05300162249864792, + "grad_norm": 1.7026946544647217, + "learning_rate": 7.027027027027028e-06, + "loss": 3.9495, + "mean_token_accuracy": 0.3071490526199341, + "num_tokens": 100140165.0, + "step": 196 + }, + { + "epoch": 0.05327203893996755, + "grad_norm": 1.7340549230575562, + "learning_rate": 7.063063063063064e-06, + "loss": 3.6794, + "mean_token_accuracy": 0.3190075755119324, + "num_tokens": 100605672.0, + "step": 197 + }, + { + "epoch": 0.053542455381287185, + "grad_norm": 1.877952218055725, + "learning_rate": 7.099099099099099e-06, + "loss": 3.4809, + "mean_token_accuracy": 0.3101056218147278, + "num_tokens": 101129873.0, + "step": 198 + }, + { + "epoch": 0.053812871822606814, + "grad_norm": 2.015967845916748, + "learning_rate": 7.135135135135136e-06, + "loss": 3.8698, + "mean_token_accuracy": 0.3056207597255707, + "num_tokens": 101600313.0, + "step": 199 + }, + { + "epoch": 0.05408328826392644, + "grad_norm": 1.9583239555358887, + "learning_rate": 7.1711711711711716e-06, + "loss": 3.794, + "mean_token_accuracy": 0.28329330682754517, + "num_tokens": 102120921.0, + "step": 200 + }, + { + "epoch": 0.05435370470524608, + "grad_norm": 1.400843620300293, + "learning_rate": 7.207207207207208e-06, + "loss": 1.2907, + "mean_token_accuracy": 0.6703051328659058, + "num_tokens": 102645161.0, + "step": 201 + }, + { + "epoch": 0.05462412114656571, + "grad_norm": 7.389487266540527, + "learning_rate": 7.243243243243244e-06, + "loss": 3.7004, + "mean_token_accuracy": 0.3315087854862213, + "num_tokens": 103169368.0, + "step": 202 + }, + { + "epoch": 0.054894537587885345, + "grad_norm": 2.0744800567626953, + "learning_rate": 7.27927927927928e-06, + "loss": 3.8035, + "mean_token_accuracy": 0.29171228408813477, + "num_tokens": 103693576.0, + "step": 203 + }, + { + "epoch": 0.055164954029204974, + "grad_norm": 2.3603832721710205, + "learning_rate": 7.3153153153153155e-06, + "loss": 3.8521, + "mean_token_accuracy": 0.3059489130973816, + "num_tokens": 104159468.0, + "step": 204 + }, + { + "epoch": 0.05543537047052461, + "grad_norm": 1.697482705116272, + "learning_rate": 7.3513513513513525e-06, + "loss": 3.7485, + "mean_token_accuracy": 0.30127841234207153, + "num_tokens": 104683725.0, + "step": 205 + }, + { + "epoch": 0.05570578691184424, + "grad_norm": 2.3584039211273193, + "learning_rate": 7.387387387387388e-06, + "loss": 3.6164, + "mean_token_accuracy": 0.31274017691612244, + "num_tokens": 105155783.0, + "step": 206 + }, + { + "epoch": 0.055976203353163875, + "grad_norm": 2.2635068893432617, + "learning_rate": 7.423423423423424e-06, + "loss": 4.003, + "mean_token_accuracy": 0.28579193353652954, + "num_tokens": 105680064.0, + "step": 207 + }, + { + "epoch": 0.056246619794483504, + "grad_norm": 1.7146154642105103, + "learning_rate": 7.45945945945946e-06, + "loss": 3.7513, + "mean_token_accuracy": 0.292556494474411, + "num_tokens": 106204338.0, + "step": 208 + }, + { + "epoch": 0.056517036235803134, + "grad_norm": 1.7706249952316284, + "learning_rate": 7.495495495495496e-06, + "loss": 3.9925, + "mean_token_accuracy": 0.29481226205825806, + "num_tokens": 106728602.0, + "step": 209 + }, + { + "epoch": 0.05678745267712277, + "grad_norm": 1.7312355041503906, + "learning_rate": 7.531531531531532e-06, + "loss": 3.8736, + "mean_token_accuracy": 0.3088110089302063, + "num_tokens": 107221224.0, + "step": 210 + }, + { + "epoch": 0.0570578691184424, + "grad_norm": 2.90425443649292, + "learning_rate": 7.567567567567569e-06, + "loss": 3.3669, + "mean_token_accuracy": 0.32660651206970215, + "num_tokens": 107745390.0, + "step": 211 + }, + { + "epoch": 0.057328285559762035, + "grad_norm": 2.7422759532928467, + "learning_rate": 7.603603603603604e-06, + "loss": 3.846, + "mean_token_accuracy": 0.3080369234085083, + "num_tokens": 108269508.0, + "step": 212 + }, + { + "epoch": 0.057598702001081664, + "grad_norm": 2.539693593978882, + "learning_rate": 7.63963963963964e-06, + "loss": 3.8692, + "mean_token_accuracy": 0.30797648429870605, + "num_tokens": 108793666.0, + "step": 213 + }, + { + "epoch": 0.0578691184424013, + "grad_norm": 1.9854689836502075, + "learning_rate": 7.675675675675676e-06, + "loss": 3.8827, + "mean_token_accuracy": 0.3329686224460602, + "num_tokens": 109261396.0, + "step": 214 + }, + { + "epoch": 0.05813953488372093, + "grad_norm": 2.471647262573242, + "learning_rate": 7.711711711711712e-06, + "loss": 3.7594, + "mean_token_accuracy": 0.30424296855926514, + "num_tokens": 109785674.0, + "step": 215 + }, + { + "epoch": 0.058409951325040566, + "grad_norm": 3.063370943069458, + "learning_rate": 7.747747747747749e-06, + "loss": 3.5188, + "mean_token_accuracy": 0.3286072313785553, + "num_tokens": 110309862.0, + "step": 216 + }, + { + "epoch": 0.058680367766360195, + "grad_norm": 2.5846991539001465, + "learning_rate": 7.783783783783784e-06, + "loss": 3.5684, + "mean_token_accuracy": 0.3216283321380615, + "num_tokens": 110834041.0, + "step": 217 + }, + { + "epoch": 0.058950784207679824, + "grad_norm": 2.104090452194214, + "learning_rate": 7.819819819819821e-06, + "loss": 4.0797, + "mean_token_accuracy": 0.2893562316894531, + "num_tokens": 111358292.0, + "step": 218 + }, + { + "epoch": 0.05922120064899946, + "grad_norm": 3.268273115158081, + "learning_rate": 7.855855855855857e-06, + "loss": 3.3706, + "mean_token_accuracy": 0.33590173721313477, + "num_tokens": 111845102.0, + "step": 219 + }, + { + "epoch": 0.05949161709031909, + "grad_norm": 3.003304958343506, + "learning_rate": 7.891891891891894e-06, + "loss": 3.866, + "mean_token_accuracy": 0.30482780933380127, + "num_tokens": 112369381.0, + "step": 220 + }, + { + "epoch": 0.059762033531638725, + "grad_norm": 1.240090012550354, + "learning_rate": 7.927927927927929e-06, + "loss": 1.1881, + "mean_token_accuracy": 0.6838078498840332, + "num_tokens": 112893415.0, + "step": 221 + }, + { + "epoch": 0.060032449972958354, + "grad_norm": 2.6829910278320312, + "learning_rate": 7.963963963963964e-06, + "loss": 3.5115, + "mean_token_accuracy": 0.3539430499076843, + "num_tokens": 113354162.0, + "step": 222 + }, + { + "epoch": 0.06030286641427799, + "grad_norm": 3.444755792617798, + "learning_rate": 8.000000000000001e-06, + "loss": 3.601, + "mean_token_accuracy": 0.32273995876312256, + "num_tokens": 113878265.0, + "step": 223 + }, + { + "epoch": 0.06057328285559762, + "grad_norm": 1.6883090734481812, + "learning_rate": 8.036036036036037e-06, + "loss": 3.4737, + "mean_token_accuracy": 0.325718492269516, + "num_tokens": 114402317.0, + "step": 224 + }, + { + "epoch": 0.060843699296917256, + "grad_norm": 1.806186318397522, + "learning_rate": 8.072072072072072e-06, + "loss": 3.752, + "mean_token_accuracy": 0.31389743089675903, + "num_tokens": 114896722.0, + "step": 225 + }, + { + "epoch": 0.061114115738236885, + "grad_norm": 2.531572103500366, + "learning_rate": 8.108108108108109e-06, + "loss": 3.6079, + "mean_token_accuracy": 0.3276051878929138, + "num_tokens": 115420855.0, + "step": 226 + }, + { + "epoch": 0.061384532179556514, + "grad_norm": 2.484109878540039, + "learning_rate": 8.144144144144144e-06, + "loss": 3.7734, + "mean_token_accuracy": 0.31748610734939575, + "num_tokens": 115945043.0, + "step": 227 + }, + { + "epoch": 0.06165494862087615, + "grad_norm": 2.2005105018615723, + "learning_rate": 8.18018018018018e-06, + "loss": 3.8816, + "mean_token_accuracy": 0.3051668405532837, + "num_tokens": 116469309.0, + "step": 228 + }, + { + "epoch": 0.06192536506219578, + "grad_norm": 2.605402708053589, + "learning_rate": 8.216216216216217e-06, + "loss": 3.5576, + "mean_token_accuracy": 0.3364900052547455, + "num_tokens": 116955746.0, + "step": 229 + }, + { + "epoch": 0.062195781503515415, + "grad_norm": 1.953395128250122, + "learning_rate": 8.252252252252254e-06, + "loss": 3.4745, + "mean_token_accuracy": 0.35580527782440186, + "num_tokens": 117418660.0, + "step": 230 + }, + { + "epoch": 0.062466197944835045, + "grad_norm": 2.271320104598999, + "learning_rate": 8.288288288288289e-06, + "loss": 3.5673, + "mean_token_accuracy": 0.3190290927886963, + "num_tokens": 117942821.0, + "step": 231 + }, + { + "epoch": 0.06273661438615467, + "grad_norm": 2.335784435272217, + "learning_rate": 8.324324324324326e-06, + "loss": 3.7371, + "mean_token_accuracy": 0.3080819249153137, + "num_tokens": 118467048.0, + "step": 232 + }, + { + "epoch": 0.06300703082747432, + "grad_norm": 2.116196393966675, + "learning_rate": 8.360360360360362e-06, + "loss": 3.8169, + "mean_token_accuracy": 0.3146495521068573, + "num_tokens": 118938682.0, + "step": 233 + }, + { + "epoch": 0.06327744726879395, + "grad_norm": 1.8489640951156616, + "learning_rate": 8.396396396396397e-06, + "loss": 3.701, + "mean_token_accuracy": 0.3123469352722168, + "num_tokens": 119462807.0, + "step": 234 + }, + { + "epoch": 0.06354786371011358, + "grad_norm": 1.892356276512146, + "learning_rate": 8.432432432432434e-06, + "loss": 3.33, + "mean_token_accuracy": 0.33034923672676086, + "num_tokens": 119987074.0, + "step": 235 + }, + { + "epoch": 0.0638182801514332, + "grad_norm": 2.06467342376709, + "learning_rate": 8.46846846846847e-06, + "loss": 3.5306, + "mean_token_accuracy": 0.32130393385887146, + "num_tokens": 120511358.0, + "step": 236 + }, + { + "epoch": 0.06408869659275283, + "grad_norm": 1.921406626701355, + "learning_rate": 8.504504504504505e-06, + "loss": 3.7195, + "mean_token_accuracy": 0.3248932659626007, + "num_tokens": 121035622.0, + "step": 237 + }, + { + "epoch": 0.06435911303407248, + "grad_norm": 1.7402100563049316, + "learning_rate": 8.540540540540542e-06, + "loss": 3.496, + "mean_token_accuracy": 0.33227407932281494, + "num_tokens": 121559849.0, + "step": 238 + }, + { + "epoch": 0.0646295294753921, + "grad_norm": 1.8767977952957153, + "learning_rate": 8.576576576576577e-06, + "loss": 3.7315, + "mean_token_accuracy": 0.32181516289711, + "num_tokens": 122056150.0, + "step": 239 + }, + { + "epoch": 0.06489994591671173, + "grad_norm": 7.350437164306641, + "learning_rate": 8.612612612612612e-06, + "loss": 3.1593, + "mean_token_accuracy": 0.36935243010520935, + "num_tokens": 122580376.0, + "step": 240 + }, + { + "epoch": 0.06517036235803136, + "grad_norm": 0.8817612528800964, + "learning_rate": 8.64864864864865e-06, + "loss": 1.2664, + "mean_token_accuracy": 0.6897443532943726, + "num_tokens": 123016782.0, + "step": 241 + }, + { + "epoch": 0.06544077879935101, + "grad_norm": 2.8577749729156494, + "learning_rate": 8.684684684684686e-06, + "loss": 3.3829, + "mean_token_accuracy": 0.34561511874198914, + "num_tokens": 123497362.0, + "step": 242 + }, + { + "epoch": 0.06571119524067064, + "grad_norm": 2.1106131076812744, + "learning_rate": 8.720720720720722e-06, + "loss": 3.612, + "mean_token_accuracy": 0.33963555097579956, + "num_tokens": 123969258.0, + "step": 243 + }, + { + "epoch": 0.06598161168199027, + "grad_norm": 2.2066071033477783, + "learning_rate": 8.756756756756759e-06, + "loss": 3.7, + "mean_token_accuracy": 0.33428698778152466, + "num_tokens": 124446311.0, + "step": 244 + }, + { + "epoch": 0.0662520281233099, + "grad_norm": 2.0799624919891357, + "learning_rate": 8.792792792792794e-06, + "loss": 3.803, + "mean_token_accuracy": 0.3223491907119751, + "num_tokens": 124963238.0, + "step": 245 + }, + { + "epoch": 0.06652244456462952, + "grad_norm": 1.7878408432006836, + "learning_rate": 8.82882882882883e-06, + "loss": 3.5753, + "mean_token_accuracy": 0.33781033754348755, + "num_tokens": 125480450.0, + "step": 246 + }, + { + "epoch": 0.06679286100594917, + "grad_norm": 2.0596861839294434, + "learning_rate": 8.864864864864866e-06, + "loss": 3.5859, + "mean_token_accuracy": 0.3316994309425354, + "num_tokens": 126004734.0, + "step": 247 + }, + { + "epoch": 0.0670632774472688, + "grad_norm": 2.2816975116729736, + "learning_rate": 8.900900900900902e-06, + "loss": 3.7384, + "mean_token_accuracy": 0.3284192681312561, + "num_tokens": 126529000.0, + "step": 248 + }, + { + "epoch": 0.06733369388858843, + "grad_norm": 2.782817840576172, + "learning_rate": 8.936936936936937e-06, + "loss": 3.1891, + "mean_token_accuracy": 0.32817524671554565, + "num_tokens": 127053277.0, + "step": 249 + }, + { + "epoch": 0.06760411032990805, + "grad_norm": 2.15238094329834, + "learning_rate": 8.972972972972974e-06, + "loss": 3.5498, + "mean_token_accuracy": 0.32856547832489014, + "num_tokens": 127577424.0, + "step": 250 + }, + { + "epoch": 0.0678745267712277, + "grad_norm": 1.5843677520751953, + "learning_rate": 9.00900900900901e-06, + "loss": 3.6662, + "mean_token_accuracy": 0.32955875992774963, + "num_tokens": 128058622.0, + "step": 251 + }, + { + "epoch": 0.06814494321254733, + "grad_norm": 1.8433129787445068, + "learning_rate": 9.045045045045045e-06, + "loss": 3.7111, + "mean_token_accuracy": 0.32709571719169617, + "num_tokens": 128582903.0, + "step": 252 + }, + { + "epoch": 0.06841535965386696, + "grad_norm": 1.7202582359313965, + "learning_rate": 9.081081081081082e-06, + "loss": 3.7653, + "mean_token_accuracy": 0.3335127830505371, + "num_tokens": 129072012.0, + "step": 253 + }, + { + "epoch": 0.06868577609518658, + "grad_norm": 1.3363375663757324, + "learning_rate": 9.117117117117117e-06, + "loss": 3.6719, + "mean_token_accuracy": 0.3359074294567108, + "num_tokens": 129596261.0, + "step": 254 + }, + { + "epoch": 0.06895619253650621, + "grad_norm": 1.9022183418273926, + "learning_rate": 9.153153153153154e-06, + "loss": 3.55, + "mean_token_accuracy": 0.318397581577301, + "num_tokens": 130073668.0, + "step": 255 + }, + { + "epoch": 0.06922660897782586, + "grad_norm": 1.4958577156066895, + "learning_rate": 9.189189189189191e-06, + "loss": 3.5292, + "mean_token_accuracy": 0.3452869653701782, + "num_tokens": 130597818.0, + "step": 256 + }, + { + "epoch": 0.06949702541914549, + "grad_norm": 1.999333143234253, + "learning_rate": 9.225225225225227e-06, + "loss": 3.515, + "mean_token_accuracy": 0.3357631266117096, + "num_tokens": 131121941.0, + "step": 257 + }, + { + "epoch": 0.06976744186046512, + "grad_norm": 1.7367216348648071, + "learning_rate": 9.261261261261262e-06, + "loss": 3.6091, + "mean_token_accuracy": 0.3834938406944275, + "num_tokens": 131541947.0, + "step": 258 + }, + { + "epoch": 0.07003785830178474, + "grad_norm": 1.9846428632736206, + "learning_rate": 9.297297297297299e-06, + "loss": 3.7315, + "mean_token_accuracy": 0.30015283823013306, + "num_tokens": 132066156.0, + "step": 259 + }, + { + "epoch": 0.07030827474310439, + "grad_norm": 2.342339277267456, + "learning_rate": 9.333333333333334e-06, + "loss": 3.5652, + "mean_token_accuracy": 0.33099421858787537, + "num_tokens": 132590406.0, + "step": 260 + }, + { + "epoch": 0.07057869118442402, + "grad_norm": 0.9564107656478882, + "learning_rate": 9.36936936936937e-06, + "loss": 1.1859, + "mean_token_accuracy": 0.6746491193771362, + "num_tokens": 133114665.0, + "step": 261 + }, + { + "epoch": 0.07084910762574365, + "grad_norm": 2.1726808547973633, + "learning_rate": 9.405405405405407e-06, + "loss": 3.3702, + "mean_token_accuracy": 0.36037999391555786, + "num_tokens": 133563554.0, + "step": 262 + }, + { + "epoch": 0.07111952406706328, + "grad_norm": 2.001310348510742, + "learning_rate": 9.441441441441442e-06, + "loss": 3.4965, + "mean_token_accuracy": 0.3291923999786377, + "num_tokens": 134087800.0, + "step": 263 + }, + { + "epoch": 0.0713899405083829, + "grad_norm": 1.6895627975463867, + "learning_rate": 9.477477477477477e-06, + "loss": 3.5401, + "mean_token_accuracy": 0.33538633584976196, + "num_tokens": 134612065.0, + "step": 264 + }, + { + "epoch": 0.07166035694970255, + "grad_norm": 2.112427234649658, + "learning_rate": 9.513513513513514e-06, + "loss": 3.2352, + "mean_token_accuracy": 0.3653791546821594, + "num_tokens": 135136309.0, + "step": 265 + }, + { + "epoch": 0.07193077339102218, + "grad_norm": 2.861398458480835, + "learning_rate": 9.54954954954955e-06, + "loss": 3.446, + "mean_token_accuracy": 0.32781982421875, + "num_tokens": 135660377.0, + "step": 266 + }, + { + "epoch": 0.0722011898323418, + "grad_norm": 1.7296565771102905, + "learning_rate": 9.585585585585587e-06, + "loss": 3.4097, + "mean_token_accuracy": 0.34647321701049805, + "num_tokens": 136136792.0, + "step": 267 + }, + { + "epoch": 0.07247160627366143, + "grad_norm": 3.610811948776245, + "learning_rate": 9.621621621621622e-06, + "loss": 3.4655, + "mean_token_accuracy": 0.3885596990585327, + "num_tokens": 136596668.0, + "step": 268 + }, + { + "epoch": 0.07274202271498108, + "grad_norm": 3.3108251094818115, + "learning_rate": 9.65765765765766e-06, + "loss": 3.4644, + "mean_token_accuracy": 0.3523981273174286, + "num_tokens": 137120903.0, + "step": 269 + }, + { + "epoch": 0.0730124391563007, + "grad_norm": 2.3054981231689453, + "learning_rate": 9.693693693693694e-06, + "loss": 3.2934, + "mean_token_accuracy": 0.34640398621559143, + "num_tokens": 137645055.0, + "step": 270 + }, + { + "epoch": 0.07328285559762034, + "grad_norm": 2.7956831455230713, + "learning_rate": 9.729729729729732e-06, + "loss": 3.623, + "mean_token_accuracy": 0.3781070113182068, + "num_tokens": 138104389.0, + "step": 271 + }, + { + "epoch": 0.07355327203893997, + "grad_norm": 3.3703818321228027, + "learning_rate": 9.765765765765767e-06, + "loss": 3.3627, + "mean_token_accuracy": 0.3521038889884949, + "num_tokens": 138628557.0, + "step": 272 + }, + { + "epoch": 0.0738236884802596, + "grad_norm": 3.213268995285034, + "learning_rate": 9.801801801801802e-06, + "loss": 3.5219, + "mean_token_accuracy": 0.3480474352836609, + "num_tokens": 139152741.0, + "step": 273 + }, + { + "epoch": 0.07409410492157924, + "grad_norm": 2.524378538131714, + "learning_rate": 9.83783783783784e-06, + "loss": 3.5649, + "mean_token_accuracy": 0.35164064168930054, + "num_tokens": 139621635.0, + "step": 274 + }, + { + "epoch": 0.07436452136289887, + "grad_norm": 3.710583448410034, + "learning_rate": 9.873873873873875e-06, + "loss": 3.6424, + "mean_token_accuracy": 0.3430122137069702, + "num_tokens": 140145873.0, + "step": 275 + }, + { + "epoch": 0.0746349378042185, + "grad_norm": 2.835299253463745, + "learning_rate": 9.90990990990991e-06, + "loss": 3.5459, + "mean_token_accuracy": 0.3772485554218292, + "num_tokens": 140605313.0, + "step": 276 + }, + { + "epoch": 0.07490535424553812, + "grad_norm": 2.3909072875976562, + "learning_rate": 9.945945945945947e-06, + "loss": 3.6404, + "mean_token_accuracy": 0.3444654941558838, + "num_tokens": 141129440.0, + "step": 277 + }, + { + "epoch": 0.07517577068685775, + "grad_norm": 2.537929058074951, + "learning_rate": 9.981981981981982e-06, + "loss": 3.2974, + "mean_token_accuracy": 0.3580111861228943, + "num_tokens": 141605033.0, + "step": 278 + }, + { + "epoch": 0.0754461871281774, + "grad_norm": 2.5878777503967285, + "learning_rate": 1.001801801801802e-05, + "loss": 3.2821, + "mean_token_accuracy": 0.3643641471862793, + "num_tokens": 142129205.0, + "step": 279 + }, + { + "epoch": 0.07571660356949703, + "grad_norm": 4.083503723144531, + "learning_rate": 1.0054054054054055e-05, + "loss": 3.6401, + "mean_token_accuracy": 0.36749744415283203, + "num_tokens": 142653392.0, + "step": 280 + }, + { + "epoch": 0.07598702001081666, + "grad_norm": 1.0379648208618164, + "learning_rate": 1.009009009009009e-05, + "loss": 1.1907, + "mean_token_accuracy": 0.6790620684623718, + "num_tokens": 143177643.0, + "step": 281 + }, + { + "epoch": 0.07625743645213628, + "grad_norm": 2.904026985168457, + "learning_rate": 1.0126126126126127e-05, + "loss": 3.7078, + "mean_token_accuracy": 0.32802435755729675, + "num_tokens": 143701922.0, + "step": 282 + }, + { + "epoch": 0.07652785289345593, + "grad_norm": 2.367011547088623, + "learning_rate": 1.0162162162162164e-05, + "loss": 3.6632, + "mean_token_accuracy": 0.32023105025291443, + "num_tokens": 144226111.0, + "step": 283 + }, + { + "epoch": 0.07679826933477556, + "grad_norm": 1.9431264400482178, + "learning_rate": 1.01981981981982e-05, + "loss": 3.2424, + "mean_token_accuracy": 0.36453235149383545, + "num_tokens": 144750329.0, + "step": 284 + }, + { + "epoch": 0.07706868577609519, + "grad_norm": 2.3565514087677, + "learning_rate": 1.0234234234234235e-05, + "loss": 3.2848, + "mean_token_accuracy": 0.3651161789894104, + "num_tokens": 145274553.0, + "step": 285 + }, + { + "epoch": 0.07733910221741482, + "grad_norm": 3.165166139602661, + "learning_rate": 1.027027027027027e-05, + "loss": 3.0817, + "mean_token_accuracy": 0.3923535943031311, + "num_tokens": 145786475.0, + "step": 286 + }, + { + "epoch": 0.07760951865873444, + "grad_norm": 2.386357069015503, + "learning_rate": 1.0306306306306305e-05, + "loss": 3.3252, + "mean_token_accuracy": 0.3601371645927429, + "num_tokens": 146291480.0, + "step": 287 + }, + { + "epoch": 0.07787993510005409, + "grad_norm": 1.948519229888916, + "learning_rate": 1.0342342342342344e-05, + "loss": 3.5729, + "mean_token_accuracy": 0.3407531976699829, + "num_tokens": 146815538.0, + "step": 288 + }, + { + "epoch": 0.07815035154137372, + "grad_norm": 2.2419817447662354, + "learning_rate": 1.037837837837838e-05, + "loss": 3.2285, + "mean_token_accuracy": 0.38131409883499146, + "num_tokens": 147295956.0, + "step": 289 + }, + { + "epoch": 0.07842076798269335, + "grad_norm": 2.0768163204193115, + "learning_rate": 1.0414414414414415e-05, + "loss": 3.4793, + "mean_token_accuracy": 0.3738369643688202, + "num_tokens": 147748938.0, + "step": 290 + }, + { + "epoch": 0.07869118442401297, + "grad_norm": 2.526585817337036, + "learning_rate": 1.0450450450450452e-05, + "loss": 3.4844, + "mean_token_accuracy": 0.3907634913921356, + "num_tokens": 148170030.0, + "step": 291 + }, + { + "epoch": 0.07896160086533262, + "grad_norm": 2.61186146736145, + "learning_rate": 1.0486486486486487e-05, + "loss": 3.4141, + "mean_token_accuracy": 0.3650810420513153, + "num_tokens": 148628511.0, + "step": 292 + }, + { + "epoch": 0.07923201730665225, + "grad_norm": 2.5316689014434814, + "learning_rate": 1.0522522522522523e-05, + "loss": 3.4643, + "mean_token_accuracy": 0.3566904067993164, + "num_tokens": 149152790.0, + "step": 293 + }, + { + "epoch": 0.07950243374797188, + "grad_norm": 3.6856749057769775, + "learning_rate": 1.055855855855856e-05, + "loss": 3.3123, + "mean_token_accuracy": 0.37233051657676697, + "num_tokens": 149676971.0, + "step": 294 + }, + { + "epoch": 0.0797728501892915, + "grad_norm": 3.013028383255005, + "learning_rate": 1.0594594594594597e-05, + "loss": 3.4978, + "mean_token_accuracy": 0.3333134651184082, + "num_tokens": 150201229.0, + "step": 295 + }, + { + "epoch": 0.08004326663061113, + "grad_norm": 2.2327687740325928, + "learning_rate": 1.0630630630630632e-05, + "loss": 3.1265, + "mean_token_accuracy": 0.392192006111145, + "num_tokens": 150725476.0, + "step": 296 + }, + { + "epoch": 0.08031368307193078, + "grad_norm": 3.4038162231445312, + "learning_rate": 1.0666666666666667e-05, + "loss": 3.4077, + "mean_token_accuracy": 0.36377599835395813, + "num_tokens": 151197117.0, + "step": 297 + }, + { + "epoch": 0.0805840995132504, + "grad_norm": 2.4157423973083496, + "learning_rate": 1.0702702702702703e-05, + "loss": 3.4282, + "mean_token_accuracy": 0.3608620762825012, + "num_tokens": 151721299.0, + "step": 298 + }, + { + "epoch": 0.08085451595457004, + "grad_norm": 1.9519281387329102, + "learning_rate": 1.0738738738738738e-05, + "loss": 3.4968, + "mean_token_accuracy": 0.34825682640075684, + "num_tokens": 152245447.0, + "step": 299 + }, + { + "epoch": 0.08112493239588967, + "grad_norm": 3.255866765975952, + "learning_rate": 1.0774774774774777e-05, + "loss": 3.343, + "mean_token_accuracy": 0.41033321619033813, + "num_tokens": 152704241.0, + "step": 300 + }, + { + "epoch": 0.08139534883720931, + "grad_norm": 1.2285887002944946, + "learning_rate": 1.0810810810810812e-05, + "loss": 1.1509, + "mean_token_accuracy": 0.7076197266578674, + "num_tokens": 153196089.0, + "step": 301 + }, + { + "epoch": 0.08166576527852894, + "grad_norm": 3.8446884155273438, + "learning_rate": 1.0846846846846847e-05, + "loss": 3.382, + "mean_token_accuracy": 0.3755962550640106, + "num_tokens": 153720293.0, + "step": 302 + }, + { + "epoch": 0.08193618171984857, + "grad_norm": 3.008420467376709, + "learning_rate": 1.0882882882882884e-05, + "loss": 3.246, + "mean_token_accuracy": 0.41439124941825867, + "num_tokens": 154133919.0, + "step": 303 + }, + { + "epoch": 0.0822065981611682, + "grad_norm": 2.1216137409210205, + "learning_rate": 1.091891891891892e-05, + "loss": 3.5324, + "mean_token_accuracy": 0.3411775231361389, + "num_tokens": 154658164.0, + "step": 304 + }, + { + "epoch": 0.08247701460248782, + "grad_norm": 2.9312360286712646, + "learning_rate": 1.0954954954954955e-05, + "loss": 3.3026, + "mean_token_accuracy": 0.36989688873291016, + "num_tokens": 155127255.0, + "step": 305 + }, + { + "epoch": 0.08274743104380747, + "grad_norm": 2.907892942428589, + "learning_rate": 1.0990990990990992e-05, + "loss": 3.4534, + "mean_token_accuracy": 0.3527883291244507, + "num_tokens": 155651405.0, + "step": 306 + }, + { + "epoch": 0.0830178474851271, + "grad_norm": 2.3893730640411377, + "learning_rate": 1.102702702702703e-05, + "loss": 3.2444, + "mean_token_accuracy": 0.36414486169815063, + "num_tokens": 156175689.0, + "step": 307 + }, + { + "epoch": 0.08328826392644673, + "grad_norm": 2.494453191757202, + "learning_rate": 1.1063063063063065e-05, + "loss": 3.2726, + "mean_token_accuracy": 0.36479926109313965, + "num_tokens": 156699937.0, + "step": 308 + }, + { + "epoch": 0.08355868036776636, + "grad_norm": 1.9356884956359863, + "learning_rate": 1.10990990990991e-05, + "loss": 3.4759, + "mean_token_accuracy": 0.3517117500305176, + "num_tokens": 157224144.0, + "step": 309 + }, + { + "epoch": 0.083829096809086, + "grad_norm": 2.468442678451538, + "learning_rate": 1.1135135135135135e-05, + "loss": 3.4131, + "mean_token_accuracy": 0.3628694415092468, + "num_tokens": 157748328.0, + "step": 310 + }, + { + "epoch": 0.08409951325040563, + "grad_norm": 2.340104579925537, + "learning_rate": 1.117117117117117e-05, + "loss": 3.4643, + "mean_token_accuracy": 0.3595651090145111, + "num_tokens": 158229813.0, + "step": 311 + }, + { + "epoch": 0.08436992969172526, + "grad_norm": 1.6359436511993408, + "learning_rate": 1.120720720720721e-05, + "loss": 3.371, + "mean_token_accuracy": 0.3598788380622864, + "num_tokens": 158754076.0, + "step": 312 + }, + { + "epoch": 0.08464034613304489, + "grad_norm": 2.02172589302063, + "learning_rate": 1.1243243243243245e-05, + "loss": 3.5227, + "mean_token_accuracy": 0.36278730630874634, + "num_tokens": 159239857.0, + "step": 313 + }, + { + "epoch": 0.08491076257436452, + "grad_norm": 1.8778430223464966, + "learning_rate": 1.127927927927928e-05, + "loss": 3.5077, + "mean_token_accuracy": 0.35900288820266724, + "num_tokens": 159757956.0, + "step": 314 + }, + { + "epoch": 0.08518117901568416, + "grad_norm": 2.0315608978271484, + "learning_rate": 1.1315315315315315e-05, + "loss": 3.7018, + "mean_token_accuracy": 0.33159300684928894, + "num_tokens": 160223980.0, + "step": 315 + }, + { + "epoch": 0.08545159545700379, + "grad_norm": 1.9374598264694214, + "learning_rate": 1.1351351351351352e-05, + "loss": 3.3721, + "mean_token_accuracy": 0.36356112360954285, + "num_tokens": 160711704.0, + "step": 316 + }, + { + "epoch": 0.08572201189832342, + "grad_norm": 1.6118264198303223, + "learning_rate": 1.1387387387387388e-05, + "loss": 3.2107, + "mean_token_accuracy": 0.36253994703292847, + "num_tokens": 161235927.0, + "step": 317 + }, + { + "epoch": 0.08599242833964305, + "grad_norm": 2.0894174575805664, + "learning_rate": 1.1423423423423425e-05, + "loss": 3.296, + "mean_token_accuracy": 0.3582703173160553, + "num_tokens": 161760056.0, + "step": 318 + }, + { + "epoch": 0.08626284478096269, + "grad_norm": 1.8950645923614502, + "learning_rate": 1.1459459459459462e-05, + "loss": 3.39, + "mean_token_accuracy": 0.3545306921005249, + "num_tokens": 162284149.0, + "step": 319 + }, + { + "epoch": 0.08653326122228232, + "grad_norm": 1.9543299674987793, + "learning_rate": 1.1495495495495497e-05, + "loss": 3.346, + "mean_token_accuracy": 0.371843159198761, + "num_tokens": 162751893.0, + "step": 320 + }, + { + "epoch": 0.08680367766360195, + "grad_norm": 1.2912145853042603, + "learning_rate": 1.1531531531531532e-05, + "loss": 1.1065, + "mean_token_accuracy": 0.7024353742599487, + "num_tokens": 163219551.0, + "step": 321 + }, + { + "epoch": 0.08707409410492158, + "grad_norm": 2.7105753421783447, + "learning_rate": 1.1567567567567568e-05, + "loss": 3.0866, + "mean_token_accuracy": 0.3940880298614502, + "num_tokens": 163709907.0, + "step": 322 + }, + { + "epoch": 0.0873445105462412, + "grad_norm": 2.5048141479492188, + "learning_rate": 1.1603603603603603e-05, + "loss": 3.5601, + "mean_token_accuracy": 0.35085466504096985, + "num_tokens": 164234175.0, + "step": 323 + }, + { + "epoch": 0.08761492698756085, + "grad_norm": 2.5593478679656982, + "learning_rate": 1.1639639639639642e-05, + "loss": 3.3626, + "mean_token_accuracy": 0.35863855481147766, + "num_tokens": 164758395.0, + "step": 324 + }, + { + "epoch": 0.08788534342888048, + "grad_norm": 3.6132993698120117, + "learning_rate": 1.1675675675675677e-05, + "loss": 3.2502, + "mean_token_accuracy": 0.3870399594306946, + "num_tokens": 165238063.0, + "step": 325 + }, + { + "epoch": 0.0881557598702001, + "grad_norm": 2.2145371437072754, + "learning_rate": 1.1711711711711713e-05, + "loss": 3.244, + "mean_token_accuracy": 0.3599753677845001, + "num_tokens": 165762299.0, + "step": 326 + }, + { + "epoch": 0.08842617631151974, + "grad_norm": 2.3969719409942627, + "learning_rate": 1.1747747747747748e-05, + "loss": 3.1817, + "mean_token_accuracy": 0.37249717116355896, + "num_tokens": 166286543.0, + "step": 327 + }, + { + "epoch": 0.08869659275283938, + "grad_norm": 2.8735625743865967, + "learning_rate": 1.1783783783783785e-05, + "loss": 3.5788, + "mean_token_accuracy": 0.3469610810279846, + "num_tokens": 166810812.0, + "step": 328 + }, + { + "epoch": 0.08896700919415901, + "grad_norm": 2.524768352508545, + "learning_rate": 1.181981981981982e-05, + "loss": 3.0382, + "mean_token_accuracy": 0.3796457350254059, + "num_tokens": 167334981.0, + "step": 329 + }, + { + "epoch": 0.08923742563547864, + "grad_norm": 3.861762285232544, + "learning_rate": 1.1855855855855857e-05, + "loss": 2.9645, + "mean_token_accuracy": 0.3937443494796753, + "num_tokens": 167859208.0, + "step": 330 + }, + { + "epoch": 0.08950784207679827, + "grad_norm": 2.1102490425109863, + "learning_rate": 1.1891891891891894e-05, + "loss": 3.3137, + "mean_token_accuracy": 0.3790540397167206, + "num_tokens": 168383488.0, + "step": 331 + }, + { + "epoch": 0.0897782585181179, + "grad_norm": 4.283322811126709, + "learning_rate": 1.192792792792793e-05, + "loss": 3.4094, + "mean_token_accuracy": 0.39128026366233826, + "num_tokens": 168848238.0, + "step": 332 + }, + { + "epoch": 0.09004867495943754, + "grad_norm": 2.417846202850342, + "learning_rate": 1.1963963963963965e-05, + "loss": 3.1699, + "mean_token_accuracy": 0.38125574588775635, + "num_tokens": 169351405.0, + "step": 333 + }, + { + "epoch": 0.09031909140075717, + "grad_norm": 2.1442372798919678, + "learning_rate": 1.2e-05, + "loss": 3.5795, + "mean_token_accuracy": 0.3314664363861084, + "num_tokens": 169875667.0, + "step": 334 + }, + { + "epoch": 0.0905895078420768, + "grad_norm": 2.605917453765869, + "learning_rate": 1.2036036036036036e-05, + "loss": 3.4062, + "mean_token_accuracy": 0.3546343147754669, + "num_tokens": 170399946.0, + "step": 335 + }, + { + "epoch": 0.09085992428339643, + "grad_norm": 2.8149173259735107, + "learning_rate": 1.2072072072072074e-05, + "loss": 3.3109, + "mean_token_accuracy": 0.3638969361782074, + "num_tokens": 170924110.0, + "step": 336 + }, + { + "epoch": 0.09113034072471607, + "grad_norm": 2.4338510036468506, + "learning_rate": 1.210810810810811e-05, + "loss": 3.2329, + "mean_token_accuracy": 0.38032329082489014, + "num_tokens": 171448286.0, + "step": 337 + }, + { + "epoch": 0.0914007571660357, + "grad_norm": 4.633322715759277, + "learning_rate": 1.2144144144144145e-05, + "loss": 3.2489, + "mean_token_accuracy": 0.3696860671043396, + "num_tokens": 171972522.0, + "step": 338 + }, + { + "epoch": 0.09167117360735533, + "grad_norm": 5.035653591156006, + "learning_rate": 1.218018018018018e-05, + "loss": 3.3058, + "mean_token_accuracy": 0.3624318540096283, + "num_tokens": 172496796.0, + "step": 339 + }, + { + "epoch": 0.09194159004867496, + "grad_norm": 3.1259703636169434, + "learning_rate": 1.2216216216216217e-05, + "loss": 3.2966, + "mean_token_accuracy": 0.3616825342178345, + "num_tokens": 173020996.0, + "step": 340 + }, + { + "epoch": 0.09221200648999459, + "grad_norm": 1.4647387266159058, + "learning_rate": 1.2252252252252253e-05, + "loss": 1.3153, + "mean_token_accuracy": 0.6639432907104492, + "num_tokens": 173545161.0, + "step": 341 + }, + { + "epoch": 0.09248242293131423, + "grad_norm": 5.8377275466918945, + "learning_rate": 1.228828828828829e-05, + "loss": 3.4874, + "mean_token_accuracy": 0.366782009601593, + "num_tokens": 174006520.0, + "step": 342 + }, + { + "epoch": 0.09275283937263386, + "grad_norm": 5.7174296379089355, + "learning_rate": 1.2324324324324327e-05, + "loss": 3.3334, + "mean_token_accuracy": 0.3618483543395996, + "num_tokens": 174496678.0, + "step": 343 + }, + { + "epoch": 0.09302325581395349, + "grad_norm": 5.507826328277588, + "learning_rate": 1.2360360360360362e-05, + "loss": 2.9481, + "mean_token_accuracy": 0.4024239778518677, + "num_tokens": 174916980.0, + "step": 344 + }, + { + "epoch": 0.09329367225527312, + "grad_norm": 2.935412883758545, + "learning_rate": 1.2396396396396398e-05, + "loss": 3.3339, + "mean_token_accuracy": 0.35410621762275696, + "num_tokens": 175441258.0, + "step": 345 + }, + { + "epoch": 0.09356408869659276, + "grad_norm": 4.697361946105957, + "learning_rate": 1.2432432432432433e-05, + "loss": 3.1706, + "mean_token_accuracy": 0.3893749713897705, + "num_tokens": 175965496.0, + "step": 346 + }, + { + "epoch": 0.09383450513791239, + "grad_norm": 7.272116661071777, + "learning_rate": 1.2468468468468468e-05, + "loss": 3.3775, + "mean_token_accuracy": 0.365527480840683, + "num_tokens": 176489780.0, + "step": 347 + }, + { + "epoch": 0.09410492157923202, + "grad_norm": 4.873509407043457, + "learning_rate": 1.2504504504504507e-05, + "loss": 3.5334, + "mean_token_accuracy": 0.3530566692352295, + "num_tokens": 177013967.0, + "step": 348 + }, + { + "epoch": 0.09437533802055165, + "grad_norm": 3.16273832321167, + "learning_rate": 1.2540540540540542e-05, + "loss": 3.1621, + "mean_token_accuracy": 0.3822454810142517, + "num_tokens": 177538242.0, + "step": 349 + }, + { + "epoch": 0.09464575446187128, + "grad_norm": 4.409372329711914, + "learning_rate": 1.2576576576576578e-05, + "loss": 3.0813, + "mean_token_accuracy": 0.3840554356575012, + "num_tokens": 178062289.0, + "step": 350 + }, + { + "epoch": 0.09491617090319092, + "grad_norm": 4.263370990753174, + "learning_rate": 1.2612612612612613e-05, + "loss": 3.1884, + "mean_token_accuracy": 0.37988823652267456, + "num_tokens": 178586413.0, + "step": 351 + }, + { + "epoch": 0.09518658734451055, + "grad_norm": 3.0288288593292236, + "learning_rate": 1.264864864864865e-05, + "loss": 3.0915, + "mean_token_accuracy": 0.38795822858810425, + "num_tokens": 179110675.0, + "step": 352 + }, + { + "epoch": 0.09545700378583018, + "grad_norm": 3.223400354385376, + "learning_rate": 1.2684684684684685e-05, + "loss": 3.3281, + "mean_token_accuracy": 0.36517658829689026, + "num_tokens": 179634893.0, + "step": 353 + }, + { + "epoch": 0.0957274202271498, + "grad_norm": 2.380676746368408, + "learning_rate": 1.2720720720720722e-05, + "loss": 3.4861, + "mean_token_accuracy": 0.3684387803077698, + "num_tokens": 180159147.0, + "step": 354 + }, + { + "epoch": 0.09599783666846945, + "grad_norm": 2.773383378982544, + "learning_rate": 1.2756756756756758e-05, + "loss": 3.25, + "mean_token_accuracy": 0.3778010606765747, + "num_tokens": 180678603.0, + "step": 355 + }, + { + "epoch": 0.09626825310978908, + "grad_norm": 2.911902904510498, + "learning_rate": 1.2792792792792795e-05, + "loss": 3.152, + "mean_token_accuracy": 0.3867396116256714, + "num_tokens": 181176206.0, + "step": 356 + }, + { + "epoch": 0.09653866955110871, + "grad_norm": 2.699197292327881, + "learning_rate": 1.282882882882883e-05, + "loss": 3.2807, + "mean_token_accuracy": 0.36368435621261597, + "num_tokens": 181700344.0, + "step": 357 + }, + { + "epoch": 0.09680908599242834, + "grad_norm": 2.8612701892852783, + "learning_rate": 1.2864864864864865e-05, + "loss": 3.3261, + "mean_token_accuracy": 0.37491375207901, + "num_tokens": 182224509.0, + "step": 358 + }, + { + "epoch": 0.09707950243374797, + "grad_norm": 3.4115307331085205, + "learning_rate": 1.29009009009009e-05, + "loss": 3.3666, + "mean_token_accuracy": 0.38962680101394653, + "num_tokens": 182690643.0, + "step": 359 + }, + { + "epoch": 0.09734991887506761, + "grad_norm": 2.688364267349243, + "learning_rate": 1.293693693693694e-05, + "loss": 3.496, + "mean_token_accuracy": 0.37074992060661316, + "num_tokens": 183195289.0, + "step": 360 + }, + { + "epoch": 0.09762033531638724, + "grad_norm": 1.678063988685608, + "learning_rate": 1.2972972972972975e-05, + "loss": 1.1802, + "mean_token_accuracy": 0.6895753145217896, + "num_tokens": 183719471.0, + "step": 361 + }, + { + "epoch": 0.09789075175770687, + "grad_norm": 5.011818885803223, + "learning_rate": 1.300900900900901e-05, + "loss": 3.4073, + "mean_token_accuracy": 0.3772871196269989, + "num_tokens": 184190385.0, + "step": 362 + }, + { + "epoch": 0.0981611681990265, + "grad_norm": 5.071710586547852, + "learning_rate": 1.3045045045045045e-05, + "loss": 3.2077, + "mean_token_accuracy": 0.4030190706253052, + "num_tokens": 184714613.0, + "step": 363 + }, + { + "epoch": 0.09843158464034614, + "grad_norm": 2.488710641860962, + "learning_rate": 1.3081081081081083e-05, + "loss": 3.5099, + "mean_token_accuracy": 0.35411161184310913, + "num_tokens": 185238891.0, + "step": 364 + }, + { + "epoch": 0.09870200108166577, + "grad_norm": 1.807631015777588, + "learning_rate": 1.3117117117117118e-05, + "loss": 2.9622, + "mean_token_accuracy": 0.37867850065231323, + "num_tokens": 185763043.0, + "step": 365 + }, + { + "epoch": 0.0989724175229854, + "grad_norm": 3.5989184379577637, + "learning_rate": 1.3153153153153155e-05, + "loss": 3.4402, + "mean_token_accuracy": 0.35903269052505493, + "num_tokens": 186287311.0, + "step": 366 + }, + { + "epoch": 0.09924283396430503, + "grad_norm": 3.107598304748535, + "learning_rate": 1.318918918918919e-05, + "loss": 3.2081, + "mean_token_accuracy": 0.380618155002594, + "num_tokens": 186811524.0, + "step": 367 + }, + { + "epoch": 0.09951325040562466, + "grad_norm": 2.6440329551696777, + "learning_rate": 1.3225225225225227e-05, + "loss": 3.4741, + "mean_token_accuracy": 0.35641682147979736, + "num_tokens": 187335756.0, + "step": 368 + }, + { + "epoch": 0.0997836668469443, + "grad_norm": 2.4320273399353027, + "learning_rate": 1.3261261261261263e-05, + "loss": 2.9967, + "mean_token_accuracy": 0.3875637650489807, + "num_tokens": 187860017.0, + "step": 369 + }, + { + "epoch": 0.10005408328826393, + "grad_norm": 2.980323553085327, + "learning_rate": 1.3297297297297298e-05, + "loss": 3.4053, + "mean_token_accuracy": 0.3626766800880432, + "num_tokens": 188384203.0, + "step": 370 + }, + { + "epoch": 0.10032449972958356, + "grad_norm": 2.645524024963379, + "learning_rate": 1.3333333333333333e-05, + "loss": 2.9455, + "mean_token_accuracy": 0.39733222126960754, + "num_tokens": 188903646.0, + "step": 371 + }, + { + "epoch": 0.10059491617090319, + "grad_norm": 2.143906831741333, + "learning_rate": 1.3369369369369369e-05, + "loss": 3.2992, + "mean_token_accuracy": 0.37957268953323364, + "num_tokens": 189385535.0, + "step": 372 + }, + { + "epoch": 0.10086533261222283, + "grad_norm": 1.8058979511260986, + "learning_rate": 1.3405405405405407e-05, + "loss": 3.1984, + "mean_token_accuracy": 0.3893955945968628, + "num_tokens": 189900943.0, + "step": 373 + }, + { + "epoch": 0.10113574905354246, + "grad_norm": 1.5696027278900146, + "learning_rate": 1.3441441441441443e-05, + "loss": 3.2569, + "mean_token_accuracy": 0.36697039008140564, + "num_tokens": 190425221.0, + "step": 374 + }, + { + "epoch": 0.10140616549486209, + "grad_norm": 2.1615583896636963, + "learning_rate": 1.3477477477477478e-05, + "loss": 3.2512, + "mean_token_accuracy": 0.41224899888038635, + "num_tokens": 190834197.0, + "step": 375 + }, + { + "epoch": 0.10167658193618172, + "grad_norm": 1.5635559558868408, + "learning_rate": 1.3513513513513515e-05, + "loss": 3.0637, + "mean_token_accuracy": 0.38607141375541687, + "num_tokens": 191319760.0, + "step": 376 + }, + { + "epoch": 0.10194699837750135, + "grad_norm": 1.7313528060913086, + "learning_rate": 1.354954954954955e-05, + "loss": 3.3772, + "mean_token_accuracy": 0.3733062744140625, + "num_tokens": 191843851.0, + "step": 377 + }, + { + "epoch": 0.10221741481882099, + "grad_norm": 1.7380986213684082, + "learning_rate": 1.3585585585585586e-05, + "loss": 3.382, + "mean_token_accuracy": 0.3677327036857605, + "num_tokens": 192368129.0, + "step": 378 + }, + { + "epoch": 0.10248783126014062, + "grad_norm": 1.9631719589233398, + "learning_rate": 1.3621621621621623e-05, + "loss": 3.246, + "mean_token_accuracy": 0.3697909414768219, + "num_tokens": 192837230.0, + "step": 379 + }, + { + "epoch": 0.10275824770146025, + "grad_norm": 1.795838713645935, + "learning_rate": 1.365765765765766e-05, + "loss": 3.1688, + "mean_token_accuracy": 0.4212944507598877, + "num_tokens": 193297675.0, + "step": 380 + }, + { + "epoch": 0.10302866414277988, + "grad_norm": 1.2834819555282593, + "learning_rate": 1.3693693693693695e-05, + "loss": 1.2246, + "mean_token_accuracy": 0.6745494604110718, + "num_tokens": 193821949.0, + "step": 381 + }, + { + "epoch": 0.1032990805840995, + "grad_norm": 3.2808914184570312, + "learning_rate": 1.372972972972973e-05, + "loss": 3.259, + "mean_token_accuracy": 0.3870149254798889, + "num_tokens": 194296337.0, + "step": 382 + }, + { + "epoch": 0.10356949702541915, + "grad_norm": 2.6356818675994873, + "learning_rate": 1.3765765765765766e-05, + "loss": 3.1264, + "mean_token_accuracy": 0.39574557542800903, + "num_tokens": 194820555.0, + "step": 383 + }, + { + "epoch": 0.10383991346673878, + "grad_norm": 2.3627214431762695, + "learning_rate": 1.3801801801801801e-05, + "loss": 3.2276, + "mean_token_accuracy": 0.3872603178024292, + "num_tokens": 195344839.0, + "step": 384 + }, + { + "epoch": 0.10411032990805841, + "grad_norm": 2.5282745361328125, + "learning_rate": 1.383783783783784e-05, + "loss": 3.2377, + "mean_token_accuracy": 0.3974454402923584, + "num_tokens": 195812164.0, + "step": 385 + }, + { + "epoch": 0.10438074634937804, + "grad_norm": 1.8099793195724487, + "learning_rate": 1.3873873873873875e-05, + "loss": 3.3154, + "mean_token_accuracy": 0.3887695372104645, + "num_tokens": 196298158.0, + "step": 386 + }, + { + "epoch": 0.10465116279069768, + "grad_norm": 2.19936466217041, + "learning_rate": 1.390990990990991e-05, + "loss": 3.317, + "mean_token_accuracy": 0.37372148036956787, + "num_tokens": 196822361.0, + "step": 387 + }, + { + "epoch": 0.10492157923201731, + "grad_norm": 2.581470489501953, + "learning_rate": 1.3945945945945946e-05, + "loss": 3.3595, + "mean_token_accuracy": 0.38312941789627075, + "num_tokens": 197346544.0, + "step": 388 + }, + { + "epoch": 0.10519199567333694, + "grad_norm": 2.261378526687622, + "learning_rate": 1.3981981981981983e-05, + "loss": 3.1759, + "mean_token_accuracy": 0.38640713691711426, + "num_tokens": 197870826.0, + "step": 389 + }, + { + "epoch": 0.10546241211465657, + "grad_norm": 1.9628186225891113, + "learning_rate": 1.4018018018018018e-05, + "loss": 2.9795, + "mean_token_accuracy": 0.39931079745292664, + "num_tokens": 198394977.0, + "step": 390 + }, + { + "epoch": 0.1057328285559762, + "grad_norm": 1.5381832122802734, + "learning_rate": 1.4054054054054055e-05, + "loss": 3.0237, + "mean_token_accuracy": 0.3909919559955597, + "num_tokens": 198919164.0, + "step": 391 + }, + { + "epoch": 0.10600324499729584, + "grad_norm": 1.9984492063522339, + "learning_rate": 1.4090090090090092e-05, + "loss": 3.5348, + "mean_token_accuracy": 0.3697656989097595, + "num_tokens": 199443442.0, + "step": 392 + }, + { + "epoch": 0.10627366143861547, + "grad_norm": 7.963676452636719, + "learning_rate": 1.4126126126126128e-05, + "loss": 3.0864, + "mean_token_accuracy": 0.39496952295303345, + "num_tokens": 199967515.0, + "step": 393 + }, + { + "epoch": 0.1065440778799351, + "grad_norm": 3.044731855392456, + "learning_rate": 1.4162162162162163e-05, + "loss": 3.2371, + "mean_token_accuracy": 0.3768002688884735, + "num_tokens": 200491047.0, + "step": 394 + }, + { + "epoch": 0.10681449432125473, + "grad_norm": 2.3241755962371826, + "learning_rate": 1.4198198198198198e-05, + "loss": 3.3057, + "mean_token_accuracy": 0.37639951705932617, + "num_tokens": 201015228.0, + "step": 395 + }, + { + "epoch": 0.10708491076257437, + "grad_norm": 1.8142709732055664, + "learning_rate": 1.4234234234234234e-05, + "loss": 3.2887, + "mean_token_accuracy": 0.3807913064956665, + "num_tokens": 201539357.0, + "step": 396 + }, + { + "epoch": 0.107355327203894, + "grad_norm": 10.864251136779785, + "learning_rate": 1.4270270270270272e-05, + "loss": 3.0154, + "mean_token_accuracy": 0.4556397795677185, + "num_tokens": 202063639.0, + "step": 397 + }, + { + "epoch": 0.10762574364521363, + "grad_norm": 2.5269887447357178, + "learning_rate": 1.4306306306306308e-05, + "loss": 3.1995, + "mean_token_accuracy": 0.3952258229255676, + "num_tokens": 202587701.0, + "step": 398 + }, + { + "epoch": 0.10789616008653326, + "grad_norm": 1.912433385848999, + "learning_rate": 1.4342342342342343e-05, + "loss": 3.096, + "mean_token_accuracy": 0.39819440245628357, + "num_tokens": 203088357.0, + "step": 399 + }, + { + "epoch": 0.10816657652785289, + "grad_norm": 2.447415828704834, + "learning_rate": 1.4378378378378378e-05, + "loss": 2.8448, + "mean_token_accuracy": 0.4159303307533264, + "num_tokens": 203612484.0, + "step": 400 + }, + { + "epoch": 0.10843699296917253, + "grad_norm": 2.17952561378479, + "learning_rate": 1.4414414414414416e-05, + "loss": 1.367, + "mean_token_accuracy": 0.6615312099456787, + "num_tokens": 204136753.0, + "step": 401 + }, + { + "epoch": 0.10870740941049216, + "grad_norm": 4.238644123077393, + "learning_rate": 1.4450450450450451e-05, + "loss": 3.3444, + "mean_token_accuracy": 0.3966636061668396, + "num_tokens": 204568560.0, + "step": 402 + }, + { + "epoch": 0.10897782585181179, + "grad_norm": 3.0403876304626465, + "learning_rate": 1.4486486486486488e-05, + "loss": 3.0765, + "mean_token_accuracy": 0.42813295125961304, + "num_tokens": 205031094.0, + "step": 403 + }, + { + "epoch": 0.10924824229313142, + "grad_norm": 2.6850831508636475, + "learning_rate": 1.4522522522522525e-05, + "loss": 2.9726, + "mean_token_accuracy": 0.40957748889923096, + "num_tokens": 205555314.0, + "step": 404 + }, + { + "epoch": 0.10951865873445106, + "grad_norm": 3.0229058265686035, + "learning_rate": 1.455855855855856e-05, + "loss": 3.2079, + "mean_token_accuracy": 0.40387898683547974, + "num_tokens": 206045747.0, + "step": 405 + }, + { + "epoch": 0.10978907517577069, + "grad_norm": 2.4910805225372314, + "learning_rate": 1.4594594594594596e-05, + "loss": 3.1318, + "mean_token_accuracy": 0.3895113468170166, + "num_tokens": 206564284.0, + "step": 406 + }, + { + "epoch": 0.11005949161709032, + "grad_norm": 3.1487834453582764, + "learning_rate": 1.4630630630630631e-05, + "loss": 2.8442, + "mean_token_accuracy": 0.41587138175964355, + "num_tokens": 207068821.0, + "step": 407 + }, + { + "epoch": 0.11032990805840995, + "grad_norm": 3.2748842239379883, + "learning_rate": 1.4666666666666666e-05, + "loss": 3.1675, + "mean_token_accuracy": 0.41177263855934143, + "num_tokens": 207592890.0, + "step": 408 + }, + { + "epoch": 0.11060032449972958, + "grad_norm": 2.42340087890625, + "learning_rate": 1.4702702702702705e-05, + "loss": 3.2009, + "mean_token_accuracy": 0.38569822907447815, + "num_tokens": 208116969.0, + "step": 409 + }, + { + "epoch": 0.11087074094104922, + "grad_norm": 3.155510902404785, + "learning_rate": 1.473873873873874e-05, + "loss": 3.0587, + "mean_token_accuracy": 0.3946463167667389, + "num_tokens": 208641248.0, + "step": 410 + }, + { + "epoch": 0.11114115738236885, + "grad_norm": 35.984676361083984, + "learning_rate": 1.4774774774774776e-05, + "loss": 3.5326, + "mean_token_accuracy": 0.3660382032394409, + "num_tokens": 209165530.0, + "step": 411 + }, + { + "epoch": 0.11141157382368848, + "grad_norm": 3.0454163551330566, + "learning_rate": 1.4810810810810811e-05, + "loss": 2.9778, + "mean_token_accuracy": 0.399384081363678, + "num_tokens": 209689756.0, + "step": 412 + }, + { + "epoch": 0.11168199026500811, + "grad_norm": 2.8983755111694336, + "learning_rate": 1.4846846846846848e-05, + "loss": 3.0225, + "mean_token_accuracy": 0.4169549345970154, + "num_tokens": 210168984.0, + "step": 413 + }, + { + "epoch": 0.11195240670632775, + "grad_norm": 2.5540952682495117, + "learning_rate": 1.4882882882882883e-05, + "loss": 2.8718, + "mean_token_accuracy": 0.4142993986606598, + "num_tokens": 210693189.0, + "step": 414 + }, + { + "epoch": 0.11222282314764738, + "grad_norm": 3.1511847972869873, + "learning_rate": 1.491891891891892e-05, + "loss": 3.1746, + "mean_token_accuracy": 0.38840025663375854, + "num_tokens": 211217406.0, + "step": 415 + }, + { + "epoch": 0.11249323958896701, + "grad_norm": 3.5532093048095703, + "learning_rate": 1.4954954954954957e-05, + "loss": 3.1388, + "mean_token_accuracy": 0.39322394132614136, + "num_tokens": 211710689.0, + "step": 416 + }, + { + "epoch": 0.11276365603028664, + "grad_norm": 3.0405898094177246, + "learning_rate": 1.4990990990990993e-05, + "loss": 3.0133, + "mean_token_accuracy": 0.3949645757675171, + "num_tokens": 212234826.0, + "step": 417 + }, + { + "epoch": 0.11303407247160627, + "grad_norm": 1.9939486980438232, + "learning_rate": 1.5027027027027028e-05, + "loss": 3.3196, + "mean_token_accuracy": 0.373066246509552, + "num_tokens": 212759092.0, + "step": 418 + }, + { + "epoch": 0.11330448891292591, + "grad_norm": 2.8445544242858887, + "learning_rate": 1.5063063063063063e-05, + "loss": 2.8831, + "mean_token_accuracy": 0.42289096117019653, + "num_tokens": 213283142.0, + "step": 419 + }, + { + "epoch": 0.11357490535424554, + "grad_norm": 3.251836061477661, + "learning_rate": 1.5099099099099099e-05, + "loss": 3.0982, + "mean_token_accuracy": 0.40916481614112854, + "num_tokens": 213807344.0, + "step": 420 + }, + { + "epoch": 0.11384532179556517, + "grad_norm": 1.3881995677947998, + "learning_rate": 1.5135135135135138e-05, + "loss": 1.1989, + "mean_token_accuracy": 0.6828949451446533, + "num_tokens": 214331509.0, + "step": 421 + }, + { + "epoch": 0.1141157382368848, + "grad_norm": 4.016326427459717, + "learning_rate": 1.5171171171171173e-05, + "loss": 3.24, + "mean_token_accuracy": 0.37244313955307007, + "num_tokens": 214855745.0, + "step": 422 + }, + { + "epoch": 0.11438615467820444, + "grad_norm": 3.974956512451172, + "learning_rate": 1.5207207207207208e-05, + "loss": 3.2892, + "mean_token_accuracy": 0.37301206588745117, + "num_tokens": 215343556.0, + "step": 423 + }, + { + "epoch": 0.11465657111952407, + "grad_norm": 3.2616024017333984, + "learning_rate": 1.5243243243243244e-05, + "loss": 2.8482, + "mean_token_accuracy": 0.42398011684417725, + "num_tokens": 215867804.0, + "step": 424 + }, + { + "epoch": 0.1149269875608437, + "grad_norm": 4.290948390960693, + "learning_rate": 1.527927927927928e-05, + "loss": 3.2727, + "mean_token_accuracy": 0.3860287666320801, + "num_tokens": 216372644.0, + "step": 425 + }, + { + "epoch": 0.11519740400216333, + "grad_norm": 4.770145893096924, + "learning_rate": 1.5315315315315316e-05, + "loss": 3.1171, + "mean_token_accuracy": 0.40880054235458374, + "num_tokens": 216896903.0, + "step": 426 + }, + { + "epoch": 0.11546782044348296, + "grad_norm": 3.164710521697998, + "learning_rate": 1.5351351351351353e-05, + "loss": 3.1293, + "mean_token_accuracy": 0.3968995213508606, + "num_tokens": 217421162.0, + "step": 427 + }, + { + "epoch": 0.1157382368848026, + "grad_norm": 1.8844904899597168, + "learning_rate": 1.538738738738739e-05, + "loss": 2.9982, + "mean_token_accuracy": 0.387157678604126, + "num_tokens": 217945432.0, + "step": 428 + }, + { + "epoch": 0.11600865332612223, + "grad_norm": 2.344466209411621, + "learning_rate": 1.5423423423423424e-05, + "loss": 3.2258, + "mean_token_accuracy": 0.3888000249862671, + "num_tokens": 218469640.0, + "step": 429 + }, + { + "epoch": 0.11627906976744186, + "grad_norm": 2.009641170501709, + "learning_rate": 1.545945945945946e-05, + "loss": 3.215, + "mean_token_accuracy": 0.38758182525634766, + "num_tokens": 218993844.0, + "step": 430 + }, + { + "epoch": 0.11654948620876149, + "grad_norm": 3.90264892578125, + "learning_rate": 1.5495495495495498e-05, + "loss": 2.7783, + "mean_token_accuracy": 0.4538784921169281, + "num_tokens": 219508946.0, + "step": 431 + }, + { + "epoch": 0.11681990265008113, + "grad_norm": 2.504432201385498, + "learning_rate": 1.553153153153153e-05, + "loss": 3.1228, + "mean_token_accuracy": 0.4065558910369873, + "num_tokens": 220033111.0, + "step": 432 + }, + { + "epoch": 0.11709031909140076, + "grad_norm": 2.382951498031616, + "learning_rate": 1.556756756756757e-05, + "loss": 3.0635, + "mean_token_accuracy": 0.4023590087890625, + "num_tokens": 220557003.0, + "step": 433 + }, + { + "epoch": 0.11736073553272039, + "grad_norm": 2.437650680541992, + "learning_rate": 1.5603603603603605e-05, + "loss": 3.0831, + "mean_token_accuracy": 0.4025985598564148, + "num_tokens": 221081261.0, + "step": 434 + }, + { + "epoch": 0.11763115197404002, + "grad_norm": 1.7414510250091553, + "learning_rate": 1.5639639639639642e-05, + "loss": 2.9578, + "mean_token_accuracy": 0.40090903639793396, + "num_tokens": 221605543.0, + "step": 435 + }, + { + "epoch": 0.11790156841535965, + "grad_norm": 1.921467900276184, + "learning_rate": 1.5675675675675676e-05, + "loss": 2.9266, + "mean_token_accuracy": 0.40140336751937866, + "num_tokens": 222086494.0, + "step": 436 + }, + { + "epoch": 0.11817198485667929, + "grad_norm": 2.0038321018218994, + "learning_rate": 1.5711711711711713e-05, + "loss": 3.16, + "mean_token_accuracy": 0.3930995464324951, + "num_tokens": 222610727.0, + "step": 437 + }, + { + "epoch": 0.11844240129799892, + "grad_norm": 2.3062100410461426, + "learning_rate": 1.5747747747747747e-05, + "loss": 3.152, + "mean_token_accuracy": 0.3945178985595703, + "num_tokens": 223130205.0, + "step": 438 + }, + { + "epoch": 0.11871281773931855, + "grad_norm": 2.4526267051696777, + "learning_rate": 1.5783783783783787e-05, + "loss": 2.8291, + "mean_token_accuracy": 0.39582735300064087, + "num_tokens": 223648948.0, + "step": 439 + }, + { + "epoch": 0.11898323418063818, + "grad_norm": 2.512810707092285, + "learning_rate": 1.581981981981982e-05, + "loss": 3.1574, + "mean_token_accuracy": 0.3816109299659729, + "num_tokens": 224173230.0, + "step": 440 + }, + { + "epoch": 0.11925365062195782, + "grad_norm": 1.190622091293335, + "learning_rate": 1.5855855855855858e-05, + "loss": 1.2264, + "mean_token_accuracy": 0.6767313480377197, + "num_tokens": 224697334.0, + "step": 441 + }, + { + "epoch": 0.11952406706327745, + "grad_norm": 3.8787713050842285, + "learning_rate": 1.589189189189189e-05, + "loss": 3.2589, + "mean_token_accuracy": 0.40374457836151123, + "num_tokens": 225221549.0, + "step": 442 + }, + { + "epoch": 0.11979448350459708, + "grad_norm": 3.4857606887817383, + "learning_rate": 1.592792792792793e-05, + "loss": 2.8997, + "mean_token_accuracy": 0.4128066301345825, + "num_tokens": 225745729.0, + "step": 443 + }, + { + "epoch": 0.12006489994591671, + "grad_norm": 3.12178373336792, + "learning_rate": 1.5963963963963966e-05, + "loss": 3.1911, + "mean_token_accuracy": 0.4040737450122833, + "num_tokens": 226252053.0, + "step": 444 + }, + { + "epoch": 0.12033531638723634, + "grad_norm": 2.873185873031616, + "learning_rate": 1.6000000000000003e-05, + "loss": 3.1811, + "mean_token_accuracy": 0.38425248861312866, + "num_tokens": 226729878.0, + "step": 445 + }, + { + "epoch": 0.12060573282855598, + "grad_norm": 3.2927868366241455, + "learning_rate": 1.6036036036036036e-05, + "loss": 3.1246, + "mean_token_accuracy": 0.39374732971191406, + "num_tokens": 227231111.0, + "step": 446 + }, + { + "epoch": 0.12087614926987561, + "grad_norm": 2.539360761642456, + "learning_rate": 1.6072072072072073e-05, + "loss": 3.1543, + "mean_token_accuracy": 0.39928150177001953, + "num_tokens": 227755343.0, + "step": 447 + }, + { + "epoch": 0.12114656571119524, + "grad_norm": 2.4518353939056396, + "learning_rate": 1.610810810810811e-05, + "loss": 3.0726, + "mean_token_accuracy": 0.3933662474155426, + "num_tokens": 228279531.0, + "step": 448 + }, + { + "epoch": 0.12141698215251487, + "grad_norm": 1.9490175247192383, + "learning_rate": 1.6144144144144144e-05, + "loss": 2.8591, + "mean_token_accuracy": 0.4122278094291687, + "num_tokens": 228803612.0, + "step": 449 + }, + { + "epoch": 0.12168739859383451, + "grad_norm": 2.0134425163269043, + "learning_rate": 1.618018018018018e-05, + "loss": 3.0611, + "mean_token_accuracy": 0.39449405670166016, + "num_tokens": 229327883.0, + "step": 450 + }, + { + "epoch": 0.12195781503515414, + "grad_norm": 2.1467528343200684, + "learning_rate": 1.6216216216216218e-05, + "loss": 3.1383, + "mean_token_accuracy": 0.3929635286331177, + "num_tokens": 229852036.0, + "step": 451 + }, + { + "epoch": 0.12222823147647377, + "grad_norm": 2.6754984855651855, + "learning_rate": 1.6252252252252255e-05, + "loss": 3.142, + "mean_token_accuracy": 0.3771977424621582, + "num_tokens": 230376222.0, + "step": 452 + }, + { + "epoch": 0.1224986479177934, + "grad_norm": 2.0657925605773926, + "learning_rate": 1.628828828828829e-05, + "loss": 3.1955, + "mean_token_accuracy": 0.4111741781234741, + "num_tokens": 230861559.0, + "step": 453 + }, + { + "epoch": 0.12276906435911303, + "grad_norm": 2.2204442024230957, + "learning_rate": 1.6324324324324326e-05, + "loss": 2.9122, + "mean_token_accuracy": 0.40790677070617676, + "num_tokens": 231364583.0, + "step": 454 + }, + { + "epoch": 0.12303948080043267, + "grad_norm": 2.0818746089935303, + "learning_rate": 1.636036036036036e-05, + "loss": 3.0945, + "mean_token_accuracy": 0.3790567219257355, + "num_tokens": 231888716.0, + "step": 455 + }, + { + "epoch": 0.1233098972417523, + "grad_norm": 1.7914170026779175, + "learning_rate": 1.6396396396396396e-05, + "loss": 3.1709, + "mean_token_accuracy": 0.3844211995601654, + "num_tokens": 232412817.0, + "step": 456 + }, + { + "epoch": 0.12358031368307193, + "grad_norm": 1.7766048908233643, + "learning_rate": 1.6432432432432434e-05, + "loss": 3.0458, + "mean_token_accuracy": 0.38713496923446655, + "num_tokens": 232937035.0, + "step": 457 + }, + { + "epoch": 0.12385073012439156, + "grad_norm": 1.5833090543746948, + "learning_rate": 1.646846846846847e-05, + "loss": 3.1443, + "mean_token_accuracy": 0.3887878656387329, + "num_tokens": 233461249.0, + "step": 458 + }, + { + "epoch": 0.1241211465657112, + "grad_norm": 1.8160353899002075, + "learning_rate": 1.6504504504504508e-05, + "loss": 2.974, + "mean_token_accuracy": 0.40149426460266113, + "num_tokens": 233985455.0, + "step": 459 + }, + { + "epoch": 0.12439156300703083, + "grad_norm": 2.3155133724212646, + "learning_rate": 1.654054054054054e-05, + "loss": 2.5489, + "mean_token_accuracy": 0.4803757071495056, + "num_tokens": 234509677.0, + "step": 460 + }, + { + "epoch": 0.12466197944835046, + "grad_norm": 1.6618236303329468, + "learning_rate": 1.6576576576576578e-05, + "loss": 1.2379, + "mean_token_accuracy": 0.6631706357002258, + "num_tokens": 235033919.0, + "step": 461 + }, + { + "epoch": 0.12493239588967009, + "grad_norm": 3.0759456157684326, + "learning_rate": 1.6612612612612612e-05, + "loss": 3.1434, + "mean_token_accuracy": 0.3982223868370056, + "num_tokens": 235558124.0, + "step": 462 + }, + { + "epoch": 0.12520281233098973, + "grad_norm": 2.3465380668640137, + "learning_rate": 1.6648648648648652e-05, + "loss": 3.1847, + "mean_token_accuracy": 0.39440232515335083, + "num_tokens": 236082380.0, + "step": 463 + }, + { + "epoch": 0.12547322877230935, + "grad_norm": 2.0319039821624756, + "learning_rate": 1.6684684684684686e-05, + "loss": 3.0905, + "mean_token_accuracy": 0.3897816836833954, + "num_tokens": 236606659.0, + "step": 464 + }, + { + "epoch": 0.125743645213629, + "grad_norm": 2.0098178386688232, + "learning_rate": 1.6720720720720723e-05, + "loss": 3.0089, + "mean_token_accuracy": 0.4050188660621643, + "num_tokens": 237130898.0, + "step": 465 + }, + { + "epoch": 0.12601406165494863, + "grad_norm": 1.9706387519836426, + "learning_rate": 1.6756756756756757e-05, + "loss": 2.9217, + "mean_token_accuracy": 0.4054255485534668, + "num_tokens": 237655037.0, + "step": 466 + }, + { + "epoch": 0.12628447809626825, + "grad_norm": 1.8138891458511353, + "learning_rate": 1.6792792792792794e-05, + "loss": 3.0846, + "mean_token_accuracy": 0.3940916657447815, + "num_tokens": 238179243.0, + "step": 467 + }, + { + "epoch": 0.1265548945375879, + "grad_norm": 2.2346296310424805, + "learning_rate": 1.682882882882883e-05, + "loss": 3.2229, + "mean_token_accuracy": 0.38929638266563416, + "num_tokens": 238703421.0, + "step": 468 + }, + { + "epoch": 0.1268253109789075, + "grad_norm": 2.8957228660583496, + "learning_rate": 1.6864864864864868e-05, + "loss": 3.1183, + "mean_token_accuracy": 0.3989739418029785, + "num_tokens": 239227681.0, + "step": 469 + }, + { + "epoch": 0.12709572742022715, + "grad_norm": 1.8397197723388672, + "learning_rate": 1.69009009009009e-05, + "loss": 3.0935, + "mean_token_accuracy": 0.3825445771217346, + "num_tokens": 239751805.0, + "step": 470 + }, + { + "epoch": 0.1273661438615468, + "grad_norm": 1.9550949335098267, + "learning_rate": 1.693693693693694e-05, + "loss": 3.1237, + "mean_token_accuracy": 0.38787031173706055, + "num_tokens": 240276080.0, + "step": 471 + }, + { + "epoch": 0.1276365603028664, + "grad_norm": 1.5430220365524292, + "learning_rate": 1.6972972972972975e-05, + "loss": 2.896, + "mean_token_accuracy": 0.41291773319244385, + "num_tokens": 240800315.0, + "step": 472 + }, + { + "epoch": 0.12790697674418605, + "grad_norm": 1.8329180479049683, + "learning_rate": 1.700900900900901e-05, + "loss": 2.9669, + "mean_token_accuracy": 0.4034630358219147, + "num_tokens": 241299801.0, + "step": 473 + }, + { + "epoch": 0.12817739318550567, + "grad_norm": 1.7792261838912964, + "learning_rate": 1.7045045045045046e-05, + "loss": 3.1894, + "mean_token_accuracy": 0.39884626865386963, + "num_tokens": 241823961.0, + "step": 474 + }, + { + "epoch": 0.1284478096268253, + "grad_norm": 2.146578788757324, + "learning_rate": 1.7081081081081083e-05, + "loss": 2.9346, + "mean_token_accuracy": 0.41291165351867676, + "num_tokens": 242290617.0, + "step": 475 + }, + { + "epoch": 0.12871822606814495, + "grad_norm": 2.0638160705566406, + "learning_rate": 1.711711711711712e-05, + "loss": 3.0123, + "mean_token_accuracy": 0.41536012291908264, + "num_tokens": 242814772.0, + "step": 476 + }, + { + "epoch": 0.12898864250946457, + "grad_norm": 1.9703158140182495, + "learning_rate": 1.7153153153153154e-05, + "loss": 3.0626, + "mean_token_accuracy": 0.4086377024650574, + "num_tokens": 243281628.0, + "step": 477 + }, + { + "epoch": 0.1292590589507842, + "grad_norm": 2.5056095123291016, + "learning_rate": 1.718918918918919e-05, + "loss": 3.2133, + "mean_token_accuracy": 0.3871099352836609, + "num_tokens": 243805889.0, + "step": 478 + }, + { + "epoch": 0.12952947539210383, + "grad_norm": 2.2647511959075928, + "learning_rate": 1.7225225225225225e-05, + "loss": 3.0599, + "mean_token_accuracy": 0.4050810933113098, + "num_tokens": 244330153.0, + "step": 479 + }, + { + "epoch": 0.12979989183342347, + "grad_norm": 2.0445854663848877, + "learning_rate": 1.726126126126126e-05, + "loss": 3.1886, + "mean_token_accuracy": 0.40210428833961487, + "num_tokens": 244801005.0, + "step": 480 + }, + { + "epoch": 0.1300703082747431, + "grad_norm": 1.9797738790512085, + "learning_rate": 1.72972972972973e-05, + "loss": 1.1523, + "mean_token_accuracy": 0.7008960843086243, + "num_tokens": 245325244.0, + "step": 481 + }, + { + "epoch": 0.13034072471606273, + "grad_norm": 4.374832630157471, + "learning_rate": 1.7333333333333336e-05, + "loss": 3.209, + "mean_token_accuracy": 0.37302884459495544, + "num_tokens": 245849427.0, + "step": 482 + }, + { + "epoch": 0.13061114115738237, + "grad_norm": 2.3612546920776367, + "learning_rate": 1.7369369369369373e-05, + "loss": 3.0875, + "mean_token_accuracy": 0.40799108147621155, + "num_tokens": 246373701.0, + "step": 483 + }, + { + "epoch": 0.13088155759870201, + "grad_norm": 3.0569918155670166, + "learning_rate": 1.7405405405405406e-05, + "loss": 3.3702, + "mean_token_accuracy": 0.38343316316604614, + "num_tokens": 246846816.0, + "step": 484 + }, + { + "epoch": 0.13115197404002163, + "grad_norm": 3.0156137943267822, + "learning_rate": 1.7441441441441443e-05, + "loss": 3.0567, + "mean_token_accuracy": 0.4116263687610626, + "num_tokens": 247346206.0, + "step": 485 + }, + { + "epoch": 0.13142239048134127, + "grad_norm": 2.1560120582580566, + "learning_rate": 1.7477477477477477e-05, + "loss": 2.9099, + "mean_token_accuracy": 0.4189079701900482, + "num_tokens": 247870468.0, + "step": 486 + }, + { + "epoch": 0.1316928069226609, + "grad_norm": 3.209341287612915, + "learning_rate": 1.7513513513513517e-05, + "loss": 2.7933, + "mean_token_accuracy": 0.4357534348964691, + "num_tokens": 248394580.0, + "step": 487 + }, + { + "epoch": 0.13196322336398053, + "grad_norm": 2.1898105144500732, + "learning_rate": 1.754954954954955e-05, + "loss": 2.8477, + "mean_token_accuracy": 0.43627429008483887, + "num_tokens": 248858860.0, + "step": 488 + }, + { + "epoch": 0.13223363980530017, + "grad_norm": 2.1511998176574707, + "learning_rate": 1.7585585585585588e-05, + "loss": 2.9521, + "mean_token_accuracy": 0.3788193464279175, + "num_tokens": 249383113.0, + "step": 489 + }, + { + "epoch": 0.1325040562466198, + "grad_norm": 9.64275074005127, + "learning_rate": 1.7621621621621622e-05, + "loss": 2.6913, + "mean_token_accuracy": 0.4783014953136444, + "num_tokens": 249907228.0, + "step": 490 + }, + { + "epoch": 0.13277447268793943, + "grad_norm": 3.184426784515381, + "learning_rate": 1.765765765765766e-05, + "loss": 3.1806, + "mean_token_accuracy": 0.4025574028491974, + "num_tokens": 250388054.0, + "step": 491 + }, + { + "epoch": 0.13304488912925905, + "grad_norm": 2.105506181716919, + "learning_rate": 1.7693693693693696e-05, + "loss": 3.0131, + "mean_token_accuracy": 0.41029536724090576, + "num_tokens": 250899195.0, + "step": 492 + }, + { + "epoch": 0.1333153055705787, + "grad_norm": 2.4794440269470215, + "learning_rate": 1.7729729729729733e-05, + "loss": 2.9962, + "mean_token_accuracy": 0.40864577889442444, + "num_tokens": 251423327.0, + "step": 493 + }, + { + "epoch": 0.13358572201189833, + "grad_norm": 1.816339373588562, + "learning_rate": 1.7765765765765767e-05, + "loss": 2.9075, + "mean_token_accuracy": 0.44726088643074036, + "num_tokens": 251872966.0, + "step": 494 + }, + { + "epoch": 0.13385613845321795, + "grad_norm": 2.107215404510498, + "learning_rate": 1.7801801801801804e-05, + "loss": 3.0812, + "mean_token_accuracy": 0.39382296800613403, + "num_tokens": 252397228.0, + "step": 495 + }, + { + "epoch": 0.1341265548945376, + "grad_norm": 1.6985359191894531, + "learning_rate": 1.783783783783784e-05, + "loss": 3.1451, + "mean_token_accuracy": 0.40968602895736694, + "num_tokens": 252921338.0, + "step": 496 + }, + { + "epoch": 0.1343969713358572, + "grad_norm": 2.379769802093506, + "learning_rate": 1.7873873873873874e-05, + "loss": 3.118, + "mean_token_accuracy": 0.4013592004776001, + "num_tokens": 253391549.0, + "step": 497 + }, + { + "epoch": 0.13466738777717685, + "grad_norm": 2.480206251144409, + "learning_rate": 1.790990990990991e-05, + "loss": 3.1958, + "mean_token_accuracy": 0.3827289342880249, + "num_tokens": 253915726.0, + "step": 498 + }, + { + "epoch": 0.1349378042184965, + "grad_norm": 1.9026786088943481, + "learning_rate": 1.7945945945945948e-05, + "loss": 2.82, + "mean_token_accuracy": 0.42462849617004395, + "num_tokens": 254382112.0, + "step": 499 + }, + { + "epoch": 0.1352082206598161, + "grad_norm": 2.3271267414093018, + "learning_rate": 1.7981981981981985e-05, + "loss": 3.0052, + "mean_token_accuracy": 0.41235989332199097, + "num_tokens": 254890489.0, + "step": 500 + }, + { + "epoch": 0.13547863710113575, + "grad_norm": 2.694542407989502, + "learning_rate": 1.801801801801802e-05, + "loss": 1.248, + "mean_token_accuracy": 0.6715545654296875, + "num_tokens": 255414709.0, + "step": 501 + }, + { + "epoch": 0.1357490535424554, + "grad_norm": 3.8355722427368164, + "learning_rate": 1.8054054054054056e-05, + "loss": 3.0851, + "mean_token_accuracy": 0.42616814374923706, + "num_tokens": 255905108.0, + "step": 502 + }, + { + "epoch": 0.136019469983775, + "grad_norm": 2.5850062370300293, + "learning_rate": 1.809009009009009e-05, + "loss": 2.9068, + "mean_token_accuracy": 0.4003147482872009, + "num_tokens": 256429347.0, + "step": 503 + }, + { + "epoch": 0.13628988642509465, + "grad_norm": 3.325103282928467, + "learning_rate": 1.8126126126126127e-05, + "loss": 2.9038, + "mean_token_accuracy": 0.4331192970275879, + "num_tokens": 256859421.0, + "step": 504 + }, + { + "epoch": 0.13656030286641427, + "grad_norm": 3.317488431930542, + "learning_rate": 1.8162162162162164e-05, + "loss": 3.1035, + "mean_token_accuracy": 0.39965522289276123, + "num_tokens": 257383698.0, + "step": 505 + }, + { + "epoch": 0.1368307193077339, + "grad_norm": 2.695826292037964, + "learning_rate": 1.81981981981982e-05, + "loss": 2.9473, + "mean_token_accuracy": 0.4101276397705078, + "num_tokens": 257907977.0, + "step": 506 + }, + { + "epoch": 0.13710113574905355, + "grad_norm": 2.7007973194122314, + "learning_rate": 1.8234234234234234e-05, + "loss": 2.9303, + "mean_token_accuracy": 0.408380389213562, + "num_tokens": 258422083.0, + "step": 507 + }, + { + "epoch": 0.13737155219037317, + "grad_norm": 1.9301501512527466, + "learning_rate": 1.827027027027027e-05, + "loss": 3.1704, + "mean_token_accuracy": 0.3928096294403076, + "num_tokens": 258946344.0, + "step": 508 + }, + { + "epoch": 0.1376419686316928, + "grad_norm": 2.1362462043762207, + "learning_rate": 1.830630630630631e-05, + "loss": 2.8884, + "mean_token_accuracy": 0.43547672033309937, + "num_tokens": 259470568.0, + "step": 509 + }, + { + "epoch": 0.13791238507301243, + "grad_norm": 2.0937254428863525, + "learning_rate": 1.8342342342342342e-05, + "loss": 2.8295, + "mean_token_accuracy": 0.4455909729003906, + "num_tokens": 259932412.0, + "step": 510 + }, + { + "epoch": 0.13818280151433207, + "grad_norm": 2.4872488975524902, + "learning_rate": 1.8378378378378383e-05, + "loss": 3.0868, + "mean_token_accuracy": 0.419521689414978, + "num_tokens": 260417352.0, + "step": 511 + }, + { + "epoch": 0.13845321795565171, + "grad_norm": 1.819952130317688, + "learning_rate": 1.8414414414414416e-05, + "loss": 3.1684, + "mean_token_accuracy": 0.40955156087875366, + "num_tokens": 260904017.0, + "step": 512 + }, + { + "epoch": 0.13872363439697133, + "grad_norm": 1.9957629442214966, + "learning_rate": 1.8450450450450453e-05, + "loss": 3.0911, + "mean_token_accuracy": 0.4195464849472046, + "num_tokens": 261381824.0, + "step": 513 + }, + { + "epoch": 0.13899405083829097, + "grad_norm": 2.040964126586914, + "learning_rate": 1.8486486486486487e-05, + "loss": 3.0062, + "mean_token_accuracy": 0.4068336486816406, + "num_tokens": 261906012.0, + "step": 514 + }, + { + "epoch": 0.1392644672796106, + "grad_norm": 1.8516217470169067, + "learning_rate": 1.8522522522522524e-05, + "loss": 3.0702, + "mean_token_accuracy": 0.40284138917922974, + "num_tokens": 262430275.0, + "step": 515 + }, + { + "epoch": 0.13953488372093023, + "grad_norm": 2.249633550643921, + "learning_rate": 1.855855855855856e-05, + "loss": 2.9324, + "mean_token_accuracy": 0.41856223344802856, + "num_tokens": 262910007.0, + "step": 516 + }, + { + "epoch": 0.13980530016224987, + "grad_norm": 1.8178739547729492, + "learning_rate": 1.8594594594594598e-05, + "loss": 3.099, + "mean_token_accuracy": 0.42647814750671387, + "num_tokens": 263370918.0, + "step": 517 + }, + { + "epoch": 0.1400757166035695, + "grad_norm": 2.1766607761383057, + "learning_rate": 1.863063063063063e-05, + "loss": 3.0396, + "mean_token_accuracy": 0.41890665888786316, + "num_tokens": 263875838.0, + "step": 518 + }, + { + "epoch": 0.14034613304488913, + "grad_norm": 2.0028624534606934, + "learning_rate": 1.866666666666667e-05, + "loss": 2.9143, + "mean_token_accuracy": 0.40717950463294983, + "num_tokens": 264399912.0, + "step": 519 + }, + { + "epoch": 0.14061654948620878, + "grad_norm": 2.4203670024871826, + "learning_rate": 1.8702702702702706e-05, + "loss": 2.9212, + "mean_token_accuracy": 0.42496296763420105, + "num_tokens": 264907507.0, + "step": 520 + }, + { + "epoch": 0.1408869659275284, + "grad_norm": 1.8575886487960815, + "learning_rate": 1.873873873873874e-05, + "loss": 1.2502, + "mean_token_accuracy": 0.6716299057006836, + "num_tokens": 265419188.0, + "step": 521 + }, + { + "epoch": 0.14115738236884803, + "grad_norm": 2.2455923557281494, + "learning_rate": 1.8774774774774776e-05, + "loss": 3.0363, + "mean_token_accuracy": 0.3995438814163208, + "num_tokens": 265943434.0, + "step": 522 + }, + { + "epoch": 0.14142779881016765, + "grad_norm": 1.727116584777832, + "learning_rate": 1.8810810810810813e-05, + "loss": 3.0481, + "mean_token_accuracy": 0.4055332541465759, + "num_tokens": 266467589.0, + "step": 523 + }, + { + "epoch": 0.1416982152514873, + "grad_norm": 1.716110110282898, + "learning_rate": 1.884684684684685e-05, + "loss": 2.8957, + "mean_token_accuracy": 0.39786919951438904, + "num_tokens": 266991630.0, + "step": 524 + }, + { + "epoch": 0.14196863169280693, + "grad_norm": 1.8287043571472168, + "learning_rate": 1.8882882882882884e-05, + "loss": 2.6642, + "mean_token_accuracy": 0.4286285936832428, + "num_tokens": 267515863.0, + "step": 525 + }, + { + "epoch": 0.14223904813412655, + "grad_norm": 2.379183530807495, + "learning_rate": 1.891891891891892e-05, + "loss": 3.0795, + "mean_token_accuracy": 0.4134528934955597, + "num_tokens": 268021312.0, + "step": 526 + }, + { + "epoch": 0.1425094645754462, + "grad_norm": 2.106926679611206, + "learning_rate": 1.8954954954954955e-05, + "loss": 2.8937, + "mean_token_accuracy": 0.4148309826850891, + "num_tokens": 268545439.0, + "step": 527 + }, + { + "epoch": 0.1427798810167658, + "grad_norm": 1.9769207239151, + "learning_rate": 1.8990990990990992e-05, + "loss": 2.7319, + "mean_token_accuracy": 0.42494505643844604, + "num_tokens": 269069492.0, + "step": 528 + }, + { + "epoch": 0.14305029745808545, + "grad_norm": 2.06563663482666, + "learning_rate": 1.902702702702703e-05, + "loss": 2.9354, + "mean_token_accuracy": 0.41220158338546753, + "num_tokens": 269593671.0, + "step": 529 + }, + { + "epoch": 0.1433207138994051, + "grad_norm": 1.9536232948303223, + "learning_rate": 1.9063063063063066e-05, + "loss": 2.8977, + "mean_token_accuracy": 0.42362216114997864, + "num_tokens": 270117891.0, + "step": 530 + }, + { + "epoch": 0.1435911303407247, + "grad_norm": 2.1370294094085693, + "learning_rate": 1.90990990990991e-05, + "loss": 2.6937, + "mean_token_accuracy": 0.4330795407295227, + "num_tokens": 270620055.0, + "step": 531 + }, + { + "epoch": 0.14386154678204435, + "grad_norm": 2.078812837600708, + "learning_rate": 1.9135135135135137e-05, + "loss": 2.8412, + "mean_token_accuracy": 0.4184548854827881, + "num_tokens": 271144309.0, + "step": 532 + }, + { + "epoch": 0.14413196322336397, + "grad_norm": 2.117065906524658, + "learning_rate": 1.9171171171171174e-05, + "loss": 3.1373, + "mean_token_accuracy": 0.4574318528175354, + "num_tokens": 271503735.0, + "step": 533 + }, + { + "epoch": 0.1444023796646836, + "grad_norm": 2.570042133331299, + "learning_rate": 1.9207207207207207e-05, + "loss": 3.2608, + "mean_token_accuracy": 0.38610342144966125, + "num_tokens": 272027934.0, + "step": 534 + }, + { + "epoch": 0.14467279610600325, + "grad_norm": 2.108776330947876, + "learning_rate": 1.9243243243243244e-05, + "loss": 2.9828, + "mean_token_accuracy": 0.41589975357055664, + "num_tokens": 272552163.0, + "step": 535 + }, + { + "epoch": 0.14494321254732287, + "grad_norm": 2.057655096054077, + "learning_rate": 1.927927927927928e-05, + "loss": 2.9946, + "mean_token_accuracy": 0.40136122703552246, + "num_tokens": 273076416.0, + "step": 536 + }, + { + "epoch": 0.1452136289886425, + "grad_norm": 2.2632861137390137, + "learning_rate": 1.931531531531532e-05, + "loss": 2.8658, + "mean_token_accuracy": 0.42682626843452454, + "num_tokens": 273572626.0, + "step": 537 + }, + { + "epoch": 0.14548404542996216, + "grad_norm": 1.9842427968978882, + "learning_rate": 1.9351351351351352e-05, + "loss": 2.9631, + "mean_token_accuracy": 0.41168832778930664, + "num_tokens": 274091281.0, + "step": 538 + }, + { + "epoch": 0.14575446187128177, + "grad_norm": 2.2879230976104736, + "learning_rate": 1.938738738738739e-05, + "loss": 2.8699, + "mean_token_accuracy": 0.43360549211502075, + "num_tokens": 274615512.0, + "step": 539 + }, + { + "epoch": 0.1460248783126014, + "grad_norm": 2.457227945327759, + "learning_rate": 1.9423423423423423e-05, + "loss": 2.8358, + "mean_token_accuracy": 0.4290093183517456, + "num_tokens": 275139760.0, + "step": 540 + }, + { + "epoch": 0.14629529475392103, + "grad_norm": 1.6855483055114746, + "learning_rate": 1.9459459459459463e-05, + "loss": 1.3083, + "mean_token_accuracy": 0.6673076748847961, + "num_tokens": 275658369.0, + "step": 541 + }, + { + "epoch": 0.14656571119524067, + "grad_norm": 3.883800983428955, + "learning_rate": 1.9495495495495497e-05, + "loss": 3.226, + "mean_token_accuracy": 0.3901311755180359, + "num_tokens": 276182628.0, + "step": 542 + }, + { + "epoch": 0.14683612763656032, + "grad_norm": 2.528411626815796, + "learning_rate": 1.9531531531531534e-05, + "loss": 3.1422, + "mean_token_accuracy": 0.381341814994812, + "num_tokens": 276706818.0, + "step": 543 + }, + { + "epoch": 0.14710654407787993, + "grad_norm": 3.092188596725464, + "learning_rate": 1.956756756756757e-05, + "loss": 3.1148, + "mean_token_accuracy": 0.40416228771209717, + "num_tokens": 277231047.0, + "step": 544 + }, + { + "epoch": 0.14737696051919957, + "grad_norm": 2.3058722019195557, + "learning_rate": 1.9603603603603604e-05, + "loss": 2.6677, + "mean_token_accuracy": 0.4588029384613037, + "num_tokens": 277691530.0, + "step": 545 + }, + { + "epoch": 0.1476473769605192, + "grad_norm": 4.016175270080566, + "learning_rate": 1.963963963963964e-05, + "loss": 2.8584, + "mean_token_accuracy": 0.42658329010009766, + "num_tokens": 278215808.0, + "step": 546 + }, + { + "epoch": 0.14791779340183883, + "grad_norm": 2.1656205654144287, + "learning_rate": 1.967567567567568e-05, + "loss": 3.0409, + "mean_token_accuracy": 0.4221344590187073, + "num_tokens": 278740055.0, + "step": 547 + }, + { + "epoch": 0.14818820984315847, + "grad_norm": 2.6139447689056396, + "learning_rate": 1.9711711711711716e-05, + "loss": 2.993, + "mean_token_accuracy": 0.3972267508506775, + "num_tokens": 279264327.0, + "step": 548 + }, + { + "epoch": 0.1484586262844781, + "grad_norm": 2.5929338932037354, + "learning_rate": 1.974774774774775e-05, + "loss": 2.8719, + "mean_token_accuracy": 0.42512738704681396, + "num_tokens": 279788468.0, + "step": 549 + }, + { + "epoch": 0.14872904272579773, + "grad_norm": 1.8990387916564941, + "learning_rate": 1.9783783783783786e-05, + "loss": 2.8496, + "mean_token_accuracy": 0.41725659370422363, + "num_tokens": 280312675.0, + "step": 550 + }, + { + "epoch": 0.14899945916711735, + "grad_norm": 2.880263328552246, + "learning_rate": 1.981981981981982e-05, + "loss": 2.7967, + "mean_token_accuracy": 0.43701422214508057, + "num_tokens": 280784705.0, + "step": 551 + }, + { + "epoch": 0.149269875608437, + "grad_norm": 2.7469639778137207, + "learning_rate": 1.9855855855855857e-05, + "loss": 2.8203, + "mean_token_accuracy": 0.42689990997314453, + "num_tokens": 281308923.0, + "step": 552 + }, + { + "epoch": 0.14954029204975663, + "grad_norm": 2.1809680461883545, + "learning_rate": 1.9891891891891894e-05, + "loss": 2.9573, + "mean_token_accuracy": 0.41846251487731934, + "num_tokens": 281833107.0, + "step": 553 + }, + { + "epoch": 0.14981070849107625, + "grad_norm": 2.2467761039733887, + "learning_rate": 1.992792792792793e-05, + "loss": 2.9273, + "mean_token_accuracy": 0.4170498549938202, + "num_tokens": 282357377.0, + "step": 554 + }, + { + "epoch": 0.1500811249323959, + "grad_norm": 1.889521598815918, + "learning_rate": 1.9963963963963965e-05, + "loss": 2.9462, + "mean_token_accuracy": 0.43104344606399536, + "num_tokens": 282813511.0, + "step": 555 + }, + { + "epoch": 0.1503515413737155, + "grad_norm": 2.293567180633545, + "learning_rate": 2e-05, + "loss": 3.0231, + "mean_token_accuracy": 0.4164881110191345, + "num_tokens": 283337769.0, + "step": 556 + }, + { + "epoch": 0.15062195781503515, + "grad_norm": 2.2132375240325928, + "learning_rate": 1.999999986192677e-05, + "loss": 2.7193, + "mean_token_accuracy": 0.41060394048690796, + "num_tokens": 283861940.0, + "step": 557 + }, + { + "epoch": 0.1508923742563548, + "grad_norm": 3.8097381591796875, + "learning_rate": 1.999999944770707e-05, + "loss": 2.8935, + "mean_token_accuracy": 0.4276787340641022, + "num_tokens": 284386105.0, + "step": 558 + }, + { + "epoch": 0.1511627906976744, + "grad_norm": 2.510437488555908, + "learning_rate": 1.9999998757340923e-05, + "loss": 2.9472, + "mean_token_accuracy": 0.4441462755203247, + "num_tokens": 284831870.0, + "step": 559 + }, + { + "epoch": 0.15143320713899405, + "grad_norm": 2.3518385887145996, + "learning_rate": 1.9999997790828347e-05, + "loss": 2.8999, + "mean_token_accuracy": 0.4096308946609497, + "num_tokens": 285317932.0, + "step": 560 + }, + { + "epoch": 0.1517036235803137, + "grad_norm": 1.186809778213501, + "learning_rate": 1.999999654816937e-05, + "loss": 1.3008, + "mean_token_accuracy": 0.6683101654052734, + "num_tokens": 285842018.0, + "step": 561 + }, + { + "epoch": 0.1519740400216333, + "grad_norm": 3.726809024810791, + "learning_rate": 1.9999995029364035e-05, + "loss": 2.56, + "mean_token_accuracy": 0.43605560064315796, + "num_tokens": 286366205.0, + "step": 562 + }, + { + "epoch": 0.15224445646295295, + "grad_norm": 3.6043546199798584, + "learning_rate": 1.9999993234412383e-05, + "loss": 2.9489, + "mean_token_accuracy": 0.4204670786857605, + "num_tokens": 286890419.0, + "step": 563 + }, + { + "epoch": 0.15251487290427257, + "grad_norm": 4.4317626953125, + "learning_rate": 1.999999116331447e-05, + "loss": 2.9599, + "mean_token_accuracy": 0.4358893036842346, + "num_tokens": 287414532.0, + "step": 564 + }, + { + "epoch": 0.1527852893455922, + "grad_norm": 6.042898178100586, + "learning_rate": 1.9999988816070365e-05, + "loss": 2.7826, + "mean_token_accuracy": 0.4330078959465027, + "num_tokens": 287938811.0, + "step": 565 + }, + { + "epoch": 0.15305570578691186, + "grad_norm": 3.232499361038208, + "learning_rate": 1.9999986192680134e-05, + "loss": 2.9718, + "mean_token_accuracy": 0.42520928382873535, + "num_tokens": 288413565.0, + "step": 566 + }, + { + "epoch": 0.15332612222823147, + "grad_norm": 3.1103131771087646, + "learning_rate": 1.9999983293143858e-05, + "loss": 2.9406, + "mean_token_accuracy": 0.4020698070526123, + "num_tokens": 288937781.0, + "step": 567 + }, + { + "epoch": 0.1535965386695511, + "grad_norm": 6.574828147888184, + "learning_rate": 1.9999980117461627e-05, + "loss": 3.0405, + "mean_token_accuracy": 0.4193100333213806, + "num_tokens": 289462055.0, + "step": 568 + }, + { + "epoch": 0.15386695511087073, + "grad_norm": 3.2718257904052734, + "learning_rate": 1.999997666563354e-05, + "loss": 2.9621, + "mean_token_accuracy": 0.46130096912384033, + "num_tokens": 289920919.0, + "step": 569 + }, + { + "epoch": 0.15413737155219037, + "grad_norm": 2.7922604084014893, + "learning_rate": 1.99999729376597e-05, + "loss": 3.0149, + "mean_token_accuracy": 0.41151517629623413, + "num_tokens": 290445163.0, + "step": 570 + }, + { + "epoch": 0.15440778799351002, + "grad_norm": 6.191995143890381, + "learning_rate": 1.9999968933540226e-05, + "loss": 3.1845, + "mean_token_accuracy": 0.38418644666671753, + "num_tokens": 290969354.0, + "step": 571 + }, + { + "epoch": 0.15467820443482963, + "grad_norm": 3.651679754257202, + "learning_rate": 1.9999964653275238e-05, + "loss": 2.9062, + "mean_token_accuracy": 0.43166911602020264, + "num_tokens": 291470750.0, + "step": 572 + }, + { + "epoch": 0.15494862087614927, + "grad_norm": 3.381164073944092, + "learning_rate": 1.9999960096864864e-05, + "loss": 2.8556, + "mean_token_accuracy": 0.4273921251296997, + "num_tokens": 291994974.0, + "step": 573 + }, + { + "epoch": 0.1552190373174689, + "grad_norm": 4.831196308135986, + "learning_rate": 1.999995526430925e-05, + "loss": 2.4872, + "mean_token_accuracy": 0.47985002398490906, + "num_tokens": 292519240.0, + "step": 574 + }, + { + "epoch": 0.15548945375878853, + "grad_norm": 3.660109281539917, + "learning_rate": 1.999995015560854e-05, + "loss": 2.8208, + "mean_token_accuracy": 0.46663233637809753, + "num_tokens": 292977941.0, + "step": 575 + }, + { + "epoch": 0.15575987020010817, + "grad_norm": 34.52461242675781, + "learning_rate": 1.999994477076289e-05, + "loss": 2.7417, + "mean_token_accuracy": 0.46074384450912476, + "num_tokens": 293502149.0, + "step": 576 + }, + { + "epoch": 0.1560302866414278, + "grad_norm": 7.017449855804443, + "learning_rate": 1.999993910977247e-05, + "loss": 2.9543, + "mean_token_accuracy": 0.4142402410507202, + "num_tokens": 294026287.0, + "step": 577 + }, + { + "epoch": 0.15630070308274743, + "grad_norm": 6.343939304351807, + "learning_rate": 1.9999933172637453e-05, + "loss": 3.0697, + "mean_token_accuracy": 0.42541369795799255, + "num_tokens": 294548396.0, + "step": 578 + }, + { + "epoch": 0.15657111952406708, + "grad_norm": 3.85422682762146, + "learning_rate": 1.9999926959358016e-05, + "loss": 2.6724, + "mean_token_accuracy": 0.444202721118927, + "num_tokens": 295072480.0, + "step": 579 + }, + { + "epoch": 0.1568415359653867, + "grad_norm": 2.08271861076355, + "learning_rate": 1.9999920469934353e-05, + "loss": 2.9498, + "mean_token_accuracy": 0.4052678644657135, + "num_tokens": 295596759.0, + "step": 580 + }, + { + "epoch": 0.15711195240670633, + "grad_norm": 2.0245792865753174, + "learning_rate": 1.9999913704366663e-05, + "loss": 1.2277, + "mean_token_accuracy": 0.6742420792579651, + "num_tokens": 296121032.0, + "step": 581 + }, + { + "epoch": 0.15738236884802595, + "grad_norm": 6.160236835479736, + "learning_rate": 1.9999906662655157e-05, + "loss": 3.115, + "mean_token_accuracy": 0.3990286588668823, + "num_tokens": 296645181.0, + "step": 582 + }, + { + "epoch": 0.1576527852893456, + "grad_norm": 5.309808731079102, + "learning_rate": 1.9999899344800044e-05, + "loss": 2.7352, + "mean_token_accuracy": 0.4312346577644348, + "num_tokens": 297169311.0, + "step": 583 + }, + { + "epoch": 0.15792320173066524, + "grad_norm": 2.0257277488708496, + "learning_rate": 1.999989175080155e-05, + "loss": 2.6427, + "mean_token_accuracy": 0.4423627257347107, + "num_tokens": 297693554.0, + "step": 584 + }, + { + "epoch": 0.15819361817198485, + "grad_norm": 38.42069625854492, + "learning_rate": 1.9999883880659913e-05, + "loss": 3.1337, + "mean_token_accuracy": 0.3871457874774933, + "num_tokens": 298217834.0, + "step": 585 + }, + { + "epoch": 0.1584640346133045, + "grad_norm": 6.811807632446289, + "learning_rate": 1.9999875734375373e-05, + "loss": 3.0718, + "mean_token_accuracy": 0.40787240862846375, + "num_tokens": 298742011.0, + "step": 586 + }, + { + "epoch": 0.1587344510546241, + "grad_norm": 7.620343208312988, + "learning_rate": 1.9999867311948175e-05, + "loss": 2.9361, + "mean_token_accuracy": 0.4423830807209015, + "num_tokens": 299266232.0, + "step": 587 + }, + { + "epoch": 0.15900486749594375, + "grad_norm": 4.3389811515808105, + "learning_rate": 1.9999858613378583e-05, + "loss": 2.8781, + "mean_token_accuracy": 0.4165979027748108, + "num_tokens": 299744693.0, + "step": 588 + }, + { + "epoch": 0.1592752839372634, + "grad_norm": 3.799171209335327, + "learning_rate": 1.9999849638666866e-05, + "loss": 2.8386, + "mean_token_accuracy": 0.4195820391178131, + "num_tokens": 300268819.0, + "step": 589 + }, + { + "epoch": 0.159545700378583, + "grad_norm": 4.031172275543213, + "learning_rate": 1.999984038781329e-05, + "loss": 3.0843, + "mean_token_accuracy": 0.4025548994541168, + "num_tokens": 300793018.0, + "step": 590 + }, + { + "epoch": 0.15981611681990265, + "grad_norm": 4.0121564865112305, + "learning_rate": 1.9999830860818144e-05, + "loss": 2.8258, + "mean_token_accuracy": 0.4304823577404022, + "num_tokens": 301317222.0, + "step": 591 + }, + { + "epoch": 0.16008653326122227, + "grad_norm": 3.2784416675567627, + "learning_rate": 1.9999821057681723e-05, + "loss": 2.9505, + "mean_token_accuracy": 0.4183279573917389, + "num_tokens": 301841500.0, + "step": 592 + }, + { + "epoch": 0.1603569497025419, + "grad_norm": 2.9131345748901367, + "learning_rate": 1.999981097840432e-05, + "loss": 2.8994, + "mean_token_accuracy": 0.4254991114139557, + "num_tokens": 302343960.0, + "step": 593 + }, + { + "epoch": 0.16062736614386156, + "grad_norm": 3.214965343475342, + "learning_rate": 1.9999800622986256e-05, + "loss": 3.0428, + "mean_token_accuracy": 0.4048927426338196, + "num_tokens": 302868193.0, + "step": 594 + }, + { + "epoch": 0.16089778258518117, + "grad_norm": 2.713487148284912, + "learning_rate": 1.999978999142784e-05, + "loss": 3.0316, + "mean_token_accuracy": 0.40151143074035645, + "num_tokens": 303392446.0, + "step": 595 + }, + { + "epoch": 0.1611681990265008, + "grad_norm": 2.7654731273651123, + "learning_rate": 1.99997790837294e-05, + "loss": 2.9532, + "mean_token_accuracy": 0.4292231798171997, + "num_tokens": 303883872.0, + "step": 596 + }, + { + "epoch": 0.16143861546782046, + "grad_norm": 2.856205701828003, + "learning_rate": 1.999976789989127e-05, + "loss": 3.1004, + "mean_token_accuracy": 0.3917355537414551, + "num_tokens": 304408139.0, + "step": 597 + }, + { + "epoch": 0.16170903190914007, + "grad_norm": 1.8613473176956177, + "learning_rate": 1.9999756439913796e-05, + "loss": 2.8322, + "mean_token_accuracy": 0.4193456172943115, + "num_tokens": 304932276.0, + "step": 598 + }, + { + "epoch": 0.16197944835045971, + "grad_norm": 1.8954004049301147, + "learning_rate": 1.999974470379733e-05, + "loss": 2.7531, + "mean_token_accuracy": 0.4223683178424835, + "num_tokens": 305456512.0, + "step": 599 + }, + { + "epoch": 0.16224986479177933, + "grad_norm": 2.279942274093628, + "learning_rate": 1.9999732691542223e-05, + "loss": 2.9066, + "mean_token_accuracy": 0.42175373435020447, + "num_tokens": 305980647.0, + "step": 600 + }, + { + "epoch": 0.16252028123309897, + "grad_norm": 1.3189401626586914, + "learning_rate": 1.9999720403148857e-05, + "loss": 1.2844, + "mean_token_accuracy": 0.67878258228302, + "num_tokens": 306472792.0, + "step": 601 + }, + { + "epoch": 0.16279069767441862, + "grad_norm": 3.3675670623779297, + "learning_rate": 1.99997078386176e-05, + "loss": 2.8626, + "mean_token_accuracy": 0.4401630759239197, + "num_tokens": 306987644.0, + "step": 602 + }, + { + "epoch": 0.16306111411573823, + "grad_norm": 2.860612630844116, + "learning_rate": 1.999969499794884e-05, + "loss": 3.0114, + "mean_token_accuracy": 0.4189320504665375, + "num_tokens": 307470459.0, + "step": 603 + }, + { + "epoch": 0.16333153055705787, + "grad_norm": 2.879396438598633, + "learning_rate": 1.9999681881142973e-05, + "loss": 2.8819, + "mean_token_accuracy": 0.4266401529312134, + "num_tokens": 307994734.0, + "step": 604 + }, + { + "epoch": 0.1636019469983775, + "grad_norm": 2.7917492389678955, + "learning_rate": 1.9999668488200395e-05, + "loss": 3.0066, + "mean_token_accuracy": 0.3969046473503113, + "num_tokens": 308518849.0, + "step": 605 + }, + { + "epoch": 0.16387236343969713, + "grad_norm": 2.086880683898926, + "learning_rate": 1.9999654819121526e-05, + "loss": 3.0273, + "mean_token_accuracy": 0.41355764865875244, + "num_tokens": 309019489.0, + "step": 606 + }, + { + "epoch": 0.16414277988101678, + "grad_norm": 2.4037296772003174, + "learning_rate": 1.999964087390678e-05, + "loss": 2.9013, + "mean_token_accuracy": 0.4354769289493561, + "num_tokens": 309543647.0, + "step": 607 + }, + { + "epoch": 0.1644131963223364, + "grad_norm": 1.9461089372634888, + "learning_rate": 1.9999626652556587e-05, + "loss": 2.7635, + "mean_token_accuracy": 0.440741628408432, + "num_tokens": 310067771.0, + "step": 608 + }, + { + "epoch": 0.16468361276365603, + "grad_norm": 2.2379791736602783, + "learning_rate": 1.9999612155071377e-05, + "loss": 2.9267, + "mean_token_accuracy": 0.42923861742019653, + "num_tokens": 310591951.0, + "step": 609 + }, + { + "epoch": 0.16495402920497565, + "grad_norm": 1.8264479637145996, + "learning_rate": 1.9999597381451604e-05, + "loss": 2.7869, + "mean_token_accuracy": 0.4223497211933136, + "num_tokens": 311116211.0, + "step": 610 + }, + { + "epoch": 0.1652244456462953, + "grad_norm": 1.5239406824111938, + "learning_rate": 1.9999582331697714e-05, + "loss": 2.7084, + "mean_token_accuracy": 0.4313236474990845, + "num_tokens": 311640453.0, + "step": 611 + }, + { + "epoch": 0.16549486208761494, + "grad_norm": 1.6094118356704712, + "learning_rate": 1.9999567005810176e-05, + "loss": 2.8225, + "mean_token_accuracy": 0.4186461269855499, + "num_tokens": 312164673.0, + "step": 612 + }, + { + "epoch": 0.16576527852893455, + "grad_norm": 1.817280650138855, + "learning_rate": 1.9999551403789456e-05, + "loss": 2.9646, + "mean_token_accuracy": 0.42419567704200745, + "num_tokens": 312688946.0, + "step": 613 + }, + { + "epoch": 0.1660356949702542, + "grad_norm": 1.9035918712615967, + "learning_rate": 1.9999535525636033e-05, + "loss": 2.8211, + "mean_token_accuracy": 0.43143969774246216, + "num_tokens": 313184276.0, + "step": 614 + }, + { + "epoch": 0.16630611141157384, + "grad_norm": 1.8089160919189453, + "learning_rate": 1.999951937135039e-05, + "loss": 2.8325, + "mean_token_accuracy": 0.43186715245246887, + "num_tokens": 313708456.0, + "step": 615 + }, + { + "epoch": 0.16657652785289345, + "grad_norm": 2.1505489349365234, + "learning_rate": 1.999950294093303e-05, + "loss": 2.7518, + "mean_token_accuracy": 0.4346056282520294, + "num_tokens": 314179848.0, + "step": 616 + }, + { + "epoch": 0.1668469442942131, + "grad_norm": 2.268474817276001, + "learning_rate": 1.9999486234384454e-05, + "loss": 2.505, + "mean_token_accuracy": 0.45644861459732056, + "num_tokens": 314704056.0, + "step": 617 + }, + { + "epoch": 0.1671173607355327, + "grad_norm": 2.178734064102173, + "learning_rate": 1.9999469251705174e-05, + "loss": 2.8674, + "mean_token_accuracy": 0.43623340129852295, + "num_tokens": 315228332.0, + "step": 618 + }, + { + "epoch": 0.16738777717685235, + "grad_norm": 1.9100086688995361, + "learning_rate": 1.999945199289571e-05, + "loss": 2.6601, + "mean_token_accuracy": 0.4541923403739929, + "num_tokens": 315752553.0, + "step": 619 + }, + { + "epoch": 0.167658193618172, + "grad_norm": 2.0952422618865967, + "learning_rate": 1.9999434457956596e-05, + "loss": 2.7243, + "mean_token_accuracy": 0.45880675315856934, + "num_tokens": 316276830.0, + "step": 620 + }, + { + "epoch": 0.1679286100594916, + "grad_norm": 1.971539855003357, + "learning_rate": 1.9999416646888366e-05, + "loss": 1.32, + "mean_token_accuracy": 0.6718019247055054, + "num_tokens": 316746264.0, + "step": 621 + }, + { + "epoch": 0.16819902650081126, + "grad_norm": 3.1962289810180664, + "learning_rate": 1.999939855969157e-05, + "loss": 2.7486, + "mean_token_accuracy": 0.44329339265823364, + "num_tokens": 317214252.0, + "step": 622 + }, + { + "epoch": 0.16846944294213087, + "grad_norm": 2.6468985080718994, + "learning_rate": 1.9999380196366756e-05, + "loss": 3.1123, + "mean_token_accuracy": 0.40764182806015015, + "num_tokens": 317738525.0, + "step": 623 + }, + { + "epoch": 0.1687398593834505, + "grad_norm": 2.265068531036377, + "learning_rate": 1.9999361556914497e-05, + "loss": 2.7986, + "mean_token_accuracy": 0.4175000786781311, + "num_tokens": 318262724.0, + "step": 624 + }, + { + "epoch": 0.16901027582477016, + "grad_norm": 2.0014488697052, + "learning_rate": 1.9999342641335355e-05, + "loss": 2.9013, + "mean_token_accuracy": 0.42654019594192505, + "num_tokens": 318786966.0, + "step": 625 + }, + { + "epoch": 0.16928069226608977, + "grad_norm": 2.329498052597046, + "learning_rate": 1.999932344962992e-05, + "loss": 2.9162, + "mean_token_accuracy": 0.42204549908638, + "num_tokens": 319311180.0, + "step": 626 + }, + { + "epoch": 0.16955110870740941, + "grad_norm": 2.7711117267608643, + "learning_rate": 1.9999303981798778e-05, + "loss": 2.6512, + "mean_token_accuracy": 0.4531976580619812, + "num_tokens": 319835437.0, + "step": 627 + }, + { + "epoch": 0.16982152514872903, + "grad_norm": 2.116208791732788, + "learning_rate": 1.9999284237842518e-05, + "loss": 2.9129, + "mean_token_accuracy": 0.42868685722351074, + "num_tokens": 320341777.0, + "step": 628 + }, + { + "epoch": 0.17009194159004867, + "grad_norm": 2.635910749435425, + "learning_rate": 1.9999264217761757e-05, + "loss": 2.9106, + "mean_token_accuracy": 0.43675708770751953, + "num_tokens": 320834168.0, + "step": 629 + }, + { + "epoch": 0.17036235803136832, + "grad_norm": 2.071741819381714, + "learning_rate": 1.9999243921557102e-05, + "loss": 2.914, + "mean_token_accuracy": 0.41765841841697693, + "num_tokens": 321358211.0, + "step": 630 + }, + { + "epoch": 0.17063277447268793, + "grad_norm": 2.4533498287200928, + "learning_rate": 1.9999223349229176e-05, + "loss": 2.5399, + "mean_token_accuracy": 0.4522894620895386, + "num_tokens": 321882409.0, + "step": 631 + }, + { + "epoch": 0.17090319091400757, + "grad_norm": 2.501744031906128, + "learning_rate": 1.999920250077862e-05, + "loss": 2.8638, + "mean_token_accuracy": 0.4129284620285034, + "num_tokens": 322406684.0, + "step": 632 + }, + { + "epoch": 0.17117360735532722, + "grad_norm": 2.1019797325134277, + "learning_rate": 1.999918137620606e-05, + "loss": 2.9533, + "mean_token_accuracy": 0.4260921776294708, + "num_tokens": 322930801.0, + "step": 633 + }, + { + "epoch": 0.17144402379664683, + "grad_norm": 2.9422149658203125, + "learning_rate": 1.9999159975512156e-05, + "loss": 2.699, + "mean_token_accuracy": 0.44771695137023926, + "num_tokens": 323405039.0, + "step": 634 + }, + { + "epoch": 0.17171444023796648, + "grad_norm": 2.436999559402466, + "learning_rate": 1.9999138298697557e-05, + "loss": 2.9373, + "mean_token_accuracy": 0.43926042318344116, + "num_tokens": 323929194.0, + "step": 635 + }, + { + "epoch": 0.1719848566792861, + "grad_norm": 2.5499472618103027, + "learning_rate": 1.999911634576293e-05, + "loss": 3.0051, + "mean_token_accuracy": 0.4177851676940918, + "num_tokens": 324453443.0, + "step": 636 + }, + { + "epoch": 0.17225527312060573, + "grad_norm": 2.262681722640991, + "learning_rate": 1.9999094116708946e-05, + "loss": 2.9081, + "mean_token_accuracy": 0.4066230058670044, + "num_tokens": 324977561.0, + "step": 637 + }, + { + "epoch": 0.17252568956192538, + "grad_norm": 1.5451560020446777, + "learning_rate": 1.999907161153629e-05, + "loss": 2.7879, + "mean_token_accuracy": 0.4379664659500122, + "num_tokens": 325496390.0, + "step": 638 + }, + { + "epoch": 0.172796106003245, + "grad_norm": 2.6214535236358643, + "learning_rate": 1.9999048830245656e-05, + "loss": 2.9194, + "mean_token_accuracy": 0.3913092017173767, + "num_tokens": 326020659.0, + "step": 639 + }, + { + "epoch": 0.17306652244456464, + "grad_norm": 1.6569355726242065, + "learning_rate": 1.9999025772837738e-05, + "loss": 2.707, + "mean_token_accuracy": 0.43147456645965576, + "num_tokens": 326523978.0, + "step": 640 + }, + { + "epoch": 0.17333693888588425, + "grad_norm": 1.3857321739196777, + "learning_rate": 1.9999002439313247e-05, + "loss": 1.2347, + "mean_token_accuracy": 0.6757117509841919, + "num_tokens": 327048104.0, + "step": 641 + }, + { + "epoch": 0.1736073553272039, + "grad_norm": 5.139114856719971, + "learning_rate": 1.999897882967289e-05, + "loss": 2.601, + "mean_token_accuracy": 0.4483335614204407, + "num_tokens": 327572239.0, + "step": 642 + }, + { + "epoch": 0.17387777176852354, + "grad_norm": 4.7953972816467285, + "learning_rate": 1.9998954943917406e-05, + "loss": 2.8938, + "mean_token_accuracy": 0.41819316148757935, + "num_tokens": 328075910.0, + "step": 643 + }, + { + "epoch": 0.17414818820984315, + "grad_norm": 2.00699520111084, + "learning_rate": 1.9998930782047517e-05, + "loss": 2.8228, + "mean_token_accuracy": 0.43540772795677185, + "num_tokens": 328596445.0, + "step": 644 + }, + { + "epoch": 0.1744186046511628, + "grad_norm": 3.765657424926758, + "learning_rate": 1.999890634406397e-05, + "loss": 2.7764, + "mean_token_accuracy": 0.4303877353668213, + "num_tokens": 329120694.0, + "step": 645 + }, + { + "epoch": 0.1746890210924824, + "grad_norm": 3.836668014526367, + "learning_rate": 1.9998881629967508e-05, + "loss": 2.7229, + "mean_token_accuracy": 0.45726704597473145, + "num_tokens": 329607788.0, + "step": 646 + }, + { + "epoch": 0.17495943753380205, + "grad_norm": 2.456963539123535, + "learning_rate": 1.99988566397589e-05, + "loss": 2.6325, + "mean_token_accuracy": 0.45085904002189636, + "num_tokens": 330132020.0, + "step": 647 + }, + { + "epoch": 0.1752298539751217, + "grad_norm": 3.136883020401001, + "learning_rate": 1.9998831373438897e-05, + "loss": 2.9594, + "mean_token_accuracy": 0.41562730073928833, + "num_tokens": 330656183.0, + "step": 648 + }, + { + "epoch": 0.1755002704164413, + "grad_norm": 4.405272483825684, + "learning_rate": 1.999880583100829e-05, + "loss": 2.9714, + "mean_token_accuracy": 0.42371267080307007, + "num_tokens": 331180454.0, + "step": 649 + }, + { + "epoch": 0.17577068685776095, + "grad_norm": 2.1342978477478027, + "learning_rate": 1.999878001246785e-05, + "loss": 2.7408, + "mean_token_accuracy": 0.43118488788604736, + "num_tokens": 331704651.0, + "step": 650 + }, + { + "epoch": 0.1760411032990806, + "grad_norm": 2.9660022258758545, + "learning_rate": 1.9998753917818387e-05, + "loss": 2.8669, + "mean_token_accuracy": 0.4307316541671753, + "num_tokens": 332228852.0, + "step": 651 + }, + { + "epoch": 0.1763115197404002, + "grad_norm": 2.79703688621521, + "learning_rate": 1.999872754706068e-05, + "loss": 2.6279, + "mean_token_accuracy": 0.4448985755443573, + "num_tokens": 332708816.0, + "step": 652 + }, + { + "epoch": 0.17658193618171986, + "grad_norm": 7.079424858093262, + "learning_rate": 1.9998700900195553e-05, + "loss": 2.3975, + "mean_token_accuracy": 0.495478093624115, + "num_tokens": 333176191.0, + "step": 653 + }, + { + "epoch": 0.17685235262303947, + "grad_norm": 2.970374584197998, + "learning_rate": 1.999867397722382e-05, + "loss": 2.9868, + "mean_token_accuracy": 0.43993663787841797, + "num_tokens": 333607561.0, + "step": 654 + }, + { + "epoch": 0.17712276906435911, + "grad_norm": 2.0707454681396484, + "learning_rate": 1.99986467781463e-05, + "loss": 2.8287, + "mean_token_accuracy": 0.4099693298339844, + "num_tokens": 334080121.0, + "step": 655 + }, + { + "epoch": 0.17739318550567876, + "grad_norm": 2.1135735511779785, + "learning_rate": 1.9998619302963838e-05, + "loss": 2.8876, + "mean_token_accuracy": 0.42749151587486267, + "num_tokens": 334604363.0, + "step": 656 + }, + { + "epoch": 0.17766360194699837, + "grad_norm": 37.409393310546875, + "learning_rate": 1.9998591551677275e-05, + "loss": 2.9556, + "mean_token_accuracy": 0.3980798125267029, + "num_tokens": 335128583.0, + "step": 657 + }, + { + "epoch": 0.17793401838831802, + "grad_norm": 3.6970107555389404, + "learning_rate": 1.9998563524287456e-05, + "loss": 2.9943, + "mean_token_accuracy": 0.4131154417991638, + "num_tokens": 335652796.0, + "step": 658 + }, + { + "epoch": 0.17820443482963763, + "grad_norm": 2.333047866821289, + "learning_rate": 1.9998535220795246e-05, + "loss": 2.7223, + "mean_token_accuracy": 0.42146244645118713, + "num_tokens": 336177015.0, + "step": 659 + }, + { + "epoch": 0.17847485127095727, + "grad_norm": 1.7441996335983276, + "learning_rate": 1.9998506641201513e-05, + "loss": 2.8007, + "mean_token_accuracy": 0.41538572311401367, + "num_tokens": 336701285.0, + "step": 660 + }, + { + "epoch": 0.17874526771227692, + "grad_norm": 1.099034309387207, + "learning_rate": 1.9998477785507132e-05, + "loss": 1.1652, + "mean_token_accuracy": 0.7042285203933716, + "num_tokens": 337225559.0, + "step": 661 + }, + { + "epoch": 0.17901568415359653, + "grad_norm": 4.0340471267700195, + "learning_rate": 1.999844865371299e-05, + "loss": 2.9338, + "mean_token_accuracy": 0.4237003028392792, + "num_tokens": 337749771.0, + "step": 662 + }, + { + "epoch": 0.17928610059491618, + "grad_norm": 2.8697879314422607, + "learning_rate": 1.999841924581998e-05, + "loss": 2.7034, + "mean_token_accuracy": 0.44513019919395447, + "num_tokens": 338270800.0, + "step": 663 + }, + { + "epoch": 0.1795565170362358, + "grad_norm": 2.0722014904022217, + "learning_rate": 1.9998389561829007e-05, + "loss": 2.736, + "mean_token_accuracy": 0.44360607862472534, + "num_tokens": 338795066.0, + "step": 664 + }, + { + "epoch": 0.17982693347755543, + "grad_norm": 2.2405261993408203, + "learning_rate": 1.999835960174098e-05, + "loss": 2.7786, + "mean_token_accuracy": 0.43497708439826965, + "num_tokens": 339319243.0, + "step": 665 + }, + { + "epoch": 0.18009734991887508, + "grad_norm": 2.2531416416168213, + "learning_rate": 1.9998329365556815e-05, + "loss": 2.7268, + "mean_token_accuracy": 0.4271637797355652, + "num_tokens": 339843529.0, + "step": 666 + }, + { + "epoch": 0.1803677663601947, + "grad_norm": 2.2395617961883545, + "learning_rate": 1.9998298853277446e-05, + "loss": 2.8029, + "mean_token_accuracy": 0.4233623445034027, + "num_tokens": 340367713.0, + "step": 667 + }, + { + "epoch": 0.18063818280151434, + "grad_norm": 1.825899600982666, + "learning_rate": 1.9998268064903802e-05, + "loss": 2.9005, + "mean_token_accuracy": 0.413369745016098, + "num_tokens": 340891952.0, + "step": 668 + }, + { + "epoch": 0.18090859924283395, + "grad_norm": 2.156092643737793, + "learning_rate": 1.9998237000436834e-05, + "loss": 2.6821, + "mean_token_accuracy": 0.44483187794685364, + "num_tokens": 341416181.0, + "step": 669 + }, + { + "epoch": 0.1811790156841536, + "grad_norm": 2.143594741821289, + "learning_rate": 1.9998205659877495e-05, + "loss": 2.755, + "mean_token_accuracy": 0.44017478823661804, + "num_tokens": 341940344.0, + "step": 670 + }, + { + "epoch": 0.18144943212547324, + "grad_norm": 1.8540756702423096, + "learning_rate": 1.9998174043226744e-05, + "loss": 2.8394, + "mean_token_accuracy": 0.4221706986427307, + "num_tokens": 342464496.0, + "step": 671 + }, + { + "epoch": 0.18171984856679285, + "grad_norm": 2.226914882659912, + "learning_rate": 1.9998142150485546e-05, + "loss": 2.8126, + "mean_token_accuracy": 0.40587133169174194, + "num_tokens": 342988644.0, + "step": 672 + }, + { + "epoch": 0.1819902650081125, + "grad_norm": 2.0323524475097656, + "learning_rate": 1.999810998165489e-05, + "loss": 2.9509, + "mean_token_accuracy": 0.4203208386898041, + "num_tokens": 343512923.0, + "step": 673 + }, + { + "epoch": 0.18226068144943214, + "grad_norm": 1.8627537488937378, + "learning_rate": 1.9998077536735757e-05, + "loss": 2.6911, + "mean_token_accuracy": 0.4610963761806488, + "num_tokens": 344037081.0, + "step": 674 + }, + { + "epoch": 0.18253109789075175, + "grad_norm": 2.4906041622161865, + "learning_rate": 1.9998044815729145e-05, + "loss": 2.8005, + "mean_token_accuracy": 0.4658811688423157, + "num_tokens": 344425036.0, + "step": 675 + }, + { + "epoch": 0.1828015143320714, + "grad_norm": 1.80672025680542, + "learning_rate": 1.9998011818636053e-05, + "loss": 2.8472, + "mean_token_accuracy": 0.4298822283744812, + "num_tokens": 344949310.0, + "step": 676 + }, + { + "epoch": 0.183071930773391, + "grad_norm": 3.9960224628448486, + "learning_rate": 1.9997978545457495e-05, + "loss": 2.942, + "mean_token_accuracy": 0.4081610441207886, + "num_tokens": 345429154.0, + "step": 677 + }, + { + "epoch": 0.18334234721471065, + "grad_norm": 3.0029752254486084, + "learning_rate": 1.99979449961945e-05, + "loss": 2.9711, + "mean_token_accuracy": 0.4205109775066376, + "num_tokens": 345953410.0, + "step": 678 + }, + { + "epoch": 0.1836127636560303, + "grad_norm": 2.5413732528686523, + "learning_rate": 1.999791117084809e-05, + "loss": 3.0326, + "mean_token_accuracy": 0.43279772996902466, + "num_tokens": 346447932.0, + "step": 679 + }, + { + "epoch": 0.1838831800973499, + "grad_norm": 2.1942591667175293, + "learning_rate": 1.99978770694193e-05, + "loss": 2.613, + "mean_token_accuracy": 0.4636339843273163, + "num_tokens": 346972070.0, + "step": 680 + }, + { + "epoch": 0.18415359653866956, + "grad_norm": 1.471069097518921, + "learning_rate": 1.9997842691909185e-05, + "loss": 1.2528, + "mean_token_accuracy": 0.6746060848236084, + "num_tokens": 347496273.0, + "step": 681 + }, + { + "epoch": 0.18442401297998917, + "grad_norm": 3.426938533782959, + "learning_rate": 1.9997808038318793e-05, + "loss": 2.8672, + "mean_token_accuracy": 0.4197445511817932, + "num_tokens": 348020559.0, + "step": 682 + }, + { + "epoch": 0.18469442942130881, + "grad_norm": 2.67348575592041, + "learning_rate": 1.9997773108649194e-05, + "loss": 2.4701, + "mean_token_accuracy": 0.45234280824661255, + "num_tokens": 348544705.0, + "step": 683 + }, + { + "epoch": 0.18496484586262846, + "grad_norm": 1.9424597024917603, + "learning_rate": 1.999773790290145e-05, + "loss": 2.4775, + "mean_token_accuracy": 0.4679696261882782, + "num_tokens": 349068916.0, + "step": 684 + }, + { + "epoch": 0.18523526230394807, + "grad_norm": 2.4131624698638916, + "learning_rate": 1.9997702421076648e-05, + "loss": 2.7583, + "mean_token_accuracy": 0.4389876425266266, + "num_tokens": 349593075.0, + "step": 685 + }, + { + "epoch": 0.18550567874526772, + "grad_norm": 1.927712082862854, + "learning_rate": 1.9997666663175874e-05, + "loss": 2.7084, + "mean_token_accuracy": 0.44129326939582825, + "num_tokens": 350093871.0, + "step": 686 + }, + { + "epoch": 0.18577609518658733, + "grad_norm": 2.007850170135498, + "learning_rate": 1.999763062920023e-05, + "loss": 2.7712, + "mean_token_accuracy": 0.44687581062316895, + "num_tokens": 350564464.0, + "step": 687 + }, + { + "epoch": 0.18604651162790697, + "grad_norm": 2.1611640453338623, + "learning_rate": 1.9997594319150813e-05, + "loss": 2.7418, + "mean_token_accuracy": 0.4554537534713745, + "num_tokens": 351025320.0, + "step": 688 + }, + { + "epoch": 0.18631692806922662, + "grad_norm": 1.7746760845184326, + "learning_rate": 1.999755773302875e-05, + "loss": 2.6879, + "mean_token_accuracy": 0.43704330921173096, + "num_tokens": 351549434.0, + "step": 689 + }, + { + "epoch": 0.18658734451054623, + "grad_norm": 1.6443603038787842, + "learning_rate": 1.9997520870835147e-05, + "loss": 2.74, + "mean_token_accuracy": 0.44621390104293823, + "num_tokens": 352073625.0, + "step": 690 + }, + { + "epoch": 0.18685776095186588, + "grad_norm": 2.395350933074951, + "learning_rate": 1.9997483732571145e-05, + "loss": 2.6175, + "mean_token_accuracy": 0.44413650035858154, + "num_tokens": 352597735.0, + "step": 691 + }, + { + "epoch": 0.18712817739318552, + "grad_norm": 1.8625091314315796, + "learning_rate": 1.9997446318237887e-05, + "loss": 2.8121, + "mean_token_accuracy": 0.4374842047691345, + "num_tokens": 353121933.0, + "step": 692 + }, + { + "epoch": 0.18739859383450513, + "grad_norm": 3.009540319442749, + "learning_rate": 1.9997408627836512e-05, + "loss": 2.4705, + "mean_token_accuracy": 0.4869333505630493, + "num_tokens": 353611461.0, + "step": 693 + }, + { + "epoch": 0.18766901027582478, + "grad_norm": 2.1808507442474365, + "learning_rate": 1.999737066136818e-05, + "loss": 2.6892, + "mean_token_accuracy": 0.43497636914253235, + "num_tokens": 354119427.0, + "step": 694 + }, + { + "epoch": 0.1879394267171444, + "grad_norm": 2.118950366973877, + "learning_rate": 1.9997332418834062e-05, + "loss": 2.7648, + "mean_token_accuracy": 0.436579167842865, + "num_tokens": 354641531.0, + "step": 695 + }, + { + "epoch": 0.18820984315846404, + "grad_norm": 2.0059151649475098, + "learning_rate": 1.999729390023532e-05, + "loss": 2.81, + "mean_token_accuracy": 0.4278535842895508, + "num_tokens": 355137096.0, + "step": 696 + }, + { + "epoch": 0.18848025959978368, + "grad_norm": 1.7780137062072754, + "learning_rate": 1.9997255105573146e-05, + "loss": 2.8477, + "mean_token_accuracy": 0.42980000376701355, + "num_tokens": 355661367.0, + "step": 697 + }, + { + "epoch": 0.1887506760411033, + "grad_norm": 1.7544623613357544, + "learning_rate": 1.9997216034848724e-05, + "loss": 2.6438, + "mean_token_accuracy": 0.4669966697692871, + "num_tokens": 356185642.0, + "step": 698 + }, + { + "epoch": 0.18902109248242294, + "grad_norm": 1.6880067586898804, + "learning_rate": 1.9997176688063257e-05, + "loss": 2.6093, + "mean_token_accuracy": 0.44194886088371277, + "num_tokens": 356709802.0, + "step": 699 + }, + { + "epoch": 0.18929150892374255, + "grad_norm": 2.6645684242248535, + "learning_rate": 1.9997137065217945e-05, + "loss": 2.5786, + "mean_token_accuracy": 0.5095893144607544, + "num_tokens": 357145747.0, + "step": 700 + }, + { + "epoch": 0.1895619253650622, + "grad_norm": 1.430101990699768, + "learning_rate": 1.999709716631402e-05, + "loss": 1.269, + "mean_token_accuracy": 0.6721527576446533, + "num_tokens": 357670025.0, + "step": 701 + }, + { + "epoch": 0.18983234180638184, + "grad_norm": 4.3114848136901855, + "learning_rate": 1.9997056991352682e-05, + "loss": 2.8249, + "mean_token_accuracy": 0.42162877321243286, + "num_tokens": 358194288.0, + "step": 702 + }, + { + "epoch": 0.19010275824770145, + "grad_norm": 3.191159725189209, + "learning_rate": 1.9997016540335184e-05, + "loss": 2.6965, + "mean_token_accuracy": 0.44403257966041565, + "num_tokens": 358700668.0, + "step": 703 + }, + { + "epoch": 0.1903731746890211, + "grad_norm": 2.181873321533203, + "learning_rate": 1.999697581326276e-05, + "loss": 2.6144, + "mean_token_accuracy": 0.4556272029876709, + "num_tokens": 359163531.0, + "step": 704 + }, + { + "epoch": 0.1906435911303407, + "grad_norm": 2.7328739166259766, + "learning_rate": 1.9996934810136658e-05, + "loss": 2.5977, + "mean_token_accuracy": 0.45607078075408936, + "num_tokens": 359687688.0, + "step": 705 + }, + { + "epoch": 0.19091400757166035, + "grad_norm": 2.3361902236938477, + "learning_rate": 1.999689353095814e-05, + "loss": 2.7112, + "mean_token_accuracy": 0.451831191778183, + "num_tokens": 360159230.0, + "step": 706 + }, + { + "epoch": 0.19118442401298, + "grad_norm": 3.6996524333953857, + "learning_rate": 1.9996851975728468e-05, + "loss": 2.651, + "mean_token_accuracy": 0.4561792016029358, + "num_tokens": 360659859.0, + "step": 707 + }, + { + "epoch": 0.1914548404542996, + "grad_norm": 2.6385059356689453, + "learning_rate": 1.9996810144448918e-05, + "loss": 2.9334, + "mean_token_accuracy": 0.42474573850631714, + "num_tokens": 361184099.0, + "step": 708 + }, + { + "epoch": 0.19172525689561926, + "grad_norm": 2.2888975143432617, + "learning_rate": 1.999676803712078e-05, + "loss": 2.8524, + "mean_token_accuracy": 0.43822014331817627, + "num_tokens": 361708357.0, + "step": 709 + }, + { + "epoch": 0.1919956733369389, + "grad_norm": 2.9634196758270264, + "learning_rate": 1.9996725653745337e-05, + "loss": 2.834, + "mean_token_accuracy": 0.4446816146373749, + "num_tokens": 362232628.0, + "step": 710 + }, + { + "epoch": 0.19226608977825851, + "grad_norm": 1.8541686534881592, + "learning_rate": 1.999668299432389e-05, + "loss": 2.4824, + "mean_token_accuracy": 0.456092894077301, + "num_tokens": 362756743.0, + "step": 711 + }, + { + "epoch": 0.19253650621957816, + "grad_norm": 2.9048593044281006, + "learning_rate": 1.999664005885776e-05, + "loss": 2.8421, + "mean_token_accuracy": 0.41734200716018677, + "num_tokens": 363269756.0, + "step": 712 + }, + { + "epoch": 0.19280692266089777, + "grad_norm": 2.8825292587280273, + "learning_rate": 1.999659684734825e-05, + "loss": 2.7982, + "mean_token_accuracy": 0.45588597655296326, + "num_tokens": 363793962.0, + "step": 713 + }, + { + "epoch": 0.19307733910221742, + "grad_norm": 2.013528347015381, + "learning_rate": 1.999655335979669e-05, + "loss": 2.8293, + "mean_token_accuracy": 0.4244559109210968, + "num_tokens": 364287594.0, + "step": 714 + }, + { + "epoch": 0.19334775554353706, + "grad_norm": 2.386591911315918, + "learning_rate": 1.999650959620442e-05, + "loss": 2.8011, + "mean_token_accuracy": 0.44258803129196167, + "num_tokens": 364811812.0, + "step": 715 + }, + { + "epoch": 0.19361817198485667, + "grad_norm": 2.2539470195770264, + "learning_rate": 1.9996465556572778e-05, + "loss": 2.7558, + "mean_token_accuracy": 0.4282553791999817, + "num_tokens": 365336069.0, + "step": 716 + }, + { + "epoch": 0.19388858842617632, + "grad_norm": 2.302373170852661, + "learning_rate": 1.9996421240903114e-05, + "loss": 2.8482, + "mean_token_accuracy": 0.4233928918838501, + "num_tokens": 365842643.0, + "step": 717 + }, + { + "epoch": 0.19415900486749593, + "grad_norm": 2.2194559574127197, + "learning_rate": 1.999637664919679e-05, + "loss": 2.4435, + "mean_token_accuracy": 0.4621792435646057, + "num_tokens": 366340445.0, + "step": 718 + }, + { + "epoch": 0.19442942130881558, + "grad_norm": 2.5635483264923096, + "learning_rate": 1.9996331781455174e-05, + "loss": 2.8256, + "mean_token_accuracy": 0.4242889881134033, + "num_tokens": 366864657.0, + "step": 719 + }, + { + "epoch": 0.19469983775013522, + "grad_norm": 1.750580906867981, + "learning_rate": 1.999628663767964e-05, + "loss": 2.8933, + "mean_token_accuracy": 0.4524526596069336, + "num_tokens": 367317229.0, + "step": 720 + }, + { + "epoch": 0.19497025419145483, + "grad_norm": 1.3734302520751953, + "learning_rate": 1.999624121787158e-05, + "loss": 1.2905, + "mean_token_accuracy": 0.6694660186767578, + "num_tokens": 367729856.0, + "step": 721 + }, + { + "epoch": 0.19524067063277448, + "grad_norm": 3.3785667419433594, + "learning_rate": 1.999619552203238e-05, + "loss": 2.7689, + "mean_token_accuracy": 0.43391531705856323, + "num_tokens": 368254130.0, + "step": 722 + }, + { + "epoch": 0.1955110870740941, + "grad_norm": 2.6135382652282715, + "learning_rate": 1.9996149550163448e-05, + "loss": 2.5201, + "mean_token_accuracy": 0.4525967836380005, + "num_tokens": 368778318.0, + "step": 723 + }, + { + "epoch": 0.19578150351541374, + "grad_norm": 1.9996188879013062, + "learning_rate": 1.999610330226619e-05, + "loss": 2.7335, + "mean_token_accuracy": 0.4402751922607422, + "num_tokens": 369302442.0, + "step": 724 + }, + { + "epoch": 0.19605191995673338, + "grad_norm": 2.463981866836548, + "learning_rate": 1.9996056778342027e-05, + "loss": 2.7976, + "mean_token_accuracy": 0.4536248445510864, + "num_tokens": 369766708.0, + "step": 725 + }, + { + "epoch": 0.196322336398053, + "grad_norm": 2.137066602706909, + "learning_rate": 1.9996009978392386e-05, + "loss": 2.7529, + "mean_token_accuracy": 0.43890005350112915, + "num_tokens": 370290981.0, + "step": 726 + }, + { + "epoch": 0.19659275283937264, + "grad_norm": 2.1860105991363525, + "learning_rate": 1.9995962902418703e-05, + "loss": 2.7856, + "mean_token_accuracy": 0.4441436529159546, + "num_tokens": 370815256.0, + "step": 727 + }, + { + "epoch": 0.19686316928069228, + "grad_norm": 1.6951249837875366, + "learning_rate": 1.9995915550422423e-05, + "loss": 2.6792, + "mean_token_accuracy": 0.45026707649230957, + "num_tokens": 371303021.0, + "step": 728 + }, + { + "epoch": 0.1971335857220119, + "grad_norm": 2.1696414947509766, + "learning_rate": 1.9995867922405e-05, + "loss": 2.8863, + "mean_token_accuracy": 0.4363475441932678, + "num_tokens": 371827186.0, + "step": 729 + }, + { + "epoch": 0.19740400216333154, + "grad_norm": 1.983933925628662, + "learning_rate": 1.9995820018367893e-05, + "loss": 2.5501, + "mean_token_accuracy": 0.471052885055542, + "num_tokens": 372302047.0, + "step": 730 + }, + { + "epoch": 0.19767441860465115, + "grad_norm": 1.7416168451309204, + "learning_rate": 1.9995771838312574e-05, + "loss": 2.8434, + "mean_token_accuracy": 0.43278342485427856, + "num_tokens": 372826268.0, + "step": 731 + }, + { + "epoch": 0.1979448350459708, + "grad_norm": 1.8836878538131714, + "learning_rate": 1.999572338224052e-05, + "loss": 2.8663, + "mean_token_accuracy": 0.42264869809150696, + "num_tokens": 373350446.0, + "step": 732 + }, + { + "epoch": 0.19821525148729044, + "grad_norm": 2.488948345184326, + "learning_rate": 1.9995674650153215e-05, + "loss": 2.5539, + "mean_token_accuracy": 0.49662840366363525, + "num_tokens": 373874711.0, + "step": 733 + }, + { + "epoch": 0.19848566792861005, + "grad_norm": 2.098832607269287, + "learning_rate": 1.9995625642052164e-05, + "loss": 2.6499, + "mean_token_accuracy": 0.4498678147792816, + "num_tokens": 374398942.0, + "step": 734 + }, + { + "epoch": 0.1987560843699297, + "grad_norm": 1.8330438137054443, + "learning_rate": 1.9995576357938854e-05, + "loss": 2.7903, + "mean_token_accuracy": 0.44769757986068726, + "num_tokens": 374863891.0, + "step": 735 + }, + { + "epoch": 0.1990265008112493, + "grad_norm": 2.0514822006225586, + "learning_rate": 1.9995526797814813e-05, + "loss": 2.8848, + "mean_token_accuracy": 0.42612192034721375, + "num_tokens": 375388089.0, + "step": 736 + }, + { + "epoch": 0.19929691725256896, + "grad_norm": 2.0668201446533203, + "learning_rate": 1.9995476961681557e-05, + "loss": 2.7836, + "mean_token_accuracy": 0.4444893002510071, + "num_tokens": 375912370.0, + "step": 737 + }, + { + "epoch": 0.1995673336938886, + "grad_norm": 1.6903225183486938, + "learning_rate": 1.999542684954061e-05, + "loss": 2.5734, + "mean_token_accuracy": 0.45527389645576477, + "num_tokens": 376436580.0, + "step": 738 + }, + { + "epoch": 0.19983775013520821, + "grad_norm": 1.8675816059112549, + "learning_rate": 1.9995376461393516e-05, + "loss": 2.6109, + "mean_token_accuracy": 0.45463240146636963, + "num_tokens": 376930306.0, + "step": 739 + }, + { + "epoch": 0.20010816657652786, + "grad_norm": 1.9875717163085938, + "learning_rate": 1.999532579724182e-05, + "loss": 2.8458, + "mean_token_accuracy": 0.44198447465896606, + "num_tokens": 377395960.0, + "step": 740 + }, + { + "epoch": 0.20037858301784747, + "grad_norm": 1.35246741771698, + "learning_rate": 1.999527485708707e-05, + "loss": 1.2797, + "mean_token_accuracy": 0.6569004654884338, + "num_tokens": 377917188.0, + "step": 741 + }, + { + "epoch": 0.20064899945916712, + "grad_norm": 3.256295680999756, + "learning_rate": 1.9995223640930838e-05, + "loss": 2.4721, + "mean_token_accuracy": 0.48675286769866943, + "num_tokens": 378385374.0, + "step": 742 + }, + { + "epoch": 0.20091941590048676, + "grad_norm": 3.350527048110962, + "learning_rate": 1.9995172148774694e-05, + "loss": 2.796, + "mean_token_accuracy": 0.42876648902893066, + "num_tokens": 378909647.0, + "step": 743 + }, + { + "epoch": 0.20118983234180637, + "grad_norm": 1.7663207054138184, + "learning_rate": 1.999512038062021e-05, + "loss": 2.5665, + "mean_token_accuracy": 0.44863319396972656, + "num_tokens": 379427259.0, + "step": 744 + }, + { + "epoch": 0.20146024878312602, + "grad_norm": 3.6261661052703857, + "learning_rate": 1.999506833646898e-05, + "loss": 2.532, + "mean_token_accuracy": 0.46196267008781433, + "num_tokens": 379921272.0, + "step": 745 + }, + { + "epoch": 0.20173066522444566, + "grad_norm": 2.4188711643218994, + "learning_rate": 1.9995016016322604e-05, + "loss": 2.8439, + "mean_token_accuracy": 0.41228556632995605, + "num_tokens": 380445413.0, + "step": 746 + }, + { + "epoch": 0.20200108166576528, + "grad_norm": 4.863891124725342, + "learning_rate": 1.9994963420182684e-05, + "loss": 2.6334, + "mean_token_accuracy": 0.4872071444988251, + "num_tokens": 380969533.0, + "step": 747 + }, + { + "epoch": 0.20227149810708492, + "grad_norm": 2.5610365867614746, + "learning_rate": 1.999491054805083e-05, + "loss": 2.9403, + "mean_token_accuracy": 0.42162197828292847, + "num_tokens": 381493803.0, + "step": 748 + }, + { + "epoch": 0.20254191454840453, + "grad_norm": 3.0087571144104004, + "learning_rate": 1.999485739992867e-05, + "loss": 2.5066, + "mean_token_accuracy": 0.462034672498703, + "num_tokens": 382018023.0, + "step": 749 + }, + { + "epoch": 0.20281233098972418, + "grad_norm": 2.3481040000915527, + "learning_rate": 1.999480397581783e-05, + "loss": 2.7787, + "mean_token_accuracy": 0.4320737421512604, + "num_tokens": 382520835.0, + "step": 750 + }, + { + "epoch": 0.20308274743104382, + "grad_norm": 2.6653919219970703, + "learning_rate": 1.9994750275719958e-05, + "loss": 2.8598, + "mean_token_accuracy": 0.42903023958206177, + "num_tokens": 383045013.0, + "step": 751 + }, + { + "epoch": 0.20335316387236343, + "grad_norm": 1.9944483041763306, + "learning_rate": 1.999469629963669e-05, + "loss": 2.72, + "mean_token_accuracy": 0.4275173842906952, + "num_tokens": 383569213.0, + "step": 752 + }, + { + "epoch": 0.20362358031368308, + "grad_norm": 2.2991185188293457, + "learning_rate": 1.9994642047569694e-05, + "loss": 2.6312, + "mean_token_accuracy": 0.44857046008110046, + "num_tokens": 384093436.0, + "step": 753 + }, + { + "epoch": 0.2038939967550027, + "grad_norm": 2.5286550521850586, + "learning_rate": 1.9994587519520626e-05, + "loss": 2.806, + "mean_token_accuracy": 0.42991143465042114, + "num_tokens": 384615516.0, + "step": 754 + }, + { + "epoch": 0.20416441319632234, + "grad_norm": 1.874782681465149, + "learning_rate": 1.9994532715491157e-05, + "loss": 2.6431, + "mean_token_accuracy": 0.4335857629776001, + "num_tokens": 385139754.0, + "step": 755 + }, + { + "epoch": 0.20443482963764198, + "grad_norm": 3.380159378051758, + "learning_rate": 1.9994477635482976e-05, + "loss": 2.7596, + "mean_token_accuracy": 0.4524844288825989, + "num_tokens": 385663948.0, + "step": 756 + }, + { + "epoch": 0.2047052460789616, + "grad_norm": 2.385660409927368, + "learning_rate": 1.9994422279497772e-05, + "loss": 2.6869, + "mean_token_accuracy": 0.4570015072822571, + "num_tokens": 386188215.0, + "step": 757 + }, + { + "epoch": 0.20497566252028124, + "grad_norm": 2.239384651184082, + "learning_rate": 1.9994366647537244e-05, + "loss": 2.7127, + "mean_token_accuracy": 0.4573313295841217, + "num_tokens": 386690203.0, + "step": 758 + }, + { + "epoch": 0.20524607896160085, + "grad_norm": 2.1415300369262695, + "learning_rate": 1.9994310739603088e-05, + "loss": 2.7071, + "mean_token_accuracy": 0.4446750283241272, + "num_tokens": 387214342.0, + "step": 759 + }, + { + "epoch": 0.2055164954029205, + "grad_norm": 1.7967455387115479, + "learning_rate": 1.9994254555697032e-05, + "loss": 2.6056, + "mean_token_accuracy": 0.4446844160556793, + "num_tokens": 387738427.0, + "step": 760 + }, + { + "epoch": 0.20578691184424014, + "grad_norm": 1.306820034980774, + "learning_rate": 1.99941980958208e-05, + "loss": 1.2837, + "mean_token_accuracy": 0.6361311674118042, + "num_tokens": 388237627.0, + "step": 761 + }, + { + "epoch": 0.20605732828555975, + "grad_norm": 3.3511879444122314, + "learning_rate": 1.9994141359976115e-05, + "loss": 2.6593, + "mean_token_accuracy": 0.44648051261901855, + "num_tokens": 388761802.0, + "step": 762 + }, + { + "epoch": 0.2063277447268794, + "grad_norm": 3.432685375213623, + "learning_rate": 1.9994084348164723e-05, + "loss": 2.8129, + "mean_token_accuracy": 0.44877883791923523, + "num_tokens": 389224103.0, + "step": 763 + }, + { + "epoch": 0.206598161168199, + "grad_norm": 1.6057885885238647, + "learning_rate": 1.9994027060388376e-05, + "loss": 2.84, + "mean_token_accuracy": 0.4378078877925873, + "num_tokens": 389707308.0, + "step": 764 + }, + { + "epoch": 0.20686857760951866, + "grad_norm": 2.320617437362671, + "learning_rate": 1.9993969496648828e-05, + "loss": 2.761, + "mean_token_accuracy": 0.43548545241355896, + "num_tokens": 390231483.0, + "step": 765 + }, + { + "epoch": 0.2071389940508383, + "grad_norm": 2.016535997390747, + "learning_rate": 1.999391165694785e-05, + "loss": 2.744, + "mean_token_accuracy": 0.4371589124202728, + "num_tokens": 390755557.0, + "step": 766 + }, + { + "epoch": 0.20740941049215791, + "grad_norm": 1.706797480583191, + "learning_rate": 1.999385354128721e-05, + "loss": 2.6278, + "mean_token_accuracy": 0.4286634922027588, + "num_tokens": 391279741.0, + "step": 767 + }, + { + "epoch": 0.20767982693347756, + "grad_norm": 1.9395062923431396, + "learning_rate": 1.9993795149668694e-05, + "loss": 2.6972, + "mean_token_accuracy": 0.4521328806877136, + "num_tokens": 391784530.0, + "step": 768 + }, + { + "epoch": 0.2079502433747972, + "grad_norm": 1.793442964553833, + "learning_rate": 1.999373648209409e-05, + "loss": 2.7236, + "mean_token_accuracy": 0.46333569288253784, + "num_tokens": 392253956.0, + "step": 769 + }, + { + "epoch": 0.20822065981611682, + "grad_norm": 1.998964786529541, + "learning_rate": 1.9993677538565207e-05, + "loss": 2.7674, + "mean_token_accuracy": 0.43837398290634155, + "num_tokens": 392778070.0, + "step": 770 + }, + { + "epoch": 0.20849107625743646, + "grad_norm": 1.9208698272705078, + "learning_rate": 1.9993618319083844e-05, + "loss": 2.731, + "mean_token_accuracy": 0.4383993148803711, + "num_tokens": 393302256.0, + "step": 771 + }, + { + "epoch": 0.20876149269875607, + "grad_norm": 1.7445815801620483, + "learning_rate": 1.999355882365183e-05, + "loss": 2.7044, + "mean_token_accuracy": 0.4370799660682678, + "num_tokens": 393826491.0, + "step": 772 + }, + { + "epoch": 0.20903190914007572, + "grad_norm": 1.934018850326538, + "learning_rate": 1.9993499052270974e-05, + "loss": 2.9414, + "mean_token_accuracy": 0.4244349002838135, + "num_tokens": 394350762.0, + "step": 773 + }, + { + "epoch": 0.20930232558139536, + "grad_norm": 1.9820928573608398, + "learning_rate": 1.9993439004943126e-05, + "loss": 2.7104, + "mean_token_accuracy": 0.4259346127510071, + "num_tokens": 394874974.0, + "step": 774 + }, + { + "epoch": 0.20957274202271498, + "grad_norm": 2.1796233654022217, + "learning_rate": 1.9993378681670117e-05, + "loss": 2.9597, + "mean_token_accuracy": 0.4333246648311615, + "num_tokens": 395375316.0, + "step": 775 + }, + { + "epoch": 0.20984315846403462, + "grad_norm": 2.3094804286956787, + "learning_rate": 1.9993318082453802e-05, + "loss": 2.6195, + "mean_token_accuracy": 0.4543563723564148, + "num_tokens": 395872271.0, + "step": 776 + }, + { + "epoch": 0.21011357490535423, + "grad_norm": 2.5621023178100586, + "learning_rate": 1.9993257207296046e-05, + "loss": 2.8181, + "mean_token_accuracy": 0.4396500289440155, + "num_tokens": 396378691.0, + "step": 777 + }, + { + "epoch": 0.21038399134667388, + "grad_norm": 2.1687259674072266, + "learning_rate": 1.9993196056198708e-05, + "loss": 2.7065, + "mean_token_accuracy": 0.4496220350265503, + "num_tokens": 396902853.0, + "step": 778 + }, + { + "epoch": 0.21065440778799352, + "grad_norm": 2.677537441253662, + "learning_rate": 1.9993134629163666e-05, + "loss": 2.85, + "mean_token_accuracy": 0.4486226439476013, + "num_tokens": 397427014.0, + "step": 779 + }, + { + "epoch": 0.21092482422931313, + "grad_norm": 2.3361315727233887, + "learning_rate": 1.9993072926192808e-05, + "loss": 2.5021, + "mean_token_accuracy": 0.47306957840919495, + "num_tokens": 397951141.0, + "step": 780 + }, + { + "epoch": 0.21119524067063278, + "grad_norm": 1.596846580505371, + "learning_rate": 1.999301094728802e-05, + "loss": 1.2769, + "mean_token_accuracy": 0.6666742563247681, + "num_tokens": 398475416.0, + "step": 781 + }, + { + "epoch": 0.2114656571119524, + "grad_norm": 4.063570499420166, + "learning_rate": 1.999294869245122e-05, + "loss": 2.8342, + "mean_token_accuracy": 0.43356114625930786, + "num_tokens": 398999594.0, + "step": 782 + }, + { + "epoch": 0.21173607355327204, + "grad_norm": 3.657670736312866, + "learning_rate": 1.9992886161684303e-05, + "loss": 2.662, + "mean_token_accuracy": 0.4252784252166748, + "num_tokens": 399523717.0, + "step": 783 + }, + { + "epoch": 0.21200648999459168, + "grad_norm": 1.5857378244400024, + "learning_rate": 1.9992823354989193e-05, + "loss": 2.7705, + "mean_token_accuracy": 0.4380488395690918, + "num_tokens": 400047828.0, + "step": 784 + }, + { + "epoch": 0.2122769064359113, + "grad_norm": 2.273790121078491, + "learning_rate": 1.9992760272367815e-05, + "loss": 2.7783, + "mean_token_accuracy": 0.44570472836494446, + "num_tokens": 400572046.0, + "step": 785 + }, + { + "epoch": 0.21254732287723094, + "grad_norm": 2.1010797023773193, + "learning_rate": 1.9992696913822107e-05, + "loss": 2.7833, + "mean_token_accuracy": 0.45876237750053406, + "num_tokens": 401032377.0, + "step": 786 + }, + { + "epoch": 0.21281773931855058, + "grad_norm": 2.1597847938537598, + "learning_rate": 1.999263327935401e-05, + "loss": 2.7284, + "mean_token_accuracy": 0.4352552592754364, + "num_tokens": 401556587.0, + "step": 787 + }, + { + "epoch": 0.2130881557598702, + "grad_norm": 1.8457057476043701, + "learning_rate": 1.999256936896548e-05, + "loss": 2.7962, + "mean_token_accuracy": 0.4385703206062317, + "num_tokens": 402080790.0, + "step": 788 + }, + { + "epoch": 0.21335857220118984, + "grad_norm": 1.5027356147766113, + "learning_rate": 1.9992505182658473e-05, + "loss": 2.7511, + "mean_token_accuracy": 0.4551544189453125, + "num_tokens": 402555246.0, + "step": 789 + }, + { + "epoch": 0.21362898864250945, + "grad_norm": 1.7867575883865356, + "learning_rate": 1.9992440720434964e-05, + "loss": 2.5725, + "mean_token_accuracy": 0.4531986117362976, + "num_tokens": 403025935.0, + "step": 790 + }, + { + "epoch": 0.2138994050838291, + "grad_norm": 1.7198728322982788, + "learning_rate": 1.999237598229693e-05, + "loss": 2.6458, + "mean_token_accuracy": 0.4823717474937439, + "num_tokens": 403536485.0, + "step": 791 + }, + { + "epoch": 0.21416982152514874, + "grad_norm": 2.291548490524292, + "learning_rate": 1.9992310968246356e-05, + "loss": 2.9012, + "mean_token_accuracy": 0.43938297033309937, + "num_tokens": 404060594.0, + "step": 792 + }, + { + "epoch": 0.21444023796646836, + "grad_norm": 7.639558792114258, + "learning_rate": 1.9992245678285235e-05, + "loss": 2.4626, + "mean_token_accuracy": 0.5089179873466492, + "num_tokens": 404566275.0, + "step": 793 + }, + { + "epoch": 0.214710654407788, + "grad_norm": 2.6382458209991455, + "learning_rate": 1.9992180112415577e-05, + "loss": 2.6021, + "mean_token_accuracy": 0.46228328347206116, + "num_tokens": 405057265.0, + "step": 794 + }, + { + "epoch": 0.21498107084910761, + "grad_norm": 2.2355077266693115, + "learning_rate": 1.9992114270639383e-05, + "loss": 2.9667, + "mean_token_accuracy": 0.4426376521587372, + "num_tokens": 405521375.0, + "step": 795 + }, + { + "epoch": 0.21525148729042726, + "grad_norm": 2.4202685356140137, + "learning_rate": 1.999204815295868e-05, + "loss": 2.4851, + "mean_token_accuracy": 0.4660003185272217, + "num_tokens": 406045534.0, + "step": 796 + }, + { + "epoch": 0.2155219037317469, + "grad_norm": 2.404616117477417, + "learning_rate": 1.9991981759375495e-05, + "loss": 3.0173, + "mean_token_accuracy": 0.4160747230052948, + "num_tokens": 406569780.0, + "step": 797 + }, + { + "epoch": 0.21579232017306652, + "grad_norm": 2.3390793800354004, + "learning_rate": 1.9991915089891868e-05, + "loss": 2.7896, + "mean_token_accuracy": 0.43899887800216675, + "num_tokens": 407094060.0, + "step": 798 + }, + { + "epoch": 0.21606273661438616, + "grad_norm": 2.2080419063568115, + "learning_rate": 1.9991848144509843e-05, + "loss": 2.7115, + "mean_token_accuracy": 0.4532327950000763, + "num_tokens": 407618211.0, + "step": 799 + }, + { + "epoch": 0.21633315305570577, + "grad_norm": 2.6655983924865723, + "learning_rate": 1.999178092323147e-05, + "loss": 2.8065, + "mean_token_accuracy": 0.4463866949081421, + "num_tokens": 408103002.0, + "step": 800 + }, + { + "epoch": 0.21660356949702542, + "grad_norm": 1.3765515089035034, + "learning_rate": 1.9991713426058822e-05, + "loss": 1.2722, + "mean_token_accuracy": 0.6616781949996948, + "num_tokens": 408595773.0, + "step": 801 + }, + { + "epoch": 0.21687398593834506, + "grad_norm": 2.324983596801758, + "learning_rate": 1.9991645652993962e-05, + "loss": 2.4741, + "mean_token_accuracy": 0.4604479670524597, + "num_tokens": 409092754.0, + "step": 802 + }, + { + "epoch": 0.21714440237966467, + "grad_norm": 2.0328567028045654, + "learning_rate": 1.9991577604038965e-05, + "loss": 2.7952, + "mean_token_accuracy": 0.4356670677661896, + "num_tokens": 409616917.0, + "step": 803 + }, + { + "epoch": 0.21741481882098432, + "grad_norm": 1.6132938861846924, + "learning_rate": 1.9991509279195925e-05, + "loss": 2.6859, + "mean_token_accuracy": 0.4429349899291992, + "num_tokens": 410094437.0, + "step": 804 + }, + { + "epoch": 0.21768523526230396, + "grad_norm": 2.049680233001709, + "learning_rate": 1.9991440678466942e-05, + "loss": 2.7103, + "mean_token_accuracy": 0.4497126638889313, + "num_tokens": 410618576.0, + "step": 805 + }, + { + "epoch": 0.21795565170362358, + "grad_norm": 1.8374783992767334, + "learning_rate": 1.9991371801854114e-05, + "loss": 2.8038, + "mean_token_accuracy": 0.45699578523635864, + "num_tokens": 411142682.0, + "step": 806 + }, + { + "epoch": 0.21822606814494322, + "grad_norm": 1.8821492195129395, + "learning_rate": 1.9991302649359562e-05, + "loss": 2.735, + "mean_token_accuracy": 0.4497929811477661, + "num_tokens": 411666860.0, + "step": 807 + }, + { + "epoch": 0.21849648458626283, + "grad_norm": 1.5248944759368896, + "learning_rate": 1.9991233220985397e-05, + "loss": 2.8479, + "mean_token_accuracy": 0.43538951873779297, + "num_tokens": 412191073.0, + "step": 808 + }, + { + "epoch": 0.21876690102758248, + "grad_norm": 1.4771031141281128, + "learning_rate": 1.999116351673376e-05, + "loss": 2.7913, + "mean_token_accuracy": 0.4308989942073822, + "num_tokens": 412715221.0, + "step": 809 + }, + { + "epoch": 0.21903731746890212, + "grad_norm": 1.446300745010376, + "learning_rate": 1.9991093536606783e-05, + "loss": 2.6403, + "mean_token_accuracy": 0.45836079120635986, + "num_tokens": 413214071.0, + "step": 810 + }, + { + "epoch": 0.21930773391022174, + "grad_norm": 1.8399934768676758, + "learning_rate": 1.9991023280606615e-05, + "loss": 2.4273, + "mean_token_accuracy": 0.4784989356994629, + "num_tokens": 413738191.0, + "step": 811 + }, + { + "epoch": 0.21957815035154138, + "grad_norm": 2.3474793434143066, + "learning_rate": 1.999095274873541e-05, + "loss": 2.8413, + "mean_token_accuracy": 0.442931592464447, + "num_tokens": 414262372.0, + "step": 812 + }, + { + "epoch": 0.219848566792861, + "grad_norm": 1.9467928409576416, + "learning_rate": 1.999088194099534e-05, + "loss": 2.6941, + "mean_token_accuracy": 0.4526805877685547, + "num_tokens": 414770297.0, + "step": 813 + }, + { + "epoch": 0.22011898323418064, + "grad_norm": 1.5790438652038574, + "learning_rate": 1.9990810857388564e-05, + "loss": 2.6404, + "mean_token_accuracy": 0.45850059390068054, + "num_tokens": 415268663.0, + "step": 814 + }, + { + "epoch": 0.22038939967550028, + "grad_norm": 1.467049241065979, + "learning_rate": 1.9990739497917274e-05, + "loss": 2.6879, + "mean_token_accuracy": 0.45193207263946533, + "num_tokens": 415748069.0, + "step": 815 + }, + { + "epoch": 0.2206598161168199, + "grad_norm": 1.756256103515625, + "learning_rate": 1.9990667862583657e-05, + "loss": 2.5174, + "mean_token_accuracy": 0.4816906452178955, + "num_tokens": 416269186.0, + "step": 816 + }, + { + "epoch": 0.22093023255813954, + "grad_norm": 1.7757713794708252, + "learning_rate": 1.9990595951389908e-05, + "loss": 2.8098, + "mean_token_accuracy": 0.4571903944015503, + "num_tokens": 416685802.0, + "step": 817 + }, + { + "epoch": 0.22120064899945915, + "grad_norm": 1.9483126401901245, + "learning_rate": 1.999052376433824e-05, + "loss": 2.5623, + "mean_token_accuracy": 0.44258207082748413, + "num_tokens": 417209896.0, + "step": 818 + }, + { + "epoch": 0.2214710654407788, + "grad_norm": 1.6703805923461914, + "learning_rate": 1.999045130143086e-05, + "loss": 2.7526, + "mean_token_accuracy": 0.4372915029525757, + "num_tokens": 417734077.0, + "step": 819 + }, + { + "epoch": 0.22174148188209844, + "grad_norm": 3.684884548187256, + "learning_rate": 1.9990378562669998e-05, + "loss": 2.5039, + "mean_token_accuracy": 0.5065629482269287, + "num_tokens": 418258307.0, + "step": 820 + }, + { + "epoch": 0.22201189832341806, + "grad_norm": 1.087341070175171, + "learning_rate": 1.9990305548057877e-05, + "loss": 1.2033, + "mean_token_accuracy": 0.6810636520385742, + "num_tokens": 418776523.0, + "step": 821 + }, + { + "epoch": 0.2222823147647377, + "grad_norm": 3.6094741821289062, + "learning_rate": 1.9990232257596747e-05, + "loss": 2.6708, + "mean_token_accuracy": 0.4268677234649658, + "num_tokens": 419300797.0, + "step": 822 + }, + { + "epoch": 0.22255273120605734, + "grad_norm": 2.184624671936035, + "learning_rate": 1.9990158691288852e-05, + "loss": 2.7817, + "mean_token_accuracy": 0.4387540817260742, + "num_tokens": 419824994.0, + "step": 823 + }, + { + "epoch": 0.22282314764737696, + "grad_norm": 1.7381882667541504, + "learning_rate": 1.999008484913645e-05, + "loss": 2.6706, + "mean_token_accuracy": 0.4585689306259155, + "num_tokens": 420349276.0, + "step": 824 + }, + { + "epoch": 0.2230935640886966, + "grad_norm": 1.634210467338562, + "learning_rate": 1.9990010731141805e-05, + "loss": 2.6787, + "mean_token_accuracy": 0.44350022077560425, + "num_tokens": 420873557.0, + "step": 825 + }, + { + "epoch": 0.22336398053001622, + "grad_norm": 1.826582908630371, + "learning_rate": 1.9989936337307197e-05, + "loss": 2.6739, + "mean_token_accuracy": 0.4558086693286896, + "num_tokens": 421397620.0, + "step": 826 + }, + { + "epoch": 0.22363439697133586, + "grad_norm": 1.8245660066604614, + "learning_rate": 1.9989861667634903e-05, + "loss": 2.6458, + "mean_token_accuracy": 0.45945286750793457, + "num_tokens": 421921830.0, + "step": 827 + }, + { + "epoch": 0.2239048134126555, + "grad_norm": 1.851115345954895, + "learning_rate": 1.9989786722127214e-05, + "loss": 2.5079, + "mean_token_accuracy": 0.465953528881073, + "num_tokens": 422446018.0, + "step": 828 + }, + { + "epoch": 0.22417522985397512, + "grad_norm": 1.8078460693359375, + "learning_rate": 1.998971150078643e-05, + "loss": 2.7502, + "mean_token_accuracy": 0.4424861967563629, + "num_tokens": 422970184.0, + "step": 829 + }, + { + "epoch": 0.22444564629529476, + "grad_norm": 1.867891550064087, + "learning_rate": 1.9989636003614863e-05, + "loss": 2.6058, + "mean_token_accuracy": 0.45700308680534363, + "num_tokens": 423494371.0, + "step": 830 + }, + { + "epoch": 0.22471606273661437, + "grad_norm": 1.7266175746917725, + "learning_rate": 1.9989560230614823e-05, + "loss": 2.653, + "mean_token_accuracy": 0.4482629895210266, + "num_tokens": 424018579.0, + "step": 831 + }, + { + "epoch": 0.22498647917793402, + "grad_norm": 1.9126136302947998, + "learning_rate": 1.9989484181788644e-05, + "loss": 2.6055, + "mean_token_accuracy": 0.46678584814071655, + "num_tokens": 424542734.0, + "step": 832 + }, + { + "epoch": 0.22525689561925366, + "grad_norm": 1.9482442140579224, + "learning_rate": 1.998940785713865e-05, + "loss": 2.6019, + "mean_token_accuracy": 0.4554755389690399, + "num_tokens": 425022258.0, + "step": 833 + }, + { + "epoch": 0.22552731206057328, + "grad_norm": 1.7219030857086182, + "learning_rate": 1.998933125666719e-05, + "loss": 2.8076, + "mean_token_accuracy": 0.43544769287109375, + "num_tokens": 425535792.0, + "step": 834 + }, + { + "epoch": 0.22579772850189292, + "grad_norm": 2.4900243282318115, + "learning_rate": 1.9989254380376607e-05, + "loss": 2.8347, + "mean_token_accuracy": 0.4320082664489746, + "num_tokens": 426060073.0, + "step": 835 + }, + { + "epoch": 0.22606814494321253, + "grad_norm": 2.2718405723571777, + "learning_rate": 1.9989177228269266e-05, + "loss": 2.6475, + "mean_token_accuracy": 0.4479007124900818, + "num_tokens": 426552554.0, + "step": 836 + }, + { + "epoch": 0.22633856138453218, + "grad_norm": 2.2374181747436523, + "learning_rate": 1.9989099800347532e-05, + "loss": 2.5537, + "mean_token_accuracy": 0.4705473780632019, + "num_tokens": 427076740.0, + "step": 837 + }, + { + "epoch": 0.22660897782585182, + "grad_norm": 3.4053077697753906, + "learning_rate": 1.9989022096613782e-05, + "loss": 2.884, + "mean_token_accuracy": 0.42146843671798706, + "num_tokens": 427600927.0, + "step": 838 + }, + { + "epoch": 0.22687939426717144, + "grad_norm": 1.6166375875473022, + "learning_rate": 1.9988944117070397e-05, + "loss": 2.4143, + "mean_token_accuracy": 0.4836856424808502, + "num_tokens": 428125113.0, + "step": 839 + }, + { + "epoch": 0.22714981070849108, + "grad_norm": 1.789192795753479, + "learning_rate": 1.9988865861719775e-05, + "loss": 2.8054, + "mean_token_accuracy": 0.4280213713645935, + "num_tokens": 428649243.0, + "step": 840 + }, + { + "epoch": 0.22742022714981072, + "grad_norm": 1.1102715730667114, + "learning_rate": 1.9988787330564313e-05, + "loss": 1.2099, + "mean_token_accuracy": 0.6792304515838623, + "num_tokens": 429161539.0, + "step": 841 + }, + { + "epoch": 0.22769064359113034, + "grad_norm": 3.1645901203155518, + "learning_rate": 1.9988708523606416e-05, + "loss": 2.7582, + "mean_token_accuracy": 0.4531148672103882, + "num_tokens": 429685772.0, + "step": 842 + }, + { + "epoch": 0.22796106003244998, + "grad_norm": 2.307638168334961, + "learning_rate": 1.9988629440848516e-05, + "loss": 2.7385, + "mean_token_accuracy": 0.46856796741485596, + "num_tokens": 430153115.0, + "step": 843 + }, + { + "epoch": 0.2282314764737696, + "grad_norm": 2.2754135131835938, + "learning_rate": 1.9988550082293022e-05, + "loss": 2.6444, + "mean_token_accuracy": 0.4476141929626465, + "num_tokens": 430677328.0, + "step": 844 + }, + { + "epoch": 0.22850189291508924, + "grad_norm": 2.1037039756774902, + "learning_rate": 1.9988470447942384e-05, + "loss": 2.7167, + "mean_token_accuracy": 0.44791150093078613, + "num_tokens": 431201461.0, + "step": 845 + }, + { + "epoch": 0.22877230935640888, + "grad_norm": 1.8858458995819092, + "learning_rate": 1.998839053779904e-05, + "loss": 2.5437, + "mean_token_accuracy": 0.47045987844467163, + "num_tokens": 431725736.0, + "step": 846 + }, + { + "epoch": 0.2290427257977285, + "grad_norm": 1.93348228931427, + "learning_rate": 1.998831035186544e-05, + "loss": 2.5463, + "mean_token_accuracy": 0.47871071100234985, + "num_tokens": 432216704.0, + "step": 847 + }, + { + "epoch": 0.22931314223904814, + "grad_norm": 2.041050910949707, + "learning_rate": 1.998822989014404e-05, + "loss": 2.8799, + "mean_token_accuracy": 0.43311241269111633, + "num_tokens": 432736816.0, + "step": 848 + }, + { + "epoch": 0.22958355868036776, + "grad_norm": 2.011937141418457, + "learning_rate": 1.998814915263732e-05, + "loss": 2.8345, + "mean_token_accuracy": 0.43553340435028076, + "num_tokens": 433261096.0, + "step": 849 + }, + { + "epoch": 0.2298539751216874, + "grad_norm": 1.7769407033920288, + "learning_rate": 1.998806813934775e-05, + "loss": 2.6368, + "mean_token_accuracy": 0.45584678649902344, + "num_tokens": 433785322.0, + "step": 850 + }, + { + "epoch": 0.23012439156300704, + "grad_norm": 1.9695051908493042, + "learning_rate": 1.9987986850277816e-05, + "loss": 2.6794, + "mean_token_accuracy": 0.4607081413269043, + "num_tokens": 434268573.0, + "step": 851 + }, + { + "epoch": 0.23039480800432666, + "grad_norm": 1.679079294204712, + "learning_rate": 1.9987905285430018e-05, + "loss": 2.6935, + "mean_token_accuracy": 0.4511350393295288, + "num_tokens": 434792790.0, + "step": 852 + }, + { + "epoch": 0.2306652244456463, + "grad_norm": 2.0410733222961426, + "learning_rate": 1.9987823444806848e-05, + "loss": 2.5814, + "mean_token_accuracy": 0.4616091251373291, + "num_tokens": 435306111.0, + "step": 853 + }, + { + "epoch": 0.23093564088696591, + "grad_norm": 2.217578887939453, + "learning_rate": 1.9987741328410825e-05, + "loss": 2.7281, + "mean_token_accuracy": 0.4701712727546692, + "num_tokens": 435778422.0, + "step": 854 + }, + { + "epoch": 0.23120605732828556, + "grad_norm": 1.916583776473999, + "learning_rate": 1.998765893624447e-05, + "loss": 2.533, + "mean_token_accuracy": 0.4809684157371521, + "num_tokens": 436302676.0, + "step": 855 + }, + { + "epoch": 0.2314764737696052, + "grad_norm": 2.6210174560546875, + "learning_rate": 1.9987576268310306e-05, + "loss": 2.7316, + "mean_token_accuracy": 0.4596853256225586, + "num_tokens": 436826687.0, + "step": 856 + }, + { + "epoch": 0.23174689021092482, + "grad_norm": 1.9540692567825317, + "learning_rate": 1.9987493324610868e-05, + "loss": 2.6916, + "mean_token_accuracy": 0.4454384446144104, + "num_tokens": 437340397.0, + "step": 857 + }, + { + "epoch": 0.23201730665224446, + "grad_norm": 1.8583298921585083, + "learning_rate": 1.9987410105148714e-05, + "loss": 2.8093, + "mean_token_accuracy": 0.4514332413673401, + "num_tokens": 437864630.0, + "step": 858 + }, + { + "epoch": 0.2322877230935641, + "grad_norm": 2.2258613109588623, + "learning_rate": 1.9987326609926377e-05, + "loss": 2.7251, + "mean_token_accuracy": 0.442361056804657, + "num_tokens": 438388913.0, + "step": 859 + }, + { + "epoch": 0.23255813953488372, + "grad_norm": 85.59123992919922, + "learning_rate": 1.9987242838946438e-05, + "loss": 2.9827, + "mean_token_accuracy": 0.43758440017700195, + "num_tokens": 438913193.0, + "step": 860 + }, + { + "epoch": 0.23282855597620336, + "grad_norm": 1.1265960931777954, + "learning_rate": 1.9987158792211453e-05, + "loss": 1.2754, + "mean_token_accuracy": 0.6809484958648682, + "num_tokens": 439371856.0, + "step": 861 + }, + { + "epoch": 0.23309897241752298, + "grad_norm": 5.774274826049805, + "learning_rate": 1.9987074469724007e-05, + "loss": 2.6809, + "mean_token_accuracy": 0.4678001403808594, + "num_tokens": 439896006.0, + "step": 862 + }, + { + "epoch": 0.23336938885884262, + "grad_norm": 4.6882643699646, + "learning_rate": 1.998698987148669e-05, + "loss": 2.6496, + "mean_token_accuracy": 0.46881067752838135, + "num_tokens": 440396580.0, + "step": 863 + }, + { + "epoch": 0.23363980530016226, + "grad_norm": 2.518427848815918, + "learning_rate": 1.9986904997502093e-05, + "loss": 2.5764, + "mean_token_accuracy": 0.46336629986763, + "num_tokens": 440920731.0, + "step": 864 + }, + { + "epoch": 0.23391022174148188, + "grad_norm": 3.5697174072265625, + "learning_rate": 1.9986819847772822e-05, + "loss": 2.8577, + "mean_token_accuracy": 0.44215869903564453, + "num_tokens": 441444856.0, + "step": 865 + }, + { + "epoch": 0.23418063818280152, + "grad_norm": 2.3688042163848877, + "learning_rate": 1.9986734422301492e-05, + "loss": 2.7995, + "mean_token_accuracy": 0.4492499828338623, + "num_tokens": 441969080.0, + "step": 866 + }, + { + "epoch": 0.23445105462412114, + "grad_norm": 2.62833571434021, + "learning_rate": 1.998664872109072e-05, + "loss": 2.5414, + "mean_token_accuracy": 0.4619324803352356, + "num_tokens": 442493290.0, + "step": 867 + }, + { + "epoch": 0.23472147106544078, + "grad_norm": 2.5166726112365723, + "learning_rate": 1.9986562744143136e-05, + "loss": 2.5685, + "mean_token_accuracy": 0.4569992423057556, + "num_tokens": 443017355.0, + "step": 868 + }, + { + "epoch": 0.23499188750676042, + "grad_norm": 2.508923292160034, + "learning_rate": 1.9986476491461378e-05, + "loss": 2.6875, + "mean_token_accuracy": 0.4631533920764923, + "num_tokens": 443541458.0, + "step": 869 + }, + { + "epoch": 0.23526230394808004, + "grad_norm": 2.2810239791870117, + "learning_rate": 1.9986389963048098e-05, + "loss": 2.5359, + "mean_token_accuracy": 0.46621230244636536, + "num_tokens": 444065633.0, + "step": 870 + }, + { + "epoch": 0.23553272038939968, + "grad_norm": 2.262258291244507, + "learning_rate": 1.9986303158905943e-05, + "loss": 2.6364, + "mean_token_accuracy": 0.47039633989334106, + "num_tokens": 444568859.0, + "step": 871 + }, + { + "epoch": 0.2358031368307193, + "grad_norm": 2.3380367755889893, + "learning_rate": 1.9986216079037582e-05, + "loss": 2.4399, + "mean_token_accuracy": 0.46744781732559204, + "num_tokens": 445093126.0, + "step": 872 + }, + { + "epoch": 0.23607355327203894, + "grad_norm": 3.1221084594726562, + "learning_rate": 1.9986128723445685e-05, + "loss": 2.8216, + "mean_token_accuracy": 0.437052845954895, + "num_tokens": 445617408.0, + "step": 873 + }, + { + "epoch": 0.23634396971335858, + "grad_norm": 1.9178152084350586, + "learning_rate": 1.998604109213293e-05, + "loss": 2.6436, + "mean_token_accuracy": 0.4705348014831543, + "num_tokens": 446141579.0, + "step": 874 + }, + { + "epoch": 0.2366143861546782, + "grad_norm": 1.8044297695159912, + "learning_rate": 1.998595318510201e-05, + "loss": 2.577, + "mean_token_accuracy": 0.4600541293621063, + "num_tokens": 446665789.0, + "step": 875 + }, + { + "epoch": 0.23688480259599784, + "grad_norm": 2.131686210632324, + "learning_rate": 1.998586500235562e-05, + "loss": 2.4867, + "mean_token_accuracy": 0.4792397618293762, + "num_tokens": 447190010.0, + "step": 876 + }, + { + "epoch": 0.23715521903731746, + "grad_norm": 1.9737498760223389, + "learning_rate": 1.9985776543896466e-05, + "loss": 2.8542, + "mean_token_accuracy": 0.4349603056907654, + "num_tokens": 447714269.0, + "step": 877 + }, + { + "epoch": 0.2374256354786371, + "grad_norm": 2.5243570804595947, + "learning_rate": 1.9985687809727263e-05, + "loss": 2.7783, + "mean_token_accuracy": 0.43036648631095886, + "num_tokens": 448238529.0, + "step": 878 + }, + { + "epoch": 0.23769605191995674, + "grad_norm": 44.2614631652832, + "learning_rate": 1.998559879985073e-05, + "loss": 2.4075, + "mean_token_accuracy": 0.4478413164615631, + "num_tokens": 448719753.0, + "step": 879 + }, + { + "epoch": 0.23796646836127636, + "grad_norm": 3.0626590251922607, + "learning_rate": 1.9985509514269608e-05, + "loss": 2.5548, + "mean_token_accuracy": 0.44052693247795105, + "num_tokens": 449243934.0, + "step": 880 + }, + { + "epoch": 0.238236884802596, + "grad_norm": 1.3035176992416382, + "learning_rate": 1.998541995298662e-05, + "loss": 1.2326, + "mean_token_accuracy": 0.6614270210266113, + "num_tokens": 449768191.0, + "step": 881 + }, + { + "epoch": 0.23850730124391564, + "grad_norm": 6.043679714202881, + "learning_rate": 1.998533011600453e-05, + "loss": 2.3131, + "mean_token_accuracy": 0.5213392972946167, + "num_tokens": 450292445.0, + "step": 882 + }, + { + "epoch": 0.23877771768523526, + "grad_norm": 2.508270502090454, + "learning_rate": 1.9985240003326084e-05, + "loss": 2.6199, + "mean_token_accuracy": 0.4676879048347473, + "num_tokens": 450796681.0, + "step": 883 + }, + { + "epoch": 0.2390481341265549, + "grad_norm": 2.0921716690063477, + "learning_rate": 1.9985149614954055e-05, + "loss": 2.6106, + "mean_token_accuracy": 0.4514000415802002, + "num_tokens": 451320861.0, + "step": 884 + }, + { + "epoch": 0.23931855056787452, + "grad_norm": 1.6244791746139526, + "learning_rate": 1.998505895089121e-05, + "loss": 2.5887, + "mean_token_accuracy": 0.447795569896698, + "num_tokens": 451845121.0, + "step": 885 + }, + { + "epoch": 0.23958896700919416, + "grad_norm": 2.0475974082946777, + "learning_rate": 1.998496801114033e-05, + "loss": 2.7081, + "mean_token_accuracy": 0.4452309310436249, + "num_tokens": 452369297.0, + "step": 886 + }, + { + "epoch": 0.2398593834505138, + "grad_norm": 1.8014185428619385, + "learning_rate": 1.9984876795704214e-05, + "loss": 2.6519, + "mean_token_accuracy": 0.4360090494155884, + "num_tokens": 452893559.0, + "step": 887 + }, + { + "epoch": 0.24012979989183342, + "grad_norm": 2.042012929916382, + "learning_rate": 1.9984785304585653e-05, + "loss": 2.6632, + "mean_token_accuracy": 0.44391927123069763, + "num_tokens": 453417602.0, + "step": 888 + }, + { + "epoch": 0.24040021633315306, + "grad_norm": 1.8516314029693604, + "learning_rate": 1.9984693537787455e-05, + "loss": 2.5939, + "mean_token_accuracy": 0.4475914537906647, + "num_tokens": 453941802.0, + "step": 889 + }, + { + "epoch": 0.24067063277447268, + "grad_norm": 1.9155441522598267, + "learning_rate": 1.9984601495312437e-05, + "loss": 2.6843, + "mean_token_accuracy": 0.44997093081474304, + "num_tokens": 454466071.0, + "step": 890 + }, + { + "epoch": 0.24094104921579232, + "grad_norm": 1.831027865409851, + "learning_rate": 1.9984509177163425e-05, + "loss": 2.5415, + "mean_token_accuracy": 0.4572405219078064, + "num_tokens": 454953343.0, + "step": 891 + }, + { + "epoch": 0.24121146565711196, + "grad_norm": 2.347990036010742, + "learning_rate": 1.9984416583343248e-05, + "loss": 2.7174, + "mean_token_accuracy": 0.4512680172920227, + "num_tokens": 455477593.0, + "step": 892 + }, + { + "epoch": 0.24148188209843158, + "grad_norm": 1.9010686874389648, + "learning_rate": 1.9984323713854747e-05, + "loss": 2.7177, + "mean_token_accuracy": 0.44231927394866943, + "num_tokens": 456001755.0, + "step": 893 + }, + { + "epoch": 0.24175229853975122, + "grad_norm": 2.007716417312622, + "learning_rate": 1.9984230568700777e-05, + "loss": 2.5542, + "mean_token_accuracy": 0.46758225560188293, + "num_tokens": 456526028.0, + "step": 894 + }, + { + "epoch": 0.24202271498107084, + "grad_norm": 1.7640674114227295, + "learning_rate": 1.998413714788419e-05, + "loss": 2.5676, + "mean_token_accuracy": 0.471319317817688, + "num_tokens": 457050235.0, + "step": 895 + }, + { + "epoch": 0.24229313142239048, + "grad_norm": 2.1494646072387695, + "learning_rate": 1.9984043451407857e-05, + "loss": 2.836, + "mean_token_accuracy": 0.44467467069625854, + "num_tokens": 457574441.0, + "step": 896 + }, + { + "epoch": 0.24256354786371012, + "grad_norm": 1.9471776485443115, + "learning_rate": 1.9983949479274648e-05, + "loss": 2.4776, + "mean_token_accuracy": 0.4628407955169678, + "num_tokens": 458098564.0, + "step": 897 + }, + { + "epoch": 0.24283396430502974, + "grad_norm": 1.738305926322937, + "learning_rate": 1.998385523148745e-05, + "loss": 2.5034, + "mean_token_accuracy": 0.45982345938682556, + "num_tokens": 458622668.0, + "step": 898 + }, + { + "epoch": 0.24310438074634938, + "grad_norm": 1.7003246545791626, + "learning_rate": 1.9983760708049153e-05, + "loss": 2.5543, + "mean_token_accuracy": 0.45534592866897583, + "num_tokens": 459146807.0, + "step": 899 + }, + { + "epoch": 0.24337479718766902, + "grad_norm": 1.4998327493667603, + "learning_rate": 1.998366590896266e-05, + "loss": 2.5861, + "mean_token_accuracy": 0.45448732376098633, + "num_tokens": 459671085.0, + "step": 900 + }, + { + "epoch": 0.24364521362898864, + "grad_norm": 0.9975938200950623, + "learning_rate": 1.9983570834230875e-05, + "loss": 1.196, + "mean_token_accuracy": 0.6822232007980347, + "num_tokens": 460195306.0, + "step": 901 + }, + { + "epoch": 0.24391563007030828, + "grad_norm": 2.5244433879852295, + "learning_rate": 1.9983475483856723e-05, + "loss": 2.7296, + "mean_token_accuracy": 0.42708101868629456, + "num_tokens": 460719524.0, + "step": 902 + }, + { + "epoch": 0.2441860465116279, + "grad_norm": 2.06477689743042, + "learning_rate": 1.998337985784312e-05, + "loss": 2.7442, + "mean_token_accuracy": 0.45235756039619446, + "num_tokens": 461213742.0, + "step": 903 + }, + { + "epoch": 0.24445646295294754, + "grad_norm": 1.4497275352478027, + "learning_rate": 1.9983283956193006e-05, + "loss": 2.7245, + "mean_token_accuracy": 0.44816136360168457, + "num_tokens": 461737829.0, + "step": 904 + }, + { + "epoch": 0.24472687939426718, + "grad_norm": 1.8870981931686401, + "learning_rate": 1.998318777890932e-05, + "loss": 2.6358, + "mean_token_accuracy": 0.4724631607532501, + "num_tokens": 462202844.0, + "step": 905 + }, + { + "epoch": 0.2449972958355868, + "grad_norm": 1.742565393447876, + "learning_rate": 1.9983091325995016e-05, + "loss": 2.4648, + "mean_token_accuracy": 0.4916246831417084, + "num_tokens": 462666880.0, + "step": 906 + }, + { + "epoch": 0.24526771227690644, + "grad_norm": 1.7766884565353394, + "learning_rate": 1.998299459745305e-05, + "loss": 2.6239, + "mean_token_accuracy": 0.4453725814819336, + "num_tokens": 463191125.0, + "step": 907 + }, + { + "epoch": 0.24553812871822606, + "grad_norm": 1.7542632818222046, + "learning_rate": 1.9982897593286396e-05, + "loss": 2.8795, + "mean_token_accuracy": 0.4410553574562073, + "num_tokens": 463689978.0, + "step": 908 + }, + { + "epoch": 0.2458085451595457, + "grad_norm": 1.5954444408416748, + "learning_rate": 1.998280031349803e-05, + "loss": 2.6436, + "mean_token_accuracy": 0.46017545461654663, + "num_tokens": 464213984.0, + "step": 909 + }, + { + "epoch": 0.24607896160086534, + "grad_norm": 1.8310974836349487, + "learning_rate": 1.9982702758090927e-05, + "loss": 2.545, + "mean_token_accuracy": 0.4814651608467102, + "num_tokens": 464738214.0, + "step": 910 + }, + { + "epoch": 0.24634937804218496, + "grad_norm": 1.6194634437561035, + "learning_rate": 1.998260492706809e-05, + "loss": 2.738, + "mean_token_accuracy": 0.438972145318985, + "num_tokens": 465262448.0, + "step": 911 + }, + { + "epoch": 0.2466197944835046, + "grad_norm": 2.009763240814209, + "learning_rate": 1.9982506820432518e-05, + "loss": 2.6789, + "mean_token_accuracy": 0.45177406072616577, + "num_tokens": 465786636.0, + "step": 912 + }, + { + "epoch": 0.24689021092482422, + "grad_norm": 1.9541044235229492, + "learning_rate": 1.9982408438187225e-05, + "loss": 2.6059, + "mean_token_accuracy": 0.45267772674560547, + "num_tokens": 466310746.0, + "step": 913 + }, + { + "epoch": 0.24716062736614386, + "grad_norm": 1.700427770614624, + "learning_rate": 1.998230978033522e-05, + "loss": 2.6246, + "mean_token_accuracy": 0.4686661660671234, + "num_tokens": 466834879.0, + "step": 914 + }, + { + "epoch": 0.2474310438074635, + "grad_norm": 2.106163501739502, + "learning_rate": 1.9982210846879537e-05, + "loss": 2.4479, + "mean_token_accuracy": 0.4846189022064209, + "num_tokens": 467359145.0, + "step": 915 + }, + { + "epoch": 0.24770146024878312, + "grad_norm": 2.1235766410827637, + "learning_rate": 1.998211163782321e-05, + "loss": 2.7063, + "mean_token_accuracy": 0.44892945885658264, + "num_tokens": 467883282.0, + "step": 916 + }, + { + "epoch": 0.24797187669010276, + "grad_norm": 1.5589848756790161, + "learning_rate": 1.9982012153169282e-05, + "loss": 2.6028, + "mean_token_accuracy": 0.46353888511657715, + "num_tokens": 468407380.0, + "step": 917 + }, + { + "epoch": 0.2482422931314224, + "grad_norm": 1.8016260862350464, + "learning_rate": 1.998191239292081e-05, + "loss": 2.6813, + "mean_token_accuracy": 0.44667524099349976, + "num_tokens": 468911990.0, + "step": 918 + }, + { + "epoch": 0.24851270957274202, + "grad_norm": 1.766422986984253, + "learning_rate": 1.998181235708085e-05, + "loss": 2.7243, + "mean_token_accuracy": 0.4517380893230438, + "num_tokens": 469369094.0, + "step": 919 + }, + { + "epoch": 0.24878312601406166, + "grad_norm": 1.7802973985671997, + "learning_rate": 1.998171204565247e-05, + "loss": 2.7876, + "mean_token_accuracy": 0.4447394609451294, + "num_tokens": 469893298.0, + "step": 920 + }, + { + "epoch": 0.24905354245538128, + "grad_norm": 1.2343933582305908, + "learning_rate": 1.9981611458638754e-05, + "loss": 1.2051, + "mean_token_accuracy": 0.675630509853363, + "num_tokens": 470362529.0, + "step": 921 + }, + { + "epoch": 0.24932395889670092, + "grad_norm": 3.051380157470703, + "learning_rate": 1.9981510596042786e-05, + "loss": 2.7023, + "mean_token_accuracy": 0.45257169008255005, + "num_tokens": 470883173.0, + "step": 922 + }, + { + "epoch": 0.24959437533802056, + "grad_norm": 2.267025947570801, + "learning_rate": 1.9981409457867657e-05, + "loss": 2.6726, + "mean_token_accuracy": 0.44953858852386475, + "num_tokens": 471407440.0, + "step": 923 + }, + { + "epoch": 0.24986479177934018, + "grad_norm": 1.6411645412445068, + "learning_rate": 1.9981308044116478e-05, + "loss": 2.6575, + "mean_token_accuracy": 0.4521993398666382, + "num_tokens": 471931658.0, + "step": 924 + }, + { + "epoch": 0.2501352082206598, + "grad_norm": 2.4182703495025635, + "learning_rate": 1.9981206354792355e-05, + "loss": 2.7947, + "mean_token_accuracy": 0.4308946430683136, + "num_tokens": 472433293.0, + "step": 925 + }, + { + "epoch": 0.25040562466197946, + "grad_norm": 2.3167848587036133, + "learning_rate": 1.998110438989841e-05, + "loss": 2.6185, + "mean_token_accuracy": 0.4779600203037262, + "num_tokens": 472957448.0, + "step": 926 + }, + { + "epoch": 0.2506760411032991, + "grad_norm": 1.9927366971969604, + "learning_rate": 1.9981002149437763e-05, + "loss": 2.6793, + "mean_token_accuracy": 0.44401347637176514, + "num_tokens": 473481685.0, + "step": 927 + }, + { + "epoch": 0.2509464575446187, + "grad_norm": 2.3096165657043457, + "learning_rate": 1.9980899633413563e-05, + "loss": 2.5434, + "mean_token_accuracy": 0.47559577226638794, + "num_tokens": 473967590.0, + "step": 928 + }, + { + "epoch": 0.25121687398593834, + "grad_norm": 1.985356092453003, + "learning_rate": 1.9980796841828952e-05, + "loss": 2.6601, + "mean_token_accuracy": 0.4665148854255676, + "num_tokens": 474484083.0, + "step": 929 + }, + { + "epoch": 0.251487290427258, + "grad_norm": 1.939862847328186, + "learning_rate": 1.998069377468708e-05, + "loss": 2.7646, + "mean_token_accuracy": 0.44461530447006226, + "num_tokens": 475008346.0, + "step": 930 + }, + { + "epoch": 0.2517577068685776, + "grad_norm": 4.031675815582275, + "learning_rate": 1.998059043199112e-05, + "loss": 2.3735, + "mean_token_accuracy": 0.4918934106826782, + "num_tokens": 475532622.0, + "step": 931 + }, + { + "epoch": 0.25202812330989727, + "grad_norm": 2.118455171585083, + "learning_rate": 1.9980486813744232e-05, + "loss": 2.656, + "mean_token_accuracy": 0.4601508378982544, + "num_tokens": 476056863.0, + "step": 932 + }, + { + "epoch": 0.25229853975121685, + "grad_norm": 2.232701063156128, + "learning_rate": 1.9980382919949596e-05, + "loss": 2.6695, + "mean_token_accuracy": 0.44597327709198, + "num_tokens": 476581094.0, + "step": 933 + }, + { + "epoch": 0.2525689561925365, + "grad_norm": 1.4290095567703247, + "learning_rate": 1.9980278750610402e-05, + "loss": 2.4671, + "mean_token_accuracy": 0.4662473797798157, + "num_tokens": 477105355.0, + "step": 934 + }, + { + "epoch": 0.25283937263385614, + "grad_norm": 3.1996827125549316, + "learning_rate": 1.998017430572985e-05, + "loss": 2.1063, + "mean_token_accuracy": 0.5565462112426758, + "num_tokens": 477629471.0, + "step": 935 + }, + { + "epoch": 0.2531097890751758, + "grad_norm": 1.8937146663665771, + "learning_rate": 1.9980069585311138e-05, + "loss": 2.4355, + "mean_token_accuracy": 0.47087597846984863, + "num_tokens": 478130113.0, + "step": 936 + }, + { + "epoch": 0.2533802055164954, + "grad_norm": 1.5288128852844238, + "learning_rate": 1.9979964589357485e-05, + "loss": 2.393, + "mean_token_accuracy": 0.4734853506088257, + "num_tokens": 478654368.0, + "step": 937 + }, + { + "epoch": 0.253650621957815, + "grad_norm": 1.5987998247146606, + "learning_rate": 1.997985931787211e-05, + "loss": 2.5403, + "mean_token_accuracy": 0.4645729660987854, + "num_tokens": 479178583.0, + "step": 938 + }, + { + "epoch": 0.25392103839913466, + "grad_norm": 1.9111663103103638, + "learning_rate": 1.9979753770858244e-05, + "loss": 2.5699, + "mean_token_accuracy": 0.46674013137817383, + "num_tokens": 479694970.0, + "step": 939 + }, + { + "epoch": 0.2541914548404543, + "grad_norm": 1.5882841348648071, + "learning_rate": 1.997964794831912e-05, + "loss": 2.5348, + "mean_token_accuracy": 0.4745188057422638, + "num_tokens": 480178994.0, + "step": 940 + }, + { + "epoch": 0.25446187128177394, + "grad_norm": 0.9410752654075623, + "learning_rate": 1.9979541850257994e-05, + "loss": 1.1606, + "mean_token_accuracy": 0.6913313269615173, + "num_tokens": 480703274.0, + "step": 941 + }, + { + "epoch": 0.2547322877230936, + "grad_norm": 2.406050205230713, + "learning_rate": 1.9979435476678114e-05, + "loss": 2.4699, + "mean_token_accuracy": 0.47015827894210815, + "num_tokens": 481227411.0, + "step": 942 + }, + { + "epoch": 0.2550027041644132, + "grad_norm": 1.941379427909851, + "learning_rate": 1.997932882758275e-05, + "loss": 2.5945, + "mean_token_accuracy": 0.45995867252349854, + "num_tokens": 481751518.0, + "step": 943 + }, + { + "epoch": 0.2552731206057328, + "grad_norm": 2.1860077381134033, + "learning_rate": 1.9979221902975168e-05, + "loss": 2.7592, + "mean_token_accuracy": 0.44125330448150635, + "num_tokens": 482232944.0, + "step": 944 + }, + { + "epoch": 0.25554353704705246, + "grad_norm": 2.3211987018585205, + "learning_rate": 1.9979114702858654e-05, + "loss": 2.6528, + "mean_token_accuracy": 0.44337645173072815, + "num_tokens": 482756913.0, + "step": 945 + }, + { + "epoch": 0.2558139534883721, + "grad_norm": 1.62779700756073, + "learning_rate": 1.9979007227236492e-05, + "loss": 2.6651, + "mean_token_accuracy": 0.42833954095840454, + "num_tokens": 483281168.0, + "step": 946 + }, + { + "epoch": 0.25608436992969175, + "grad_norm": 2.278618335723877, + "learning_rate": 1.9978899476111985e-05, + "loss": 2.5611, + "mean_token_accuracy": 0.45512813329696655, + "num_tokens": 483805430.0, + "step": 947 + }, + { + "epoch": 0.25635478637101133, + "grad_norm": 1.5495750904083252, + "learning_rate": 1.9978791449488436e-05, + "loss": 2.4802, + "mean_token_accuracy": 0.4825204014778137, + "num_tokens": 484272301.0, + "step": 948 + }, + { + "epoch": 0.256625202812331, + "grad_norm": 2.168520450592041, + "learning_rate": 1.9978683147369163e-05, + "loss": 2.5626, + "mean_token_accuracy": 0.46743395924568176, + "num_tokens": 484796526.0, + "step": 949 + }, + { + "epoch": 0.2568956192536506, + "grad_norm": 2.340773344039917, + "learning_rate": 1.9978574569757485e-05, + "loss": 2.6487, + "mean_token_accuracy": 0.45388755202293396, + "num_tokens": 485320754.0, + "step": 950 + }, + { + "epoch": 0.25716603569497026, + "grad_norm": 2.2259809970855713, + "learning_rate": 1.9978465716656734e-05, + "loss": 2.6424, + "mean_token_accuracy": 0.4542964696884155, + "num_tokens": 485845024.0, + "step": 951 + }, + { + "epoch": 0.2574364521362899, + "grad_norm": 1.8448201417922974, + "learning_rate": 1.997835658807025e-05, + "loss": 2.6897, + "mean_token_accuracy": 0.46462106704711914, + "num_tokens": 486309674.0, + "step": 952 + }, + { + "epoch": 0.2577068685776095, + "grad_norm": 2.115607738494873, + "learning_rate": 1.9978247184001384e-05, + "loss": 2.576, + "mean_token_accuracy": 0.4494525194168091, + "num_tokens": 486833962.0, + "step": 953 + }, + { + "epoch": 0.25797728501892914, + "grad_norm": 2.2491037845611572, + "learning_rate": 1.9978137504453493e-05, + "loss": 2.4606, + "mean_token_accuracy": 0.4741487503051758, + "num_tokens": 487358151.0, + "step": 954 + }, + { + "epoch": 0.2582477014602488, + "grad_norm": 2.439286708831787, + "learning_rate": 1.997802754942994e-05, + "loss": 2.6129, + "mean_token_accuracy": 0.43864506483078003, + "num_tokens": 487882414.0, + "step": 955 + }, + { + "epoch": 0.2585181179015684, + "grad_norm": 2.1376163959503174, + "learning_rate": 1.9977917318934095e-05, + "loss": 2.5129, + "mean_token_accuracy": 0.47418731451034546, + "num_tokens": 488406494.0, + "step": 956 + }, + { + "epoch": 0.25878853434288807, + "grad_norm": 1.621968388557434, + "learning_rate": 1.9977806812969345e-05, + "loss": 2.7417, + "mean_token_accuracy": 0.4491135776042938, + "num_tokens": 488930552.0, + "step": 957 + }, + { + "epoch": 0.25905895078420765, + "grad_norm": 3.2252070903778076, + "learning_rate": 1.9977696031539085e-05, + "loss": 2.4046, + "mean_token_accuracy": 0.4612239599227905, + "num_tokens": 489454645.0, + "step": 958 + }, + { + "epoch": 0.2593293672255273, + "grad_norm": 2.6968955993652344, + "learning_rate": 1.9977584974646704e-05, + "loss": 2.6788, + "mean_token_accuracy": 0.45044049620628357, + "num_tokens": 489978898.0, + "step": 959 + }, + { + "epoch": 0.25959978366684694, + "grad_norm": 2.2294936180114746, + "learning_rate": 1.9977473642295617e-05, + "loss": 2.7307, + "mean_token_accuracy": 0.4386332035064697, + "num_tokens": 490503034.0, + "step": 960 + }, + { + "epoch": 0.2598702001081666, + "grad_norm": 1.7872549295425415, + "learning_rate": 1.9977362034489232e-05, + "loss": 1.2564, + "mean_token_accuracy": 0.6677188873291016, + "num_tokens": 491027305.0, + "step": 961 + }, + { + "epoch": 0.2601406165494862, + "grad_norm": 2.855888605117798, + "learning_rate": 1.997725015123099e-05, + "loss": 2.4084, + "mean_token_accuracy": 0.48225298523902893, + "num_tokens": 491438782.0, + "step": 962 + }, + { + "epoch": 0.26041103299080587, + "grad_norm": 2.831061840057373, + "learning_rate": 1.9977137992524304e-05, + "loss": 2.7108, + "mean_token_accuracy": 0.4576377868652344, + "num_tokens": 491963056.0, + "step": 963 + }, + { + "epoch": 0.26068144943212546, + "grad_norm": 2.2105143070220947, + "learning_rate": 1.9977025558372627e-05, + "loss": 2.5472, + "mean_token_accuracy": 0.4694907069206238, + "num_tokens": 492487317.0, + "step": 964 + }, + { + "epoch": 0.2609518658734451, + "grad_norm": 2.4147608280181885, + "learning_rate": 1.9976912848779405e-05, + "loss": 2.7635, + "mean_token_accuracy": 0.45392102003097534, + "num_tokens": 493011488.0, + "step": 965 + }, + { + "epoch": 0.26122228231476474, + "grad_norm": 2.141981363296509, + "learning_rate": 1.99767998637481e-05, + "loss": 2.6811, + "mean_token_accuracy": 0.45297831296920776, + "num_tokens": 493518605.0, + "step": 966 + }, + { + "epoch": 0.2614926987560844, + "grad_norm": 2.3472652435302734, + "learning_rate": 1.9976686603282177e-05, + "loss": 2.5946, + "mean_token_accuracy": 0.48445552587509155, + "num_tokens": 494042775.0, + "step": 967 + }, + { + "epoch": 0.26176311519740403, + "grad_norm": 1.847085952758789, + "learning_rate": 1.9976573067385107e-05, + "loss": 2.6536, + "mean_token_accuracy": 0.46655040979385376, + "num_tokens": 494566966.0, + "step": 968 + }, + { + "epoch": 0.2620335316387236, + "grad_norm": 2.1933326721191406, + "learning_rate": 1.9976459256060376e-05, + "loss": 2.5759, + "mean_token_accuracy": 0.4601097106933594, + "num_tokens": 495086973.0, + "step": 969 + }, + { + "epoch": 0.26230394808004326, + "grad_norm": 2.0560672283172607, + "learning_rate": 1.997634516931148e-05, + "loss": 2.6138, + "mean_token_accuracy": 0.47058090567588806, + "num_tokens": 495559277.0, + "step": 970 + }, + { + "epoch": 0.2625743645213629, + "grad_norm": 1.6090035438537598, + "learning_rate": 1.9976230807141916e-05, + "loss": 2.4232, + "mean_token_accuracy": 0.46086907386779785, + "num_tokens": 496083544.0, + "step": 971 + }, + { + "epoch": 0.26284478096268254, + "grad_norm": 1.9497624635696411, + "learning_rate": 1.9976116169555193e-05, + "loss": 2.7276, + "mean_token_accuracy": 0.44069135189056396, + "num_tokens": 496607817.0, + "step": 972 + }, + { + "epoch": 0.2631151974040022, + "grad_norm": 1.7387456893920898, + "learning_rate": 1.997600125655483e-05, + "loss": 2.6173, + "mean_token_accuracy": 0.4574123024940491, + "num_tokens": 497132026.0, + "step": 973 + }, + { + "epoch": 0.2633856138453218, + "grad_norm": 1.6770461797714233, + "learning_rate": 1.997588606814435e-05, + "loss": 2.6414, + "mean_token_accuracy": 0.4515104591846466, + "num_tokens": 497656242.0, + "step": 974 + }, + { + "epoch": 0.2636560302866414, + "grad_norm": 1.7029106616973877, + "learning_rate": 1.9975770604327292e-05, + "loss": 2.5629, + "mean_token_accuracy": 0.43443524837493896, + "num_tokens": 498180401.0, + "step": 975 + }, + { + "epoch": 0.26392644672796106, + "grad_norm": 1.992002248764038, + "learning_rate": 1.997565486510719e-05, + "loss": 2.5999, + "mean_token_accuracy": 0.462575763463974, + "num_tokens": 498704623.0, + "step": 976 + }, + { + "epoch": 0.2641968631692807, + "grad_norm": 1.909589409828186, + "learning_rate": 1.9975538850487608e-05, + "loss": 2.6018, + "mean_token_accuracy": 0.4519338011741638, + "num_tokens": 499228903.0, + "step": 977 + }, + { + "epoch": 0.26446727961060035, + "grad_norm": 2.3668644428253174, + "learning_rate": 1.9975422560472095e-05, + "loss": 2.7149, + "mean_token_accuracy": 0.5028050541877747, + "num_tokens": 499642039.0, + "step": 978 + }, + { + "epoch": 0.26473769605191994, + "grad_norm": 2.4175989627838135, + "learning_rate": 1.9975305995064222e-05, + "loss": 2.6973, + "mean_token_accuracy": 0.44207435846328735, + "num_tokens": 500166294.0, + "step": 979 + }, + { + "epoch": 0.2650081124932396, + "grad_norm": 1.7830731868743896, + "learning_rate": 1.9975189154267568e-05, + "loss": 2.455, + "mean_token_accuracy": 0.49545934796333313, + "num_tokens": 500690464.0, + "step": 980 + }, + { + "epoch": 0.2652785289345592, + "grad_norm": 0.8173007965087891, + "learning_rate": 1.9975072038085716e-05, + "loss": 1.0611, + "mean_token_accuracy": 0.7124631404876709, + "num_tokens": 501214701.0, + "step": 981 + }, + { + "epoch": 0.26554894537587886, + "grad_norm": 3.213026285171509, + "learning_rate": 1.997495464652226e-05, + "loss": 2.6, + "mean_token_accuracy": 0.4603886902332306, + "num_tokens": 501738945.0, + "step": 982 + }, + { + "epoch": 0.2658193618171985, + "grad_norm": 2.584104299545288, + "learning_rate": 1.9974836979580802e-05, + "loss": 2.5835, + "mean_token_accuracy": 0.46494361758232117, + "num_tokens": 502263180.0, + "step": 983 + }, + { + "epoch": 0.2660897782585181, + "grad_norm": 1.66461980342865, + "learning_rate": 1.9974719037264953e-05, + "loss": 2.5, + "mean_token_accuracy": 0.49182572960853577, + "num_tokens": 502742436.0, + "step": 984 + }, + { + "epoch": 0.26636019469983774, + "grad_norm": 1.8142132759094238, + "learning_rate": 1.997460081957833e-05, + "loss": 2.6596, + "mean_token_accuracy": 0.4449385404586792, + "num_tokens": 503266672.0, + "step": 985 + }, + { + "epoch": 0.2666306111411574, + "grad_norm": 2.6547489166259766, + "learning_rate": 1.997448232652456e-05, + "loss": 2.6637, + "mean_token_accuracy": 0.4671272039413452, + "num_tokens": 503790878.0, + "step": 986 + }, + { + "epoch": 0.266901027582477, + "grad_norm": 1.9523491859436035, + "learning_rate": 1.997436355810728e-05, + "loss": 2.5449, + "mean_token_accuracy": 0.4628482758998871, + "num_tokens": 504315023.0, + "step": 987 + }, + { + "epoch": 0.26717144402379667, + "grad_norm": 1.9168479442596436, + "learning_rate": 1.997424451433014e-05, + "loss": 2.5573, + "mean_token_accuracy": 0.4640950560569763, + "num_tokens": 504839205.0, + "step": 988 + }, + { + "epoch": 0.26744186046511625, + "grad_norm": 1.8565930128097534, + "learning_rate": 1.9974125195196782e-05, + "loss": 2.5981, + "mean_token_accuracy": 0.4692803621292114, + "num_tokens": 505363466.0, + "step": 989 + }, + { + "epoch": 0.2677122769064359, + "grad_norm": 2.0258848667144775, + "learning_rate": 1.997400560071087e-05, + "loss": 2.4366, + "mean_token_accuracy": 0.4721830487251282, + "num_tokens": 505836433.0, + "step": 990 + }, + { + "epoch": 0.26798269334775554, + "grad_norm": 2.055269241333008, + "learning_rate": 1.9973885730876078e-05, + "loss": 2.7229, + "mean_token_accuracy": 0.45472294092178345, + "num_tokens": 506360707.0, + "step": 991 + }, + { + "epoch": 0.2682531097890752, + "grad_norm": 1.6134155988693237, + "learning_rate": 1.997376558569608e-05, + "loss": 2.6819, + "mean_token_accuracy": 0.4561607837677002, + "num_tokens": 506884809.0, + "step": 992 + }, + { + "epoch": 0.2685235262303948, + "grad_norm": 1.6592637300491333, + "learning_rate": 1.9973645165174563e-05, + "loss": 2.45, + "mean_token_accuracy": 0.470706582069397, + "num_tokens": 507397841.0, + "step": 993 + }, + { + "epoch": 0.2687939426717144, + "grad_norm": 1.8940001726150513, + "learning_rate": 1.9973524469315227e-05, + "loss": 2.5533, + "mean_token_accuracy": 0.4665665030479431, + "num_tokens": 507922077.0, + "step": 994 + }, + { + "epoch": 0.26906435911303406, + "grad_norm": 1.5190550088882446, + "learning_rate": 1.9973403498121766e-05, + "loss": 2.5376, + "mean_token_accuracy": 0.47241657972335815, + "num_tokens": 508446245.0, + "step": 995 + }, + { + "epoch": 0.2693347755543537, + "grad_norm": 1.739450216293335, + "learning_rate": 1.9973282251597898e-05, + "loss": 2.4995, + "mean_token_accuracy": 0.46511155366897583, + "num_tokens": 508970452.0, + "step": 996 + }, + { + "epoch": 0.26960519199567334, + "grad_norm": 1.4795074462890625, + "learning_rate": 1.9973160729747342e-05, + "loss": 2.4591, + "mean_token_accuracy": 0.48135310411453247, + "num_tokens": 509494687.0, + "step": 997 + }, + { + "epoch": 0.269875608436993, + "grad_norm": 2.178176164627075, + "learning_rate": 1.9973038932573826e-05, + "loss": 2.6105, + "mean_token_accuracy": 0.4693293571472168, + "num_tokens": 510017750.0, + "step": 998 + }, + { + "epoch": 0.2701460248783126, + "grad_norm": 2.7419426441192627, + "learning_rate": 1.997291686008109e-05, + "loss": 2.7019, + "mean_token_accuracy": 0.4608585834503174, + "num_tokens": 510542027.0, + "step": 999 + }, + { + "epoch": 0.2704164413196322, + "grad_norm": 2.122528553009033, + "learning_rate": 1.9972794512272875e-05, + "loss": 2.5127, + "mean_token_accuracy": 0.46351754665374756, + "num_tokens": 511066175.0, + "step": 1000 + }, + { + "epoch": 0.27068685776095186, + "grad_norm": 1.051896095275879, + "learning_rate": 1.9972671889152938e-05, + "loss": 1.2439, + "mean_token_accuracy": 0.676002562046051, + "num_tokens": 511545902.0, + "step": 1001 + }, + { + "epoch": 0.2709572742022715, + "grad_norm": 4.26418924331665, + "learning_rate": 1.997254899072504e-05, + "loss": 2.6177, + "mean_token_accuracy": 0.44851332902908325, + "num_tokens": 512070090.0, + "step": 1002 + }, + { + "epoch": 0.27122769064359115, + "grad_norm": 4.223275184631348, + "learning_rate": 1.997242581699295e-05, + "loss": 2.748, + "mean_token_accuracy": 0.4224492907524109, + "num_tokens": 512594182.0, + "step": 1003 + }, + { + "epoch": 0.2714981070849108, + "grad_norm": 1.8746713399887085, + "learning_rate": 1.9972302367960457e-05, + "loss": 2.5373, + "mean_token_accuracy": 0.4566737115383148, + "num_tokens": 513075510.0, + "step": 1004 + }, + { + "epoch": 0.2717685235262304, + "grad_norm": 3.806234121322632, + "learning_rate": 1.9972178643631336e-05, + "loss": 2.6271, + "mean_token_accuracy": 0.4589480757713318, + "num_tokens": 513599793.0, + "step": 1005 + }, + { + "epoch": 0.27203893996755, + "grad_norm": 4.660215377807617, + "learning_rate": 1.9972054644009388e-05, + "loss": 2.4451, + "mean_token_accuracy": 0.48368707299232483, + "num_tokens": 514030215.0, + "step": 1006 + }, + { + "epoch": 0.27230935640886966, + "grad_norm": 3.110372304916382, + "learning_rate": 1.997193036909842e-05, + "loss": 2.5611, + "mean_token_accuracy": 0.46869832277297974, + "num_tokens": 514554482.0, + "step": 1007 + }, + { + "epoch": 0.2725797728501893, + "grad_norm": 2.0511081218719482, + "learning_rate": 1.9971805818902242e-05, + "loss": 2.6038, + "mean_token_accuracy": 0.44837862253189087, + "num_tokens": 515078753.0, + "step": 1008 + }, + { + "epoch": 0.27285018929150895, + "grad_norm": 3.4713563919067383, + "learning_rate": 1.9971680993424678e-05, + "loss": 2.4534, + "mean_token_accuracy": 0.4971870183944702, + "num_tokens": 515602847.0, + "step": 1009 + }, + { + "epoch": 0.27312060573282854, + "grad_norm": 3.1038424968719482, + "learning_rate": 1.997155589266956e-05, + "loss": 2.686, + "mean_token_accuracy": 0.4528435468673706, + "num_tokens": 516127062.0, + "step": 1010 + }, + { + "epoch": 0.2733910221741482, + "grad_norm": 2.475111961364746, + "learning_rate": 1.997143051664072e-05, + "loss": 2.6635, + "mean_token_accuracy": 0.4658880829811096, + "num_tokens": 516572016.0, + "step": 1011 + }, + { + "epoch": 0.2736614386154678, + "grad_norm": 2.2078816890716553, + "learning_rate": 1.9971304865342012e-05, + "loss": 2.6258, + "mean_token_accuracy": 0.45475003123283386, + "num_tokens": 517096297.0, + "step": 1012 + }, + { + "epoch": 0.27393185505678747, + "grad_norm": 2.742039918899536, + "learning_rate": 1.9971178938777284e-05, + "loss": 2.7379, + "mean_token_accuracy": 0.4564938545227051, + "num_tokens": 517620510.0, + "step": 1013 + }, + { + "epoch": 0.2742022714981071, + "grad_norm": 2.1456668376922607, + "learning_rate": 1.9971052736950405e-05, + "loss": 2.429, + "mean_token_accuracy": 0.473186194896698, + "num_tokens": 518130177.0, + "step": 1014 + }, + { + "epoch": 0.2744726879394267, + "grad_norm": 2.3162732124328613, + "learning_rate": 1.9970926259865243e-05, + "loss": 2.7435, + "mean_token_accuracy": 0.4527018666267395, + "num_tokens": 518589938.0, + "step": 1015 + }, + { + "epoch": 0.27474310438074634, + "grad_norm": 2.114133834838867, + "learning_rate": 1.997079950752569e-05, + "loss": 2.5518, + "mean_token_accuracy": 0.4978828430175781, + "num_tokens": 519036705.0, + "step": 1016 + }, + { + "epoch": 0.275013520822066, + "grad_norm": 4.419449329376221, + "learning_rate": 1.997067247993562e-05, + "loss": 2.3498, + "mean_token_accuracy": 0.5168985724449158, + "num_tokens": 519560890.0, + "step": 1017 + }, + { + "epoch": 0.2752839372633856, + "grad_norm": 3.245293617248535, + "learning_rate": 1.997054517709894e-05, + "loss": 2.6954, + "mean_token_accuracy": 0.4731137454509735, + "num_tokens": 520043789.0, + "step": 1018 + }, + { + "epoch": 0.27555435370470527, + "grad_norm": 1.7681628465652466, + "learning_rate": 1.997041759901955e-05, + "loss": 2.6816, + "mean_token_accuracy": 0.46240663528442383, + "num_tokens": 520568000.0, + "step": 1019 + }, + { + "epoch": 0.27582477014602486, + "grad_norm": 1.9859215021133423, + "learning_rate": 1.9970289745701367e-05, + "loss": 2.4679, + "mean_token_accuracy": 0.46325933933258057, + "num_tokens": 521089934.0, + "step": 1020 + }, + { + "epoch": 0.2760951865873445, + "grad_norm": 1.4086387157440186, + "learning_rate": 1.997016161714832e-05, + "loss": 1.2458, + "mean_token_accuracy": 0.6810183525085449, + "num_tokens": 521614136.0, + "step": 1021 + }, + { + "epoch": 0.27636560302866414, + "grad_norm": 2.9558727741241455, + "learning_rate": 1.9970033213364334e-05, + "loss": 2.589, + "mean_token_accuracy": 0.46073049306869507, + "num_tokens": 522138376.0, + "step": 1022 + }, + { + "epoch": 0.2766360194699838, + "grad_norm": 2.202012538909912, + "learning_rate": 1.9969904534353345e-05, + "loss": 2.3479, + "mean_token_accuracy": 0.46380534768104553, + "num_tokens": 522662607.0, + "step": 1023 + }, + { + "epoch": 0.27690643591130343, + "grad_norm": 1.7958900928497314, + "learning_rate": 1.996977558011931e-05, + "loss": 2.5241, + "mean_token_accuracy": 0.4822538495063782, + "num_tokens": 523175459.0, + "step": 1024 + }, + { + "epoch": 0.277176852352623, + "grad_norm": 2.210460662841797, + "learning_rate": 1.9969646350666184e-05, + "loss": 2.6372, + "mean_token_accuracy": 0.4670693874359131, + "num_tokens": 523681639.0, + "step": 1025 + }, + { + "epoch": 0.27744726879394266, + "grad_norm": 2.314711093902588, + "learning_rate": 1.9969516845997927e-05, + "loss": 2.5463, + "mean_token_accuracy": 0.4649502635002136, + "num_tokens": 524205841.0, + "step": 1026 + }, + { + "epoch": 0.2777176852352623, + "grad_norm": 1.9142169952392578, + "learning_rate": 1.9969387066118514e-05, + "loss": 2.3863, + "mean_token_accuracy": 0.4888025224208832, + "num_tokens": 524730110.0, + "step": 1027 + }, + { + "epoch": 0.27798810167658194, + "grad_norm": 2.184081792831421, + "learning_rate": 1.996925701103193e-05, + "loss": 2.5204, + "mean_token_accuracy": 0.4622800350189209, + "num_tokens": 525254302.0, + "step": 1028 + }, + { + "epoch": 0.2782585181179016, + "grad_norm": 2.0139284133911133, + "learning_rate": 1.9969126680742165e-05, + "loss": 2.6139, + "mean_token_accuracy": 0.4819697141647339, + "num_tokens": 525777346.0, + "step": 1029 + }, + { + "epoch": 0.2785289345592212, + "grad_norm": 2.1247353553771973, + "learning_rate": 1.996899607525322e-05, + "loss": 2.6599, + "mean_token_accuracy": 0.44529402256011963, + "num_tokens": 526301621.0, + "step": 1030 + }, + { + "epoch": 0.2787993510005408, + "grad_norm": 1.9158121347427368, + "learning_rate": 1.996886519456909e-05, + "loss": 2.3777, + "mean_token_accuracy": 0.47820547223091125, + "num_tokens": 526825804.0, + "step": 1031 + }, + { + "epoch": 0.27906976744186046, + "grad_norm": 2.57302188873291, + "learning_rate": 1.996873403869381e-05, + "loss": 2.6678, + "mean_token_accuracy": 0.4613744616508484, + "num_tokens": 527349965.0, + "step": 1032 + }, + { + "epoch": 0.2793401838831801, + "grad_norm": 2.2490346431732178, + "learning_rate": 1.996860260763139e-05, + "loss": 2.4947, + "mean_token_accuracy": 0.4722229838371277, + "num_tokens": 527874249.0, + "step": 1033 + }, + { + "epoch": 0.27961060032449975, + "grad_norm": 1.798608660697937, + "learning_rate": 1.9968470901385867e-05, + "loss": 2.6535, + "mean_token_accuracy": 0.47231441736221313, + "num_tokens": 528356667.0, + "step": 1034 + }, + { + "epoch": 0.27988101676581933, + "grad_norm": 2.2300000190734863, + "learning_rate": 1.9968338919961282e-05, + "loss": 2.7034, + "mean_token_accuracy": 0.45981234312057495, + "num_tokens": 528880950.0, + "step": 1035 + }, + { + "epoch": 0.280151433207139, + "grad_norm": 1.7265697717666626, + "learning_rate": 1.9968206663361685e-05, + "loss": 2.6617, + "mean_token_accuracy": 0.4422628879547119, + "num_tokens": 529405201.0, + "step": 1036 + }, + { + "epoch": 0.2804218496484586, + "grad_norm": 1.9748094081878662, + "learning_rate": 1.9968074131591134e-05, + "loss": 2.6171, + "mean_token_accuracy": 0.4590069651603699, + "num_tokens": 529929422.0, + "step": 1037 + }, + { + "epoch": 0.28069226608977826, + "grad_norm": 1.8653709888458252, + "learning_rate": 1.9967941324653696e-05, + "loss": 2.6985, + "mean_token_accuracy": 0.46844157576560974, + "num_tokens": 530448911.0, + "step": 1038 + }, + { + "epoch": 0.2809626825310979, + "grad_norm": 1.862768530845642, + "learning_rate": 1.996780824255344e-05, + "loss": 2.6136, + "mean_token_accuracy": 0.45887166261672974, + "num_tokens": 530973140.0, + "step": 1039 + }, + { + "epoch": 0.28123309897241755, + "grad_norm": 1.7618650197982788, + "learning_rate": 1.9967674885294463e-05, + "loss": 2.4751, + "mean_token_accuracy": 0.4929801821708679, + "num_tokens": 531497287.0, + "step": 1040 + }, + { + "epoch": 0.28150351541373714, + "grad_norm": 1.6268565654754639, + "learning_rate": 1.9967541252880843e-05, + "loss": 1.2882, + "mean_token_accuracy": 0.6781150698661804, + "num_tokens": 531978201.0, + "step": 1041 + }, + { + "epoch": 0.2817739318550568, + "grad_norm": 2.09397292137146, + "learning_rate": 1.9967407345316688e-05, + "loss": 2.4731, + "mean_token_accuracy": 0.4691278636455536, + "num_tokens": 532502381.0, + "step": 1042 + }, + { + "epoch": 0.2820443482963764, + "grad_norm": 1.6382784843444824, + "learning_rate": 1.99672731626061e-05, + "loss": 2.4626, + "mean_token_accuracy": 0.46582192182540894, + "num_tokens": 533026651.0, + "step": 1043 + }, + { + "epoch": 0.28231476473769607, + "grad_norm": 1.5441558361053467, + "learning_rate": 1.9967138704753202e-05, + "loss": 2.7233, + "mean_token_accuracy": 0.4576399028301239, + "num_tokens": 533550878.0, + "step": 1044 + }, + { + "epoch": 0.2825851811790157, + "grad_norm": 1.6226297616958618, + "learning_rate": 1.9967003971762125e-05, + "loss": 2.4973, + "mean_token_accuracy": 0.47142544388771057, + "num_tokens": 534075035.0, + "step": 1045 + }, + { + "epoch": 0.2828555976203353, + "grad_norm": 1.4627763032913208, + "learning_rate": 1.996686896363699e-05, + "loss": 2.6258, + "mean_token_accuracy": 0.4647199511528015, + "num_tokens": 534599237.0, + "step": 1046 + }, + { + "epoch": 0.28312601406165494, + "grad_norm": 1.4513822793960571, + "learning_rate": 1.9966733680381942e-05, + "loss": 2.7206, + "mean_token_accuracy": 0.450006365776062, + "num_tokens": 535123447.0, + "step": 1047 + }, + { + "epoch": 0.2833964305029746, + "grad_norm": 1.4262727499008179, + "learning_rate": 1.996659812200114e-05, + "loss": 2.5892, + "mean_token_accuracy": 0.4796602725982666, + "num_tokens": 535614117.0, + "step": 1048 + }, + { + "epoch": 0.2836668469442942, + "grad_norm": 3.7543978691101074, + "learning_rate": 1.9966462288498736e-05, + "loss": 2.4935, + "mean_token_accuracy": 0.4892268776893616, + "num_tokens": 536138232.0, + "step": 1049 + }, + { + "epoch": 0.28393726338561387, + "grad_norm": 1.8976656198501587, + "learning_rate": 1.99663261798789e-05, + "loss": 2.6084, + "mean_token_accuracy": 0.4676305055618286, + "num_tokens": 536662375.0, + "step": 1050 + }, + { + "epoch": 0.28420767982693346, + "grad_norm": 1.441443920135498, + "learning_rate": 1.9966189796145815e-05, + "loss": 2.5231, + "mean_token_accuracy": 0.4783666133880615, + "num_tokens": 537162363.0, + "step": 1051 + }, + { + "epoch": 0.2844780962682531, + "grad_norm": 1.964797854423523, + "learning_rate": 1.996605313730365e-05, + "loss": 2.5609, + "mean_token_accuracy": 0.478199303150177, + "num_tokens": 537624821.0, + "step": 1052 + }, + { + "epoch": 0.28474851270957274, + "grad_norm": 2.0836856365203857, + "learning_rate": 1.9965916203356614e-05, + "loss": 2.5397, + "mean_token_accuracy": 0.45354121923446655, + "num_tokens": 538148838.0, + "step": 1053 + }, + { + "epoch": 0.2850189291508924, + "grad_norm": 1.87941312789917, + "learning_rate": 1.99657789943089e-05, + "loss": 2.4759, + "mean_token_accuracy": 0.47497332096099854, + "num_tokens": 538672933.0, + "step": 1054 + }, + { + "epoch": 0.28528934559221203, + "grad_norm": 1.8418538570404053, + "learning_rate": 1.9965641510164718e-05, + "loss": 2.6294, + "mean_token_accuracy": 0.439153790473938, + "num_tokens": 539197057.0, + "step": 1055 + }, + { + "epoch": 0.2855597620335316, + "grad_norm": 1.907188892364502, + "learning_rate": 1.9965503750928293e-05, + "loss": 2.6073, + "mean_token_accuracy": 0.46343129873275757, + "num_tokens": 539721334.0, + "step": 1056 + }, + { + "epoch": 0.28583017847485126, + "grad_norm": 1.7978168725967407, + "learning_rate": 1.996536571660384e-05, + "loss": 2.5602, + "mean_token_accuracy": 0.4756195545196533, + "num_tokens": 540245579.0, + "step": 1057 + }, + { + "epoch": 0.2861005949161709, + "grad_norm": 1.8811516761779785, + "learning_rate": 1.9965227407195606e-05, + "loss": 2.4615, + "mean_token_accuracy": 0.4851285219192505, + "num_tokens": 540705166.0, + "step": 1058 + }, + { + "epoch": 0.28637101135749055, + "grad_norm": 1.9755104780197144, + "learning_rate": 1.996508882270783e-05, + "loss": 2.5333, + "mean_token_accuracy": 0.4771394729614258, + "num_tokens": 541229420.0, + "step": 1059 + }, + { + "epoch": 0.2866414277988102, + "grad_norm": 1.8707000017166138, + "learning_rate": 1.9964949963144766e-05, + "loss": 2.5845, + "mean_token_accuracy": 0.4616202116012573, + "num_tokens": 541753564.0, + "step": 1060 + }, + { + "epoch": 0.2869118442401298, + "grad_norm": 0.8757693767547607, + "learning_rate": 1.9964810828510667e-05, + "loss": 1.2082, + "mean_token_accuracy": 0.6790164709091187, + "num_tokens": 542277723.0, + "step": 1061 + }, + { + "epoch": 0.2871822606814494, + "grad_norm": 2.0999011993408203, + "learning_rate": 1.9964671418809813e-05, + "loss": 2.6022, + "mean_token_accuracy": 0.4532000422477722, + "num_tokens": 542801967.0, + "step": 1062 + }, + { + "epoch": 0.28745267712276906, + "grad_norm": 1.9579440355300903, + "learning_rate": 1.9964531734046473e-05, + "loss": 2.6835, + "mean_token_accuracy": 0.4601728320121765, + "num_tokens": 543326220.0, + "step": 1063 + }, + { + "epoch": 0.2877230935640887, + "grad_norm": 1.5536972284317017, + "learning_rate": 1.996439177422494e-05, + "loss": 2.6214, + "mean_token_accuracy": 0.47298309206962585, + "num_tokens": 543806342.0, + "step": 1064 + }, + { + "epoch": 0.28799351000540835, + "grad_norm": 2.0879838466644287, + "learning_rate": 1.9964251539349504e-05, + "loss": 2.5684, + "mean_token_accuracy": 0.47408193349838257, + "num_tokens": 544330481.0, + "step": 1065 + }, + { + "epoch": 0.28826392644672794, + "grad_norm": 1.9761924743652344, + "learning_rate": 1.996411102942447e-05, + "loss": 2.5357, + "mean_token_accuracy": 0.454720139503479, + "num_tokens": 544854567.0, + "step": 1066 + }, + { + "epoch": 0.2885343428880476, + "grad_norm": 1.4959845542907715, + "learning_rate": 1.9963970244454144e-05, + "loss": 2.5852, + "mean_token_accuracy": 0.47325459122657776, + "num_tokens": 545378739.0, + "step": 1067 + }, + { + "epoch": 0.2888047593293672, + "grad_norm": 1.594793677330017, + "learning_rate": 1.996382918444285e-05, + "loss": 2.5762, + "mean_token_accuracy": 0.44739601016044617, + "num_tokens": 545903003.0, + "step": 1068 + }, + { + "epoch": 0.28907517577068687, + "grad_norm": 1.956005573272705, + "learning_rate": 1.9963687849394917e-05, + "loss": 2.443, + "mean_token_accuracy": 0.5002775192260742, + "num_tokens": 546427287.0, + "step": 1069 + }, + { + "epoch": 0.2893455922120065, + "grad_norm": 1.8087085485458374, + "learning_rate": 1.996354623931468e-05, + "loss": 2.5727, + "mean_token_accuracy": 0.45714473724365234, + "num_tokens": 546951324.0, + "step": 1070 + }, + { + "epoch": 0.2896160086533261, + "grad_norm": 1.8949058055877686, + "learning_rate": 1.9963404354206485e-05, + "loss": 2.6636, + "mean_token_accuracy": 0.4587661623954773, + "num_tokens": 547475586.0, + "step": 1071 + }, + { + "epoch": 0.28988642509464574, + "grad_norm": 1.9525387287139893, + "learning_rate": 1.996326219407468e-05, + "loss": 2.6416, + "mean_token_accuracy": 0.4576141834259033, + "num_tokens": 547999827.0, + "step": 1072 + }, + { + "epoch": 0.2901568415359654, + "grad_norm": 1.5894263982772827, + "learning_rate": 1.9963119758923642e-05, + "loss": 2.3501, + "mean_token_accuracy": 0.4897621273994446, + "num_tokens": 548524034.0, + "step": 1073 + }, + { + "epoch": 0.290427257977285, + "grad_norm": 2.001304864883423, + "learning_rate": 1.9962977048757723e-05, + "loss": 2.5962, + "mean_token_accuracy": 0.45013922452926636, + "num_tokens": 549048133.0, + "step": 1074 + }, + { + "epoch": 0.29069767441860467, + "grad_norm": 1.4225677251815796, + "learning_rate": 1.996283406358131e-05, + "loss": 2.4691, + "mean_token_accuracy": 0.4844067096710205, + "num_tokens": 549572389.0, + "step": 1075 + }, + { + "epoch": 0.2909680908599243, + "grad_norm": 1.51744544506073, + "learning_rate": 1.996269080339879e-05, + "loss": 2.5717, + "mean_token_accuracy": 0.4650585651397705, + "num_tokens": 550096593.0, + "step": 1076 + }, + { + "epoch": 0.2912385073012439, + "grad_norm": 1.8195687532424927, + "learning_rate": 1.9962547268214564e-05, + "loss": 2.693, + "mean_token_accuracy": 0.45737889409065247, + "num_tokens": 550620878.0, + "step": 1077 + }, + { + "epoch": 0.29150892374256354, + "grad_norm": 1.557437777519226, + "learning_rate": 1.9962403458033028e-05, + "loss": 2.6064, + "mean_token_accuracy": 0.464782178401947, + "num_tokens": 551145061.0, + "step": 1078 + }, + { + "epoch": 0.2917793401838832, + "grad_norm": 1.49310302734375, + "learning_rate": 1.9962259372858596e-05, + "loss": 2.7656, + "mean_token_accuracy": 0.44808924198150635, + "num_tokens": 551639815.0, + "step": 1079 + }, + { + "epoch": 0.2920497566252028, + "grad_norm": 1.4710135459899902, + "learning_rate": 1.996211501269569e-05, + "loss": 2.5676, + "mean_token_accuracy": 0.4486501216888428, + "num_tokens": 552164070.0, + "step": 1080 + }, + { + "epoch": 0.29232017306652247, + "grad_norm": 1.1907856464385986, + "learning_rate": 1.996197037754874e-05, + "loss": 1.2131, + "mean_token_accuracy": 0.685154914855957, + "num_tokens": 552688180.0, + "step": 1081 + }, + { + "epoch": 0.29259058950784206, + "grad_norm": 2.5723886489868164, + "learning_rate": 1.9961825467422183e-05, + "loss": 2.471, + "mean_token_accuracy": 0.4715762138366699, + "num_tokens": 553212365.0, + "step": 1082 + }, + { + "epoch": 0.2928610059491617, + "grad_norm": 2.0199971199035645, + "learning_rate": 1.9961680282320467e-05, + "loss": 2.7633, + "mean_token_accuracy": 0.4394505023956299, + "num_tokens": 553736607.0, + "step": 1083 + }, + { + "epoch": 0.29313142239048134, + "grad_norm": 1.7617985010147095, + "learning_rate": 1.9961534822248043e-05, + "loss": 2.4425, + "mean_token_accuracy": 0.47684621810913086, + "num_tokens": 554223058.0, + "step": 1084 + }, + { + "epoch": 0.293401838831801, + "grad_norm": 1.9228864908218384, + "learning_rate": 1.996138908720938e-05, + "loss": 2.3599, + "mean_token_accuracy": 0.4806225895881653, + "num_tokens": 554747299.0, + "step": 1085 + }, + { + "epoch": 0.29367225527312063, + "grad_norm": 1.8443647623062134, + "learning_rate": 1.996124307720894e-05, + "loss": 2.7143, + "mean_token_accuracy": 0.45313867926597595, + "num_tokens": 555271569.0, + "step": 1086 + }, + { + "epoch": 0.2939426717144402, + "grad_norm": 1.8948607444763184, + "learning_rate": 1.996109679225121e-05, + "loss": 2.6825, + "mean_token_accuracy": 0.4509505331516266, + "num_tokens": 555795831.0, + "step": 1087 + }, + { + "epoch": 0.29421308815575986, + "grad_norm": 1.998690128326416, + "learning_rate": 1.9960950232340684e-05, + "loss": 2.4207, + "mean_token_accuracy": 0.49061286449432373, + "num_tokens": 556319938.0, + "step": 1088 + }, + { + "epoch": 0.2944835045970795, + "grad_norm": 1.708539366722107, + "learning_rate": 1.996080339748185e-05, + "loss": 2.4688, + "mean_token_accuracy": 0.47231197357177734, + "num_tokens": 556813948.0, + "step": 1089 + }, + { + "epoch": 0.29475392103839915, + "grad_norm": 1.8411725759506226, + "learning_rate": 1.9960656287679213e-05, + "loss": 2.4892, + "mean_token_accuracy": 0.4755212068557739, + "num_tokens": 557332424.0, + "step": 1090 + }, + { + "epoch": 0.2950243374797188, + "grad_norm": 2.212630271911621, + "learning_rate": 1.996050890293729e-05, + "loss": 2.5202, + "mean_token_accuracy": 0.4725051820278168, + "num_tokens": 557856701.0, + "step": 1091 + }, + { + "epoch": 0.2952947539210384, + "grad_norm": 1.8109376430511475, + "learning_rate": 1.9960361243260605e-05, + "loss": 2.3962, + "mean_token_accuracy": 0.48735761642456055, + "num_tokens": 558380791.0, + "step": 1092 + }, + { + "epoch": 0.295565170362358, + "grad_norm": 1.861100673675537, + "learning_rate": 1.9960213308653684e-05, + "loss": 2.6142, + "mean_token_accuracy": 0.47534722089767456, + "num_tokens": 558904961.0, + "step": 1093 + }, + { + "epoch": 0.29583558680367766, + "grad_norm": 1.7882651090621948, + "learning_rate": 1.996006509912107e-05, + "loss": 2.6038, + "mean_token_accuracy": 0.45577162504196167, + "num_tokens": 559419423.0, + "step": 1094 + }, + { + "epoch": 0.2961060032449973, + "grad_norm": 1.8146342039108276, + "learning_rate": 1.995991661466731e-05, + "loss": 2.5531, + "mean_token_accuracy": 0.4669073820114136, + "num_tokens": 559943589.0, + "step": 1095 + }, + { + "epoch": 0.29637641968631695, + "grad_norm": 1.90481436252594, + "learning_rate": 1.9959767855296958e-05, + "loss": 2.5925, + "mean_token_accuracy": 0.47397106885910034, + "num_tokens": 560467870.0, + "step": 1096 + }, + { + "epoch": 0.29664683612763654, + "grad_norm": 1.66248619556427, + "learning_rate": 1.9959618821014576e-05, + "loss": 2.3348, + "mean_token_accuracy": 0.49344491958618164, + "num_tokens": 560943870.0, + "step": 1097 + }, + { + "epoch": 0.2969172525689562, + "grad_norm": 2.2455053329467773, + "learning_rate": 1.9959469511824743e-05, + "loss": 2.5139, + "mean_token_accuracy": 0.4841974079608917, + "num_tokens": 561412372.0, + "step": 1098 + }, + { + "epoch": 0.2971876690102758, + "grad_norm": 2.292273998260498, + "learning_rate": 1.995931992773204e-05, + "loss": 2.6298, + "mean_token_accuracy": 0.46934401988983154, + "num_tokens": 561894797.0, + "step": 1099 + }, + { + "epoch": 0.29745808545159547, + "grad_norm": 2.1654295921325684, + "learning_rate": 1.995917006874105e-05, + "loss": 2.5328, + "mean_token_accuracy": 0.4921986758708954, + "num_tokens": 562376250.0, + "step": 1100 + }, + { + "epoch": 0.2977285018929151, + "grad_norm": 1.353165864944458, + "learning_rate": 1.9959019934856374e-05, + "loss": 1.214, + "mean_token_accuracy": 0.6910457611083984, + "num_tokens": 562900257.0, + "step": 1101 + }, + { + "epoch": 0.2979989183342347, + "grad_norm": 2.8067588806152344, + "learning_rate": 1.9958869526082624e-05, + "loss": 2.6894, + "mean_token_accuracy": 0.45240211486816406, + "num_tokens": 563424524.0, + "step": 1102 + }, + { + "epoch": 0.29826933477555434, + "grad_norm": 1.875433325767517, + "learning_rate": 1.995871884242441e-05, + "loss": 2.2678, + "mean_token_accuracy": 0.5345737934112549, + "num_tokens": 563926149.0, + "step": 1103 + }, + { + "epoch": 0.298539751216874, + "grad_norm": 2.0434913635253906, + "learning_rate": 1.995856788388635e-05, + "loss": 2.6108, + "mean_token_accuracy": 0.4491657018661499, + "num_tokens": 564450325.0, + "step": 1104 + }, + { + "epoch": 0.2988101676581936, + "grad_norm": 2.160987615585327, + "learning_rate": 1.9958416650473086e-05, + "loss": 2.5399, + "mean_token_accuracy": 0.4738468527793884, + "num_tokens": 564974590.0, + "step": 1105 + }, + { + "epoch": 0.29908058409951327, + "grad_norm": 1.8919470310211182, + "learning_rate": 1.9958265142189255e-05, + "loss": 2.5362, + "mean_token_accuracy": 0.45826056599617004, + "num_tokens": 565498845.0, + "step": 1106 + }, + { + "epoch": 0.29935100054083286, + "grad_norm": 1.8909164667129517, + "learning_rate": 1.9958113359039506e-05, + "loss": 2.4736, + "mean_token_accuracy": 0.4853385090827942, + "num_tokens": 566023076.0, + "step": 1107 + }, + { + "epoch": 0.2996214169821525, + "grad_norm": 1.7257169485092163, + "learning_rate": 1.995796130102849e-05, + "loss": 2.426, + "mean_token_accuracy": 0.48672255873680115, + "num_tokens": 566547347.0, + "step": 1108 + }, + { + "epoch": 0.29989183342347214, + "grad_norm": 1.9219695329666138, + "learning_rate": 1.9957808968160884e-05, + "loss": 2.5677, + "mean_token_accuracy": 0.48020434379577637, + "num_tokens": 567071562.0, + "step": 1109 + }, + { + "epoch": 0.3001622498647918, + "grad_norm": 1.992250919342041, + "learning_rate": 1.9957656360441352e-05, + "loss": 2.5728, + "mean_token_accuracy": 0.4581112265586853, + "num_tokens": 567595741.0, + "step": 1110 + }, + { + "epoch": 0.30043266630611143, + "grad_norm": 1.9283685684204102, + "learning_rate": 1.995750347787458e-05, + "loss": 2.7207, + "mean_token_accuracy": 0.4636729955673218, + "num_tokens": 568119989.0, + "step": 1111 + }, + { + "epoch": 0.300703082747431, + "grad_norm": 2.272587299346924, + "learning_rate": 1.9957350320465262e-05, + "loss": 2.7523, + "mean_token_accuracy": 0.43769633769989014, + "num_tokens": 568644109.0, + "step": 1112 + }, + { + "epoch": 0.30097349918875066, + "grad_norm": 1.3449254035949707, + "learning_rate": 1.995719688821809e-05, + "loss": 2.622, + "mean_token_accuracy": 0.4639703631401062, + "num_tokens": 569168361.0, + "step": 1113 + }, + { + "epoch": 0.3012439156300703, + "grad_norm": 1.7372095584869385, + "learning_rate": 1.9957043181137775e-05, + "loss": 2.5484, + "mean_token_accuracy": 0.46842193603515625, + "num_tokens": 569692598.0, + "step": 1114 + }, + { + "epoch": 0.30151433207138995, + "grad_norm": 1.737159252166748, + "learning_rate": 1.9956889199229043e-05, + "loss": 2.7323, + "mean_token_accuracy": 0.4383682608604431, + "num_tokens": 570216718.0, + "step": 1115 + }, + { + "epoch": 0.3017847485127096, + "grad_norm": 1.7719806432724, + "learning_rate": 1.9956734942496604e-05, + "loss": 2.3386, + "mean_token_accuracy": 0.48212650418281555, + "num_tokens": 570718535.0, + "step": 1116 + }, + { + "epoch": 0.30205516495402923, + "grad_norm": 2.1884210109710693, + "learning_rate": 1.9956580410945193e-05, + "loss": 2.6388, + "mean_token_accuracy": 0.4591166377067566, + "num_tokens": 571242748.0, + "step": 1117 + }, + { + "epoch": 0.3023255813953488, + "grad_norm": 1.9342070817947388, + "learning_rate": 1.995642560457956e-05, + "loss": 2.6159, + "mean_token_accuracy": 0.4676937460899353, + "num_tokens": 571766896.0, + "step": 1118 + }, + { + "epoch": 0.30259599783666846, + "grad_norm": 1.8888704776763916, + "learning_rate": 1.9956270523404452e-05, + "loss": 2.4765, + "mean_token_accuracy": 0.4809718728065491, + "num_tokens": 572288905.0, + "step": 1119 + }, + { + "epoch": 0.3028664142779881, + "grad_norm": 1.5777488946914673, + "learning_rate": 1.9956115167424622e-05, + "loss": 2.6074, + "mean_token_accuracy": 0.46170172095298767, + "num_tokens": 572813066.0, + "step": 1120 + }, + { + "epoch": 0.30313683071930775, + "grad_norm": 1.346095323562622, + "learning_rate": 1.9955959536644843e-05, + "loss": 1.2297, + "mean_token_accuracy": 0.6802902221679688, + "num_tokens": 573302793.0, + "step": 1121 + }, + { + "epoch": 0.3034072471606274, + "grad_norm": 3.49368953704834, + "learning_rate": 1.9955803631069888e-05, + "loss": 2.7728, + "mean_token_accuracy": 0.4341806173324585, + "num_tokens": 573826962.0, + "step": 1122 + }, + { + "epoch": 0.303677663601947, + "grad_norm": 2.98539137840271, + "learning_rate": 1.9955647450704538e-05, + "loss": 2.6063, + "mean_token_accuracy": 0.47472041845321655, + "num_tokens": 574338996.0, + "step": 1123 + }, + { + "epoch": 0.3039480800432666, + "grad_norm": 1.6829211711883545, + "learning_rate": 1.995549099555359e-05, + "loss": 2.4425, + "mean_token_accuracy": 0.48345085978507996, + "num_tokens": 574863252.0, + "step": 1124 + }, + { + "epoch": 0.30421849648458626, + "grad_norm": 2.9504053592681885, + "learning_rate": 1.9955334265621836e-05, + "loss": 2.7033, + "mean_token_accuracy": 0.4533368647098541, + "num_tokens": 575387514.0, + "step": 1125 + }, + { + "epoch": 0.3044889129259059, + "grad_norm": 2.153270959854126, + "learning_rate": 1.99551772609141e-05, + "loss": 2.4684, + "mean_token_accuracy": 0.4934968948364258, + "num_tokens": 575851180.0, + "step": 1126 + }, + { + "epoch": 0.30475932936722555, + "grad_norm": 2.230334758758545, + "learning_rate": 1.9955019981435182e-05, + "loss": 2.4481, + "mean_token_accuracy": 0.4791473150253296, + "num_tokens": 576375183.0, + "step": 1127 + }, + { + "epoch": 0.30502974580854514, + "grad_norm": 2.002007246017456, + "learning_rate": 1.995486242718992e-05, + "loss": 2.5407, + "mean_token_accuracy": 0.4675339162349701, + "num_tokens": 576899372.0, + "step": 1128 + }, + { + "epoch": 0.3053001622498648, + "grad_norm": 1.813252568244934, + "learning_rate": 1.9954704598183143e-05, + "loss": 2.3668, + "mean_token_accuracy": 0.4935540556907654, + "num_tokens": 577423611.0, + "step": 1129 + }, + { + "epoch": 0.3055705786911844, + "grad_norm": 1.7331359386444092, + "learning_rate": 1.9954546494419697e-05, + "loss": 2.4074, + "mean_token_accuracy": 0.4966517984867096, + "num_tokens": 577859345.0, + "step": 1130 + }, + { + "epoch": 0.30584099513250407, + "grad_norm": 1.8904881477355957, + "learning_rate": 1.9954388115904425e-05, + "loss": 2.7075, + "mean_token_accuracy": 0.43480199575424194, + "num_tokens": 578383550.0, + "step": 1131 + }, + { + "epoch": 0.3061114115738237, + "grad_norm": 1.6581902503967285, + "learning_rate": 1.9954229462642198e-05, + "loss": 2.4148, + "mean_token_accuracy": 0.47914552688598633, + "num_tokens": 578907835.0, + "step": 1132 + }, + { + "epoch": 0.3063818280151433, + "grad_norm": 1.710109829902649, + "learning_rate": 1.995407053463788e-05, + "loss": 2.5721, + "mean_token_accuracy": 0.4645959436893463, + "num_tokens": 579432104.0, + "step": 1133 + }, + { + "epoch": 0.30665224445646294, + "grad_norm": 1.8620460033416748, + "learning_rate": 1.995391133189634e-05, + "loss": 2.425, + "mean_token_accuracy": 0.5144208669662476, + "num_tokens": 579956350.0, + "step": 1134 + }, + { + "epoch": 0.3069226608977826, + "grad_norm": 1.5735269784927368, + "learning_rate": 1.9953751854422474e-05, + "loss": 2.531, + "mean_token_accuracy": 0.46035560965538025, + "num_tokens": 580480581.0, + "step": 1135 + }, + { + "epoch": 0.3071930773391022, + "grad_norm": 1.481830358505249, + "learning_rate": 1.9953592102221166e-05, + "loss": 2.679, + "mean_token_accuracy": 0.4471004605293274, + "num_tokens": 581004708.0, + "step": 1136 + }, + { + "epoch": 0.30746349378042187, + "grad_norm": 1.3813445568084717, + "learning_rate": 1.9953432075297324e-05, + "loss": 2.523, + "mean_token_accuracy": 0.48337841033935547, + "num_tokens": 581528796.0, + "step": 1137 + }, + { + "epoch": 0.30773391022174146, + "grad_norm": 2.988116979598999, + "learning_rate": 1.9953271773655853e-05, + "loss": 2.3988, + "mean_token_accuracy": 0.5245431661605835, + "num_tokens": 582052983.0, + "step": 1138 + }, + { + "epoch": 0.3080043266630611, + "grad_norm": 1.9930872917175293, + "learning_rate": 1.9953111197301675e-05, + "loss": 2.4927, + "mean_token_accuracy": 0.47970181703567505, + "num_tokens": 582577257.0, + "step": 1139 + }, + { + "epoch": 0.30827474310438074, + "grad_norm": 1.368987798690796, + "learning_rate": 1.995295034623972e-05, + "loss": 2.5134, + "mean_token_accuracy": 0.4725874066352844, + "num_tokens": 583084560.0, + "step": 1140 + }, + { + "epoch": 0.3085451595457004, + "grad_norm": 0.9193658828735352, + "learning_rate": 1.9952789220474914e-05, + "loss": 1.2884, + "mean_token_accuracy": 0.6589342355728149, + "num_tokens": 583608815.0, + "step": 1141 + }, + { + "epoch": 0.30881557598702003, + "grad_norm": 2.5054519176483154, + "learning_rate": 1.995262782001221e-05, + "loss": 2.5055, + "mean_token_accuracy": 0.46969443559646606, + "num_tokens": 584133028.0, + "step": 1142 + }, + { + "epoch": 0.3090859924283396, + "grad_norm": 1.6854557991027832, + "learning_rate": 1.9952466144856552e-05, + "loss": 2.4185, + "mean_token_accuracy": 0.4757336676120758, + "num_tokens": 584657194.0, + "step": 1143 + }, + { + "epoch": 0.30935640886965926, + "grad_norm": 2.026812791824341, + "learning_rate": 1.995230419501291e-05, + "loss": 2.486, + "mean_token_accuracy": 0.48925936222076416, + "num_tokens": 585181389.0, + "step": 1144 + }, + { + "epoch": 0.3096268253109789, + "grad_norm": 2.1477224826812744, + "learning_rate": 1.9952141970486245e-05, + "loss": 2.5567, + "mean_token_accuracy": 0.48422837257385254, + "num_tokens": 585705660.0, + "step": 1145 + }, + { + "epoch": 0.30989724175229855, + "grad_norm": 1.5697786808013916, + "learning_rate": 1.995197947128154e-05, + "loss": 2.4764, + "mean_token_accuracy": 0.48111894726753235, + "num_tokens": 586229864.0, + "step": 1146 + }, + { + "epoch": 0.3101676581936182, + "grad_norm": 2.288599729537964, + "learning_rate": 1.9951816697403776e-05, + "loss": 2.3837, + "mean_token_accuracy": 0.47164273262023926, + "num_tokens": 586754005.0, + "step": 1147 + }, + { + "epoch": 0.3104380746349378, + "grad_norm": 2.470879554748535, + "learning_rate": 1.995165364885795e-05, + "loss": 2.5223, + "mean_token_accuracy": 0.46621984243392944, + "num_tokens": 587257658.0, + "step": 1148 + }, + { + "epoch": 0.3107084910762574, + "grad_norm": 1.7012200355529785, + "learning_rate": 1.9951490325649074e-05, + "loss": 2.3256, + "mean_token_accuracy": 0.4896562099456787, + "num_tokens": 587781903.0, + "step": 1149 + }, + { + "epoch": 0.31097890751757706, + "grad_norm": 2.1727852821350098, + "learning_rate": 1.9951326727782137e-05, + "loss": 2.4364, + "mean_token_accuracy": 0.48737791180610657, + "num_tokens": 588288828.0, + "step": 1150 + }, + { + "epoch": 0.3112493239588967, + "grad_norm": 2.0890378952026367, + "learning_rate": 1.995116285526218e-05, + "loss": 2.4051, + "mean_token_accuracy": 0.48006671667099, + "num_tokens": 588761381.0, + "step": 1151 + }, + { + "epoch": 0.31151974040021635, + "grad_norm": 1.9140517711639404, + "learning_rate": 1.995099870809422e-05, + "loss": 2.2956, + "mean_token_accuracy": 0.5082205533981323, + "num_tokens": 589270065.0, + "step": 1152 + }, + { + "epoch": 0.311790156841536, + "grad_norm": 1.7809579372406006, + "learning_rate": 1.99508342862833e-05, + "loss": 2.4501, + "mean_token_accuracy": 0.4768812954425812, + "num_tokens": 589794327.0, + "step": 1153 + }, + { + "epoch": 0.3120605732828556, + "grad_norm": 2.2353639602661133, + "learning_rate": 1.9950669589834456e-05, + "loss": 2.6527, + "mean_token_accuracy": 0.4816018044948578, + "num_tokens": 590257826.0, + "step": 1154 + }, + { + "epoch": 0.3123309897241752, + "grad_norm": 1.7091339826583862, + "learning_rate": 1.995050461875275e-05, + "loss": 2.5736, + "mean_token_accuracy": 0.4681115448474884, + "num_tokens": 590781895.0, + "step": 1155 + }, + { + "epoch": 0.31260140616549487, + "grad_norm": 2.06955623626709, + "learning_rate": 1.995033937304324e-05, + "loss": 2.3779, + "mean_token_accuracy": 0.4836537837982178, + "num_tokens": 591306052.0, + "step": 1156 + }, + { + "epoch": 0.3128718226068145, + "grad_norm": 1.6853703260421753, + "learning_rate": 1.9950173852711e-05, + "loss": 2.4981, + "mean_token_accuracy": 0.47973793745040894, + "num_tokens": 591796435.0, + "step": 1157 + }, + { + "epoch": 0.31314223904813415, + "grad_norm": 1.549856185913086, + "learning_rate": 1.99500080577611e-05, + "loss": 2.474, + "mean_token_accuracy": 0.46835845708847046, + "num_tokens": 592320698.0, + "step": 1158 + }, + { + "epoch": 0.31341265548945374, + "grad_norm": 1.8095142841339111, + "learning_rate": 1.9949841988198637e-05, + "loss": 2.5796, + "mean_token_accuracy": 0.4731190502643585, + "num_tokens": 592844958.0, + "step": 1159 + }, + { + "epoch": 0.3136830719307734, + "grad_norm": 1.5418007373809814, + "learning_rate": 1.9949675644028702e-05, + "loss": 2.2615, + "mean_token_accuracy": 0.4927932620048523, + "num_tokens": 593369182.0, + "step": 1160 + }, + { + "epoch": 0.313953488372093, + "grad_norm": 1.368560552597046, + "learning_rate": 1.9949509025256395e-05, + "loss": 1.2602, + "mean_token_accuracy": 0.6721220016479492, + "num_tokens": 593893338.0, + "step": 1161 + }, + { + "epoch": 0.31422390481341267, + "grad_norm": 3.2728817462921143, + "learning_rate": 1.9949342131886832e-05, + "loss": 2.3621, + "mean_token_accuracy": 0.49072137475013733, + "num_tokens": 594403184.0, + "step": 1162 + }, + { + "epoch": 0.3144943212547323, + "grad_norm": 3.8095834255218506, + "learning_rate": 1.9949174963925138e-05, + "loss": 2.6534, + "mean_token_accuracy": 0.45832040905952454, + "num_tokens": 594927332.0, + "step": 1163 + }, + { + "epoch": 0.3147647376960519, + "grad_norm": 1.5597246885299683, + "learning_rate": 1.9949007521376437e-05, + "loss": 2.3314, + "mean_token_accuracy": 0.4833383560180664, + "num_tokens": 595451423.0, + "step": 1164 + }, + { + "epoch": 0.31503515413737154, + "grad_norm": 2.0116443634033203, + "learning_rate": 1.994883980424587e-05, + "loss": 2.4388, + "mean_token_accuracy": 0.48567891120910645, + "num_tokens": 595924911.0, + "step": 1165 + }, + { + "epoch": 0.3153055705786912, + "grad_norm": 2.614300012588501, + "learning_rate": 1.9948671812538576e-05, + "loss": 2.58, + "mean_token_accuracy": 0.44806012511253357, + "num_tokens": 596449032.0, + "step": 1166 + }, + { + "epoch": 0.31557598702001083, + "grad_norm": 1.9356552362442017, + "learning_rate": 1.994850354625972e-05, + "loss": 2.4333, + "mean_token_accuracy": 0.4733174741268158, + "num_tokens": 596965109.0, + "step": 1167 + }, + { + "epoch": 0.31584640346133047, + "grad_norm": 1.8348054885864258, + "learning_rate": 1.9948335005414454e-05, + "loss": 2.3763, + "mean_token_accuracy": 0.4827162027359009, + "num_tokens": 597454273.0, + "step": 1168 + }, + { + "epoch": 0.31611681990265006, + "grad_norm": 2.270320415496826, + "learning_rate": 1.9948166190007956e-05, + "loss": 2.6108, + "mean_token_accuracy": 0.4676438271999359, + "num_tokens": 597945557.0, + "step": 1169 + }, + { + "epoch": 0.3163872363439697, + "grad_norm": 1.7303664684295654, + "learning_rate": 1.9947997100045407e-05, + "loss": 2.6807, + "mean_token_accuracy": 0.4648047089576721, + "num_tokens": 598469830.0, + "step": 1170 + }, + { + "epoch": 0.31665765278528935, + "grad_norm": 2.1403799057006836, + "learning_rate": 1.994782773553199e-05, + "loss": 2.603, + "mean_token_accuracy": 0.4604337811470032, + "num_tokens": 598994106.0, + "step": 1171 + }, + { + "epoch": 0.316928069226609, + "grad_norm": 2.2222259044647217, + "learning_rate": 1.9947658096472906e-05, + "loss": 2.5734, + "mean_token_accuracy": 0.46532750129699707, + "num_tokens": 599518346.0, + "step": 1172 + }, + { + "epoch": 0.31719848566792863, + "grad_norm": 2.0969924926757812, + "learning_rate": 1.994748818287336e-05, + "loss": 2.5895, + "mean_token_accuracy": 0.4668421745300293, + "num_tokens": 600015425.0, + "step": 1173 + }, + { + "epoch": 0.3174689021092482, + "grad_norm": 1.7666910886764526, + "learning_rate": 1.994731799473856e-05, + "loss": 2.4949, + "mean_token_accuracy": 0.47350752353668213, + "num_tokens": 600539577.0, + "step": 1174 + }, + { + "epoch": 0.31773931855056786, + "grad_norm": 1.7353671789169312, + "learning_rate": 1.9947147532073733e-05, + "loss": 2.4793, + "mean_token_accuracy": 0.4887050688266754, + "num_tokens": 601063794.0, + "step": 1175 + }, + { + "epoch": 0.3180097349918875, + "grad_norm": 1.6098997592926025, + "learning_rate": 1.994697679488411e-05, + "loss": 2.4148, + "mean_token_accuracy": 0.501653790473938, + "num_tokens": 601536111.0, + "step": 1176 + }, + { + "epoch": 0.31828015143320715, + "grad_norm": 2.704106092453003, + "learning_rate": 1.9946805783174924e-05, + "loss": 2.5772, + "mean_token_accuracy": 0.4785248041152954, + "num_tokens": 602060357.0, + "step": 1177 + }, + { + "epoch": 0.3185505678745268, + "grad_norm": 2.6633148193359375, + "learning_rate": 1.9946634496951428e-05, + "loss": 2.2827, + "mean_token_accuracy": 0.5352474451065063, + "num_tokens": 602584608.0, + "step": 1178 + }, + { + "epoch": 0.3188209843158464, + "grad_norm": 2.632978916168213, + "learning_rate": 1.9946462936218875e-05, + "loss": 2.5093, + "mean_token_accuracy": 0.491967111825943, + "num_tokens": 603059420.0, + "step": 1179 + }, + { + "epoch": 0.319091400757166, + "grad_norm": 1.794688105583191, + "learning_rate": 1.9946291100982527e-05, + "loss": 2.4546, + "mean_token_accuracy": 0.47121381759643555, + "num_tokens": 603583699.0, + "step": 1180 + }, + { + "epoch": 0.31936181719848566, + "grad_norm": 0.9892809987068176, + "learning_rate": 1.9946118991247657e-05, + "loss": 1.2299, + "mean_token_accuracy": 0.6822737455368042, + "num_tokens": 604065182.0, + "step": 1181 + }, + { + "epoch": 0.3196322336398053, + "grad_norm": 3.556485414505005, + "learning_rate": 1.9945946607019556e-05, + "loss": 2.554, + "mean_token_accuracy": 0.47067779302597046, + "num_tokens": 604530011.0, + "step": 1182 + }, + { + "epoch": 0.31990265008112495, + "grad_norm": 2.9128355979919434, + "learning_rate": 1.9945773948303496e-05, + "loss": 2.4455, + "mean_token_accuracy": 0.4866110682487488, + "num_tokens": 605054208.0, + "step": 1183 + }, + { + "epoch": 0.32017306652244454, + "grad_norm": 1.7822116613388062, + "learning_rate": 1.994560101510479e-05, + "loss": 2.4451, + "mean_token_accuracy": 0.4708520770072937, + "num_tokens": 605578355.0, + "step": 1184 + }, + { + "epoch": 0.3204434829637642, + "grad_norm": 2.094531536102295, + "learning_rate": 1.9945427807428734e-05, + "loss": 2.5026, + "mean_token_accuracy": 0.483147531747818, + "num_tokens": 606053571.0, + "step": 1185 + }, + { + "epoch": 0.3207138994050838, + "grad_norm": 2.49230694770813, + "learning_rate": 1.9945254325280646e-05, + "loss": 2.5482, + "mean_token_accuracy": 0.4750646948814392, + "num_tokens": 606518698.0, + "step": 1186 + }, + { + "epoch": 0.32098431584640347, + "grad_norm": 2.051215887069702, + "learning_rate": 1.9945080568665847e-05, + "loss": 2.4537, + "mean_token_accuracy": 0.4647597372531891, + "num_tokens": 607042899.0, + "step": 1187 + }, + { + "epoch": 0.3212547322877231, + "grad_norm": 2.2238166332244873, + "learning_rate": 1.9944906537589676e-05, + "loss": 2.4709, + "mean_token_accuracy": 0.4848061501979828, + "num_tokens": 607567074.0, + "step": 1188 + }, + { + "epoch": 0.32152514872904275, + "grad_norm": 2.255728244781494, + "learning_rate": 1.9944732232057466e-05, + "loss": 2.682, + "mean_token_accuracy": 0.4527939558029175, + "num_tokens": 608091345.0, + "step": 1189 + }, + { + "epoch": 0.32179556517036234, + "grad_norm": 1.8955293893814087, + "learning_rate": 1.9944557652074563e-05, + "loss": 2.4236, + "mean_token_accuracy": 0.4886109530925751, + "num_tokens": 608615613.0, + "step": 1190 + }, + { + "epoch": 0.322065981611682, + "grad_norm": 2.777909517288208, + "learning_rate": 1.9944382797646328e-05, + "loss": 2.032, + "mean_token_accuracy": 0.5399098992347717, + "num_tokens": 609118958.0, + "step": 1191 + }, + { + "epoch": 0.3223363980530016, + "grad_norm": 2.138815402984619, + "learning_rate": 1.9944207668778127e-05, + "loss": 2.6231, + "mean_token_accuracy": 0.4845825433731079, + "num_tokens": 609544233.0, + "step": 1192 + }, + { + "epoch": 0.32260681449432127, + "grad_norm": 2.4696459770202637, + "learning_rate": 1.994403226547533e-05, + "loss": 2.6033, + "mean_token_accuracy": 0.4780176281929016, + "num_tokens": 610035559.0, + "step": 1193 + }, + { + "epoch": 0.3228772309356409, + "grad_norm": 2.12099289894104, + "learning_rate": 1.994385658774332e-05, + "loss": 2.5451, + "mean_token_accuracy": 0.4605671763420105, + "num_tokens": 610559840.0, + "step": 1194 + }, + { + "epoch": 0.3231476473769605, + "grad_norm": 1.7735563516616821, + "learning_rate": 1.994368063558749e-05, + "loss": 2.2944, + "mean_token_accuracy": 0.49322378635406494, + "num_tokens": 611051574.0, + "step": 1195 + }, + { + "epoch": 0.32341806381828014, + "grad_norm": 2.4149723052978516, + "learning_rate": 1.9943504409013234e-05, + "loss": 2.4649, + "mean_token_accuracy": 0.4872148633003235, + "num_tokens": 611575778.0, + "step": 1196 + }, + { + "epoch": 0.3236884802595998, + "grad_norm": 2.218754768371582, + "learning_rate": 1.9943327908025963e-05, + "loss": 2.626, + "mean_token_accuracy": 0.4633682370185852, + "num_tokens": 612100057.0, + "step": 1197 + }, + { + "epoch": 0.32395889670091943, + "grad_norm": 2.621696949005127, + "learning_rate": 1.994315113263109e-05, + "loss": 2.608, + "mean_token_accuracy": 0.45056790113449097, + "num_tokens": 612624323.0, + "step": 1198 + }, + { + "epoch": 0.3242293131422391, + "grad_norm": 1.6268519163131714, + "learning_rate": 1.9942974082834043e-05, + "loss": 2.5745, + "mean_token_accuracy": 0.47786852717399597, + "num_tokens": 613148525.0, + "step": 1199 + }, + { + "epoch": 0.32449972958355866, + "grad_norm": 1.7326334714889526, + "learning_rate": 1.994279675864025e-05, + "loss": 2.4489, + "mean_token_accuracy": 0.4663427472114563, + "num_tokens": 613672728.0, + "step": 1200 + }, + { + "epoch": 0.3247701460248783, + "grad_norm": 1.1578549146652222, + "learning_rate": 1.9942619160055152e-05, + "loss": 1.1796, + "mean_token_accuracy": 0.6883300542831421, + "num_tokens": 614195575.0, + "step": 1201 + }, + { + "epoch": 0.32504056246619795, + "grad_norm": 2.8840956687927246, + "learning_rate": 1.99424412870842e-05, + "loss": 2.6512, + "mean_token_accuracy": 0.4643203616142273, + "num_tokens": 614719622.0, + "step": 1202 + }, + { + "epoch": 0.3253109789075176, + "grad_norm": 1.9115893840789795, + "learning_rate": 1.9942263139732853e-05, + "loss": 2.5967, + "mean_token_accuracy": 0.45440322160720825, + "num_tokens": 615243904.0, + "step": 1203 + }, + { + "epoch": 0.32558139534883723, + "grad_norm": 1.9616979360580444, + "learning_rate": 1.9942084718006573e-05, + "loss": 2.6623, + "mean_token_accuracy": 0.45990532636642456, + "num_tokens": 615768119.0, + "step": 1204 + }, + { + "epoch": 0.3258518117901568, + "grad_norm": 2.0343284606933594, + "learning_rate": 1.994190602191084e-05, + "loss": 2.4425, + "mean_token_accuracy": 0.47718140482902527, + "num_tokens": 616292219.0, + "step": 1205 + }, + { + "epoch": 0.32612222823147646, + "grad_norm": 1.6976169347763062, + "learning_rate": 1.994172705145113e-05, + "loss": 2.5758, + "mean_token_accuracy": 0.46347612142562866, + "num_tokens": 616816489.0, + "step": 1206 + }, + { + "epoch": 0.3263926446727961, + "grad_norm": 6.184410095214844, + "learning_rate": 1.994154780663294e-05, + "loss": 2.2137, + "mean_token_accuracy": 0.527759313583374, + "num_tokens": 617340602.0, + "step": 1207 + }, + { + "epoch": 0.32666306111411575, + "grad_norm": 2.5379366874694824, + "learning_rate": 1.9941368287461767e-05, + "loss": 2.4539, + "mean_token_accuracy": 0.487726092338562, + "num_tokens": 617864862.0, + "step": 1208 + }, + { + "epoch": 0.3269334775554354, + "grad_norm": 2.034095048904419, + "learning_rate": 1.994118849394312e-05, + "loss": 2.5671, + "mean_token_accuracy": 0.4570958614349365, + "num_tokens": 618388915.0, + "step": 1209 + }, + { + "epoch": 0.327203893996755, + "grad_norm": 1.7659155130386353, + "learning_rate": 1.994100842608252e-05, + "loss": 2.4919, + "mean_token_accuracy": 0.4807034134864807, + "num_tokens": 618913043.0, + "step": 1210 + }, + { + "epoch": 0.3274743104380746, + "grad_norm": 1.9631850719451904, + "learning_rate": 1.994082808388548e-05, + "loss": 2.7109, + "mean_token_accuracy": 0.4623459577560425, + "num_tokens": 619437294.0, + "step": 1211 + }, + { + "epoch": 0.32774472687939427, + "grad_norm": 1.9834755659103394, + "learning_rate": 1.994064746735755e-05, + "loss": 2.5708, + "mean_token_accuracy": 0.45683789253234863, + "num_tokens": 619961517.0, + "step": 1212 + }, + { + "epoch": 0.3280151433207139, + "grad_norm": 1.3967052698135376, + "learning_rate": 1.9940466576504255e-05, + "loss": 2.4221, + "mean_token_accuracy": 0.46647021174430847, + "num_tokens": 620485760.0, + "step": 1213 + }, + { + "epoch": 0.32828555976203355, + "grad_norm": 1.8164687156677246, + "learning_rate": 1.9940285411331158e-05, + "loss": 2.6753, + "mean_token_accuracy": 0.46028047800064087, + "num_tokens": 621009949.0, + "step": 1214 + }, + { + "epoch": 0.32855597620335314, + "grad_norm": 1.9710973501205444, + "learning_rate": 1.994010397184381e-05, + "loss": 2.623, + "mean_token_accuracy": 0.4660157561302185, + "num_tokens": 621497362.0, + "step": 1215 + }, + { + "epoch": 0.3288263926446728, + "grad_norm": 1.65244722366333, + "learning_rate": 1.9939922258047783e-05, + "loss": 2.4429, + "mean_token_accuracy": 0.4785500764846802, + "num_tokens": 621971637.0, + "step": 1216 + }, + { + "epoch": 0.3290968090859924, + "grad_norm": 1.921042561531067, + "learning_rate": 1.9939740269948652e-05, + "loss": 2.4286, + "mean_token_accuracy": 0.4794490337371826, + "num_tokens": 622472995.0, + "step": 1217 + }, + { + "epoch": 0.32936722552731207, + "grad_norm": 2.7046971321105957, + "learning_rate": 1.9939558007551996e-05, + "loss": 2.4978, + "mean_token_accuracy": 0.4741135835647583, + "num_tokens": 622997154.0, + "step": 1218 + }, + { + "epoch": 0.3296376419686317, + "grad_norm": 2.052347183227539, + "learning_rate": 1.9939375470863412e-05, + "loss": 2.5429, + "mean_token_accuracy": 0.49025076627731323, + "num_tokens": 623521356.0, + "step": 1219 + }, + { + "epoch": 0.3299080584099513, + "grad_norm": 2.0874528884887695, + "learning_rate": 1.99391926598885e-05, + "loss": 2.4449, + "mean_token_accuracy": 0.4856048822402954, + "num_tokens": 624015242.0, + "step": 1220 + }, + { + "epoch": 0.33017847485127094, + "grad_norm": 1.1415365934371948, + "learning_rate": 1.9939009574632872e-05, + "loss": 1.1942, + "mean_token_accuracy": 0.6701134443283081, + "num_tokens": 624539433.0, + "step": 1221 + }, + { + "epoch": 0.3304488912925906, + "grad_norm": 4.511559963226318, + "learning_rate": 1.9938826215102137e-05, + "loss": 2.6116, + "mean_token_accuracy": 0.4609801173210144, + "num_tokens": 625053763.0, + "step": 1222 + }, + { + "epoch": 0.33071930773391023, + "grad_norm": 3.4566540718078613, + "learning_rate": 1.993864258130193e-05, + "loss": 2.5812, + "mean_token_accuracy": 0.45052140951156616, + "num_tokens": 625577849.0, + "step": 1223 + }, + { + "epoch": 0.33098972417522987, + "grad_norm": 2.1565980911254883, + "learning_rate": 1.993845867323788e-05, + "loss": 2.5275, + "mean_token_accuracy": 0.48277732729911804, + "num_tokens": 626101970.0, + "step": 1224 + }, + { + "epoch": 0.33126014061654946, + "grad_norm": 3.663970947265625, + "learning_rate": 1.993827449091563e-05, + "loss": 2.353, + "mean_token_accuracy": 0.5048136711120605, + "num_tokens": 626626102.0, + "step": 1225 + }, + { + "epoch": 0.3315305570578691, + "grad_norm": 3.3842151165008545, + "learning_rate": 1.9938090034340834e-05, + "loss": 2.4886, + "mean_token_accuracy": 0.45818662643432617, + "num_tokens": 627150226.0, + "step": 1226 + }, + { + "epoch": 0.33180097349918874, + "grad_norm": 2.792025566101074, + "learning_rate": 1.9937905303519154e-05, + "loss": 2.5823, + "mean_token_accuracy": 0.45267221331596375, + "num_tokens": 627633413.0, + "step": 1227 + }, + { + "epoch": 0.3320713899405084, + "grad_norm": 2.7047502994537354, + "learning_rate": 1.9937720298456252e-05, + "loss": 2.26, + "mean_token_accuracy": 0.5114306211471558, + "num_tokens": 628106399.0, + "step": 1228 + }, + { + "epoch": 0.33234180638182803, + "grad_norm": 2.886542320251465, + "learning_rate": 1.9937535019157804e-05, + "loss": 2.5293, + "mean_token_accuracy": 0.4699050486087799, + "num_tokens": 628630669.0, + "step": 1229 + }, + { + "epoch": 0.3326122228231477, + "grad_norm": 2.414522409439087, + "learning_rate": 1.9937349465629502e-05, + "loss": 2.5189, + "mean_token_accuracy": 0.4881279170513153, + "num_tokens": 629113761.0, + "step": 1230 + }, + { + "epoch": 0.33288263926446726, + "grad_norm": 2.876652717590332, + "learning_rate": 1.9937163637877034e-05, + "loss": 2.5497, + "mean_token_accuracy": 0.4657168984413147, + "num_tokens": 629637973.0, + "step": 1231 + }, + { + "epoch": 0.3331530557057869, + "grad_norm": 2.859726905822754, + "learning_rate": 1.9936977535906104e-05, + "loss": 2.4307, + "mean_token_accuracy": 0.4952360987663269, + "num_tokens": 630162240.0, + "step": 1232 + }, + { + "epoch": 0.33342347214710655, + "grad_norm": 2.314060926437378, + "learning_rate": 1.9936791159722422e-05, + "loss": 2.4792, + "mean_token_accuracy": 0.4689619541168213, + "num_tokens": 630686343.0, + "step": 1233 + }, + { + "epoch": 0.3336938885884262, + "grad_norm": 1.8208414316177368, + "learning_rate": 1.99366045093317e-05, + "loss": 2.3971, + "mean_token_accuracy": 0.478915810585022, + "num_tokens": 631210580.0, + "step": 1234 + }, + { + "epoch": 0.33396430502974583, + "grad_norm": 2.0495991706848145, + "learning_rate": 1.993641758473968e-05, + "loss": 2.1583, + "mean_token_accuracy": 0.5435237884521484, + "num_tokens": 631663426.0, + "step": 1235 + }, + { + "epoch": 0.3342347214710654, + "grad_norm": 2.1647660732269287, + "learning_rate": 1.9936230385952083e-05, + "loss": 2.3834, + "mean_token_accuracy": 0.4871935248374939, + "num_tokens": 632187664.0, + "step": 1236 + }, + { + "epoch": 0.33450513791238506, + "grad_norm": 1.71205735206604, + "learning_rate": 1.993604291297466e-05, + "loss": 2.5818, + "mean_token_accuracy": 0.4724486470222473, + "num_tokens": 632666638.0, + "step": 1237 + }, + { + "epoch": 0.3347755543537047, + "grad_norm": 2.047938108444214, + "learning_rate": 1.9935855165813157e-05, + "loss": 2.4418, + "mean_token_accuracy": 0.4837847352027893, + "num_tokens": 633190866.0, + "step": 1238 + }, + { + "epoch": 0.33504597079502435, + "grad_norm": 2.155824899673462, + "learning_rate": 1.9935667144473344e-05, + "loss": 2.6669, + "mean_token_accuracy": 0.46361321210861206, + "num_tokens": 633715111.0, + "step": 1239 + }, + { + "epoch": 0.335316387236344, + "grad_norm": 2.1639251708984375, + "learning_rate": 1.9935478848960983e-05, + "loss": 2.5009, + "mean_token_accuracy": 0.4738827347755432, + "num_tokens": 634239251.0, + "step": 1240 + }, + { + "epoch": 0.3355868036776636, + "grad_norm": 1.1557111740112305, + "learning_rate": 1.993529027928185e-05, + "loss": 1.305, + "mean_token_accuracy": 0.6751828789710999, + "num_tokens": 634679609.0, + "step": 1241 + }, + { + "epoch": 0.3358572201189832, + "grad_norm": 3.9697377681732178, + "learning_rate": 1.993510143544174e-05, + "loss": 2.4759, + "mean_token_accuracy": 0.4976961016654968, + "num_tokens": 635186429.0, + "step": 1242 + }, + { + "epoch": 0.33612763656030287, + "grad_norm": 2.6602377891540527, + "learning_rate": 1.9934912317446438e-05, + "loss": 2.3893, + "mean_token_accuracy": 0.4917733073234558, + "num_tokens": 635664792.0, + "step": 1243 + }, + { + "epoch": 0.3363980530016225, + "grad_norm": 1.789338231086731, + "learning_rate": 1.993472292530175e-05, + "loss": 2.3874, + "mean_token_accuracy": 0.48941728472709656, + "num_tokens": 636188978.0, + "step": 1244 + }, + { + "epoch": 0.33666846944294215, + "grad_norm": 2.4239306449890137, + "learning_rate": 1.9934533259013487e-05, + "loss": 2.5962, + "mean_token_accuracy": 0.4718968868255615, + "num_tokens": 636713058.0, + "step": 1245 + }, + { + "epoch": 0.33693888588426174, + "grad_norm": 1.797940969467163, + "learning_rate": 1.9934343318587473e-05, + "loss": 2.3299, + "mean_token_accuracy": 0.5144447088241577, + "num_tokens": 637237242.0, + "step": 1246 + }, + { + "epoch": 0.3372093023255814, + "grad_norm": 1.7738754749298096, + "learning_rate": 1.9934153104029528e-05, + "loss": 2.5545, + "mean_token_accuracy": 0.4743107557296753, + "num_tokens": 637733143.0, + "step": 1247 + }, + { + "epoch": 0.337479718766901, + "grad_norm": 1.5320414304733276, + "learning_rate": 1.993396261534549e-05, + "loss": 2.4418, + "mean_token_accuracy": 0.47950291633605957, + "num_tokens": 638257289.0, + "step": 1248 + }, + { + "epoch": 0.33775013520822067, + "grad_norm": 1.5651546716690063, + "learning_rate": 1.9933771852541212e-05, + "loss": 2.5499, + "mean_token_accuracy": 0.46492716670036316, + "num_tokens": 638781531.0, + "step": 1249 + }, + { + "epoch": 0.3380205516495403, + "grad_norm": 1.485479712486267, + "learning_rate": 1.993358081562254e-05, + "loss": 2.3864, + "mean_token_accuracy": 0.4875794053077698, + "num_tokens": 639305809.0, + "step": 1250 + }, + { + "epoch": 0.3382909680908599, + "grad_norm": 1.681867241859436, + "learning_rate": 1.9933389504595333e-05, + "loss": 2.4959, + "mean_token_accuracy": 0.46913671493530273, + "num_tokens": 639829997.0, + "step": 1251 + }, + { + "epoch": 0.33856138453217954, + "grad_norm": 1.640411615371704, + "learning_rate": 1.9933197919465468e-05, + "loss": 2.2964, + "mean_token_accuracy": 0.5015050768852234, + "num_tokens": 640354266.0, + "step": 1252 + }, + { + "epoch": 0.3388318009734992, + "grad_norm": 1.524244785308838, + "learning_rate": 1.993300606023882e-05, + "loss": 2.5149, + "mean_token_accuracy": 0.4794830083847046, + "num_tokens": 640878471.0, + "step": 1253 + }, + { + "epoch": 0.33910221741481883, + "grad_norm": 1.6203545331954956, + "learning_rate": 1.9932813926921272e-05, + "loss": 2.6154, + "mean_token_accuracy": 0.4784323573112488, + "num_tokens": 641340917.0, + "step": 1254 + }, + { + "epoch": 0.3393726338561385, + "grad_norm": 2.018852472305298, + "learning_rate": 1.9932621519518724e-05, + "loss": 2.536, + "mean_token_accuracy": 0.48573604226112366, + "num_tokens": 641865004.0, + "step": 1255 + }, + { + "epoch": 0.33964305029745806, + "grad_norm": 1.4113759994506836, + "learning_rate": 1.993242883803708e-05, + "loss": 2.5243, + "mean_token_accuracy": 0.4672988951206207, + "num_tokens": 642389246.0, + "step": 1256 + }, + { + "epoch": 0.3399134667387777, + "grad_norm": 2.9008681774139404, + "learning_rate": 1.9932235882482253e-05, + "loss": 2.3933, + "mean_token_accuracy": 0.5229164361953735, + "num_tokens": 642913513.0, + "step": 1257 + }, + { + "epoch": 0.34018388318009735, + "grad_norm": 2.031961441040039, + "learning_rate": 1.9932042652860154e-05, + "loss": 2.3769, + "mean_token_accuracy": 0.4703715443611145, + "num_tokens": 643437561.0, + "step": 1258 + }, + { + "epoch": 0.340454299621417, + "grad_norm": 1.626266360282898, + "learning_rate": 1.9931849149176727e-05, + "loss": 2.4054, + "mean_token_accuracy": 0.4799870252609253, + "num_tokens": 643933593.0, + "step": 1259 + }, + { + "epoch": 0.34072471606273663, + "grad_norm": 1.467713713645935, + "learning_rate": 1.9931655371437898e-05, + "loss": 2.4889, + "mean_token_accuracy": 0.4793611764907837, + "num_tokens": 644434696.0, + "step": 1260 + }, + { + "epoch": 0.3409951325040562, + "grad_norm": 1.3319545984268188, + "learning_rate": 1.9931461319649618e-05, + "loss": 1.2699, + "mean_token_accuracy": 0.675165057182312, + "num_tokens": 644958937.0, + "step": 1261 + }, + { + "epoch": 0.34126554894537586, + "grad_norm": 2.779221296310425, + "learning_rate": 1.993126699381783e-05, + "loss": 2.4191, + "mean_token_accuracy": 0.5025346279144287, + "num_tokens": 645483170.0, + "step": 1262 + }, + { + "epoch": 0.3415359653866955, + "grad_norm": 2.6016931533813477, + "learning_rate": 1.9931072393948515e-05, + "loss": 2.5324, + "mean_token_accuracy": 0.4790154695510864, + "num_tokens": 646007347.0, + "step": 1263 + }, + { + "epoch": 0.34180638182801515, + "grad_norm": 2.1240522861480713, + "learning_rate": 1.9930877520047634e-05, + "loss": 2.4972, + "mean_token_accuracy": 0.4674075245857239, + "num_tokens": 646531620.0, + "step": 1264 + }, + { + "epoch": 0.3420767982693348, + "grad_norm": 2.3845129013061523, + "learning_rate": 1.9930682372121163e-05, + "loss": 2.4109, + "mean_token_accuracy": 0.48548781871795654, + "num_tokens": 647055824.0, + "step": 1265 + }, + { + "epoch": 0.34234721471065443, + "grad_norm": 2.3984618186950684, + "learning_rate": 1.9930486950175096e-05, + "loss": 2.5421, + "mean_token_accuracy": 0.4765290319919586, + "num_tokens": 647479743.0, + "step": 1266 + }, + { + "epoch": 0.342617631151974, + "grad_norm": 2.2229855060577393, + "learning_rate": 1.9930291254215424e-05, + "loss": 2.442, + "mean_token_accuracy": 0.48351556062698364, + "num_tokens": 648004016.0, + "step": 1267 + }, + { + "epoch": 0.34288804759329367, + "grad_norm": 2.1332898139953613, + "learning_rate": 1.9930095284248157e-05, + "loss": 2.5243, + "mean_token_accuracy": 0.4715539813041687, + "num_tokens": 648528265.0, + "step": 1268 + }, + { + "epoch": 0.3431584640346133, + "grad_norm": 1.948883295059204, + "learning_rate": 1.9929899040279304e-05, + "loss": 2.5011, + "mean_token_accuracy": 0.473676472902298, + "num_tokens": 649013393.0, + "step": 1269 + }, + { + "epoch": 0.34342888047593295, + "grad_norm": 1.9363702535629272, + "learning_rate": 1.9929702522314887e-05, + "loss": 2.4948, + "mean_token_accuracy": 0.48190343379974365, + "num_tokens": 649537673.0, + "step": 1270 + }, + { + "epoch": 0.3436992969172526, + "grad_norm": 1.9241639375686646, + "learning_rate": 1.9929505730360935e-05, + "loss": 2.4238, + "mean_token_accuracy": 0.4872969388961792, + "num_tokens": 650058285.0, + "step": 1271 + }, + { + "epoch": 0.3439697133585722, + "grad_norm": 1.9338539838790894, + "learning_rate": 1.9929308664423487e-05, + "loss": 2.2791, + "mean_token_accuracy": 0.49366816878318787, + "num_tokens": 650536320.0, + "step": 1272 + }, + { + "epoch": 0.3442401297998918, + "grad_norm": 1.4762192964553833, + "learning_rate": 1.9929111324508594e-05, + "loss": 2.4355, + "mean_token_accuracy": 0.4679284691810608, + "num_tokens": 651060538.0, + "step": 1273 + }, + { + "epoch": 0.34451054624121147, + "grad_norm": 2.216736316680908, + "learning_rate": 1.9928913710622303e-05, + "loss": 2.4938, + "mean_token_accuracy": 0.4902389347553253, + "num_tokens": 651555477.0, + "step": 1274 + }, + { + "epoch": 0.3447809626825311, + "grad_norm": 2.1273820400238037, + "learning_rate": 1.992871582277068e-05, + "loss": 2.2651, + "mean_token_accuracy": 0.5303987264633179, + "num_tokens": 652079655.0, + "step": 1275 + }, + { + "epoch": 0.34505137912385075, + "grad_norm": 2.0528109073638916, + "learning_rate": 1.99285176609598e-05, + "loss": 2.4906, + "mean_token_accuracy": 0.4740990698337555, + "num_tokens": 652588748.0, + "step": 1276 + }, + { + "epoch": 0.34532179556517034, + "grad_norm": 1.9569131135940552, + "learning_rate": 1.992831922519574e-05, + "loss": 2.4012, + "mean_token_accuracy": 0.48080116510391235, + "num_tokens": 653112923.0, + "step": 1277 + }, + { + "epoch": 0.34559221200649, + "grad_norm": 2.0694422721862793, + "learning_rate": 1.992812051548459e-05, + "loss": 2.2955, + "mean_token_accuracy": 0.49964603781700134, + "num_tokens": 653637182.0, + "step": 1278 + }, + { + "epoch": 0.34586262844780963, + "grad_norm": 2.094541072845459, + "learning_rate": 1.9927921531832445e-05, + "loss": 2.4392, + "mean_token_accuracy": 0.4869632124900818, + "num_tokens": 654161292.0, + "step": 1279 + }, + { + "epoch": 0.34613304488912927, + "grad_norm": 1.7733380794525146, + "learning_rate": 1.9927722274245415e-05, + "loss": 2.5585, + "mean_token_accuracy": 0.47354936599731445, + "num_tokens": 654685492.0, + "step": 1280 + }, + { + "epoch": 0.3464034613304489, + "grad_norm": 0.9996545910835266, + "learning_rate": 1.992752274272961e-05, + "loss": 1.2573, + "mean_token_accuracy": 0.6713170409202576, + "num_tokens": 655209753.0, + "step": 1281 + }, + { + "epoch": 0.3466738777717685, + "grad_norm": 2.6001360416412354, + "learning_rate": 1.9927322937291153e-05, + "loss": 2.3504, + "mean_token_accuracy": 0.4835253655910492, + "num_tokens": 655733980.0, + "step": 1282 + }, + { + "epoch": 0.34694429421308814, + "grad_norm": 2.3241195678710938, + "learning_rate": 1.9927122857936177e-05, + "loss": 2.5116, + "mean_token_accuracy": 0.47395598888397217, + "num_tokens": 656244261.0, + "step": 1283 + }, + { + "epoch": 0.3472147106544078, + "grad_norm": 1.8656083345413208, + "learning_rate": 1.9926922504670816e-05, + "loss": 2.588, + "mean_token_accuracy": 0.46503788232803345, + "num_tokens": 656768467.0, + "step": 1284 + }, + { + "epoch": 0.34748512709572743, + "grad_norm": 2.028765916824341, + "learning_rate": 1.992672187750122e-05, + "loss": 2.4283, + "mean_token_accuracy": 0.476319819688797, + "num_tokens": 657242261.0, + "step": 1285 + }, + { + "epoch": 0.3477555435370471, + "grad_norm": 2.208834171295166, + "learning_rate": 1.9926520976433545e-05, + "loss": 2.4312, + "mean_token_accuracy": 0.4887404441833496, + "num_tokens": 657766504.0, + "step": 1286 + }, + { + "epoch": 0.34802595997836666, + "grad_norm": 2.4312498569488525, + "learning_rate": 1.9926319801473957e-05, + "loss": 2.2568, + "mean_token_accuracy": 0.5366413593292236, + "num_tokens": 658290774.0, + "step": 1287 + }, + { + "epoch": 0.3482963764196863, + "grad_norm": 2.084721326828003, + "learning_rate": 1.9926118352628627e-05, + "loss": 2.4966, + "mean_token_accuracy": 0.49646398425102234, + "num_tokens": 658763006.0, + "step": 1288 + }, + { + "epoch": 0.34856679286100595, + "grad_norm": 2.1037230491638184, + "learning_rate": 1.9925916629903734e-05, + "loss": 2.6634, + "mean_token_accuracy": 0.4701985716819763, + "num_tokens": 659287210.0, + "step": 1289 + }, + { + "epoch": 0.3488372093023256, + "grad_norm": 2.201263427734375, + "learning_rate": 1.992571463330547e-05, + "loss": 2.4661, + "mean_token_accuracy": 0.5084021687507629, + "num_tokens": 659746963.0, + "step": 1290 + }, + { + "epoch": 0.34910762574364523, + "grad_norm": 2.6360180377960205, + "learning_rate": 1.9925512362840035e-05, + "loss": 2.5872, + "mean_token_accuracy": 0.47765856981277466, + "num_tokens": 660270063.0, + "step": 1291 + }, + { + "epoch": 0.3493780421849648, + "grad_norm": 1.9667267799377441, + "learning_rate": 1.9925309818513624e-05, + "loss": 2.5094, + "mean_token_accuracy": 0.48211416602134705, + "num_tokens": 660794306.0, + "step": 1292 + }, + { + "epoch": 0.34964845862628446, + "grad_norm": 2.053791046142578, + "learning_rate": 1.992510700033247e-05, + "loss": 2.2763, + "mean_token_accuracy": 0.5156081914901733, + "num_tokens": 661237754.0, + "step": 1293 + }, + { + "epoch": 0.3499188750676041, + "grad_norm": 1.7624810934066772, + "learning_rate": 1.9924903908302782e-05, + "loss": 2.4576, + "mean_token_accuracy": 0.46385765075683594, + "num_tokens": 661761917.0, + "step": 1294 + }, + { + "epoch": 0.35018929150892375, + "grad_norm": 1.569079041481018, + "learning_rate": 1.9924700542430798e-05, + "loss": 2.434, + "mean_token_accuracy": 0.49614661931991577, + "num_tokens": 662285895.0, + "step": 1295 + }, + { + "epoch": 0.3504597079502434, + "grad_norm": 2.1854701042175293, + "learning_rate": 1.9924496902722752e-05, + "loss": 2.3969, + "mean_token_accuracy": 0.4956878423690796, + "num_tokens": 662810162.0, + "step": 1296 + }, + { + "epoch": 0.350730124391563, + "grad_norm": 1.995469570159912, + "learning_rate": 1.9924292989184896e-05, + "loss": 2.4534, + "mean_token_accuracy": 0.4771658778190613, + "num_tokens": 663334336.0, + "step": 1297 + }, + { + "epoch": 0.3510005408328826, + "grad_norm": 4.9507155418396, + "learning_rate": 1.992408880182349e-05, + "loss": 2.1785, + "mean_token_accuracy": 0.5304554104804993, + "num_tokens": 663858558.0, + "step": 1298 + }, + { + "epoch": 0.35127095727420227, + "grad_norm": 2.945516586303711, + "learning_rate": 1.9923884340644793e-05, + "loss": 2.2737, + "mean_token_accuracy": 0.5329157114028931, + "num_tokens": 664358573.0, + "step": 1299 + }, + { + "epoch": 0.3515413737155219, + "grad_norm": 2.238769054412842, + "learning_rate": 1.992367960565508e-05, + "loss": 2.1591, + "mean_token_accuracy": 0.5167820453643799, + "num_tokens": 664882711.0, + "step": 1300 + }, + { + "epoch": 0.35181179015684155, + "grad_norm": 0.9885656237602234, + "learning_rate": 1.9923474596860637e-05, + "loss": 1.245, + "mean_token_accuracy": 0.6726996898651123, + "num_tokens": 665406739.0, + "step": 1301 + }, + { + "epoch": 0.3520822065981612, + "grad_norm": 2.3940391540527344, + "learning_rate": 1.9923269314267748e-05, + "loss": 2.4004, + "mean_token_accuracy": 0.47948533296585083, + "num_tokens": 665897716.0, + "step": 1302 + }, + { + "epoch": 0.3523526230394808, + "grad_norm": 2.199899196624756, + "learning_rate": 1.9923063757882717e-05, + "loss": 2.3886, + "mean_token_accuracy": 0.4778539538383484, + "num_tokens": 666421844.0, + "step": 1303 + }, + { + "epoch": 0.3526230394808004, + "grad_norm": 1.522210955619812, + "learning_rate": 1.9922857927711847e-05, + "loss": 2.4217, + "mean_token_accuracy": 0.48574915528297424, + "num_tokens": 666928586.0, + "step": 1304 + }, + { + "epoch": 0.35289345592212007, + "grad_norm": 2.5707993507385254, + "learning_rate": 1.9922651823761456e-05, + "loss": 2.5167, + "mean_token_accuracy": 0.48779189586639404, + "num_tokens": 667452759.0, + "step": 1305 + }, + { + "epoch": 0.3531638723634397, + "grad_norm": 2.332793712615967, + "learning_rate": 1.9922445446037867e-05, + "loss": 2.4528, + "mean_token_accuracy": 0.482562780380249, + "num_tokens": 667976942.0, + "step": 1306 + }, + { + "epoch": 0.35343428880475936, + "grad_norm": 2.171624183654785, + "learning_rate": 1.9922238794547416e-05, + "loss": 2.374, + "mean_token_accuracy": 0.4741695523262024, + "num_tokens": 668501161.0, + "step": 1307 + }, + { + "epoch": 0.35370470524607894, + "grad_norm": 2.0177550315856934, + "learning_rate": 1.9922031869296437e-05, + "loss": 2.3802, + "mean_token_accuracy": 0.4785923957824707, + "num_tokens": 669025377.0, + "step": 1308 + }, + { + "epoch": 0.3539751216873986, + "grad_norm": 2.340259075164795, + "learning_rate": 1.9921824670291285e-05, + "loss": 2.276, + "mean_token_accuracy": 0.4967786371707916, + "num_tokens": 669549597.0, + "step": 1309 + }, + { + "epoch": 0.35424553812871823, + "grad_norm": 1.9396625757217407, + "learning_rate": 1.992161719753831e-05, + "loss": 2.375, + "mean_token_accuracy": 0.5146434307098389, + "num_tokens": 670073790.0, + "step": 1310 + }, + { + "epoch": 0.3545159545700379, + "grad_norm": 63.57628631591797, + "learning_rate": 1.9921409451043882e-05, + "loss": 2.4265, + "mean_token_accuracy": 0.467729389667511, + "num_tokens": 670598076.0, + "step": 1311 + }, + { + "epoch": 0.3547863710113575, + "grad_norm": 3.113905191421509, + "learning_rate": 1.9921201430814382e-05, + "loss": 2.4542, + "mean_token_accuracy": 0.4668581187725067, + "num_tokens": 671122276.0, + "step": 1312 + }, + { + "epoch": 0.3550567874526771, + "grad_norm": 2.546865224838257, + "learning_rate": 1.9920993136856183e-05, + "loss": 2.5212, + "mean_token_accuracy": 0.48023855686187744, + "num_tokens": 671618155.0, + "step": 1313 + }, + { + "epoch": 0.35532720389399675, + "grad_norm": 1.8942055702209473, + "learning_rate": 1.9920784569175678e-05, + "loss": 2.4952, + "mean_token_accuracy": 0.4542822241783142, + "num_tokens": 672121413.0, + "step": 1314 + }, + { + "epoch": 0.3555976203353164, + "grad_norm": 2.799471855163574, + "learning_rate": 1.9920575727779272e-05, + "loss": 2.478, + "mean_token_accuracy": 0.481543630361557, + "num_tokens": 672645696.0, + "step": 1315 + }, + { + "epoch": 0.35586803677663603, + "grad_norm": 2.9744019508361816, + "learning_rate": 1.9920366612673364e-05, + "loss": 2.6926, + "mean_token_accuracy": 0.4642952084541321, + "num_tokens": 673169877.0, + "step": 1316 + }, + { + "epoch": 0.3561384532179557, + "grad_norm": 2.1888632774353027, + "learning_rate": 1.992015722386438e-05, + "loss": 2.4253, + "mean_token_accuracy": 0.48346632719039917, + "num_tokens": 673694020.0, + "step": 1317 + }, + { + "epoch": 0.35640886965927526, + "grad_norm": 2.311406373977661, + "learning_rate": 1.991994756135874e-05, + "loss": 2.5764, + "mean_token_accuracy": 0.47363853454589844, + "num_tokens": 674218234.0, + "step": 1318 + }, + { + "epoch": 0.3566792861005949, + "grad_norm": 2.9283599853515625, + "learning_rate": 1.991973762516287e-05, + "loss": 2.5035, + "mean_token_accuracy": 0.4855358898639679, + "num_tokens": 674736653.0, + "step": 1319 + }, + { + "epoch": 0.35694970254191455, + "grad_norm": 1.6945557594299316, + "learning_rate": 1.9919527415283227e-05, + "loss": 2.4337, + "mean_token_accuracy": 0.4945372939109802, + "num_tokens": 675211423.0, + "step": 1320 + }, + { + "epoch": 0.3572201189832342, + "grad_norm": 1.0639594793319702, + "learning_rate": 1.9919316931726248e-05, + "loss": 1.2218, + "mean_token_accuracy": 0.692003607749939, + "num_tokens": 675735596.0, + "step": 1321 + }, + { + "epoch": 0.35749053542455383, + "grad_norm": 3.2320048809051514, + "learning_rate": 1.9919106174498396e-05, + "loss": 2.4294, + "mean_token_accuracy": 0.47737354040145874, + "num_tokens": 676259787.0, + "step": 1322 + }, + { + "epoch": 0.3577609518658734, + "grad_norm": 3.467179298400879, + "learning_rate": 1.991889514360614e-05, + "loss": 2.4539, + "mean_token_accuracy": 0.49351513385772705, + "num_tokens": 676723396.0, + "step": 1323 + }, + { + "epoch": 0.35803136830719307, + "grad_norm": 1.7762491703033447, + "learning_rate": 1.991868383905595e-05, + "loss": 2.4153, + "mean_token_accuracy": 0.5001305341720581, + "num_tokens": 677247485.0, + "step": 1324 + }, + { + "epoch": 0.3583017847485127, + "grad_norm": 2.004507064819336, + "learning_rate": 1.9918472260854312e-05, + "loss": 2.4264, + "mean_token_accuracy": 0.47522926330566406, + "num_tokens": 677690326.0, + "step": 1325 + }, + { + "epoch": 0.35857220118983235, + "grad_norm": 2.2598876953125, + "learning_rate": 1.9918260409007713e-05, + "loss": 2.3551, + "mean_token_accuracy": 0.47791609168052673, + "num_tokens": 678214477.0, + "step": 1326 + }, + { + "epoch": 0.358842617631152, + "grad_norm": 1.6169179677963257, + "learning_rate": 1.9918048283522666e-05, + "loss": 2.2432, + "mean_token_accuracy": 0.49343186616897583, + "num_tokens": 678738756.0, + "step": 1327 + }, + { + "epoch": 0.3591130340724716, + "grad_norm": 1.5773367881774902, + "learning_rate": 1.9917835884405664e-05, + "loss": 2.3884, + "mean_token_accuracy": 0.478817343711853, + "num_tokens": 679263016.0, + "step": 1328 + }, + { + "epoch": 0.3593834505137912, + "grad_norm": 1.8536313772201538, + "learning_rate": 1.9917623211663237e-05, + "loss": 2.456, + "mean_token_accuracy": 0.47532111406326294, + "num_tokens": 679745129.0, + "step": 1329 + }, + { + "epoch": 0.35965386695511087, + "grad_norm": 1.569069266319275, + "learning_rate": 1.99174102653019e-05, + "loss": 2.5063, + "mean_token_accuracy": 0.47475844621658325, + "num_tokens": 680248517.0, + "step": 1330 + }, + { + "epoch": 0.3599242833964305, + "grad_norm": 1.4534175395965576, + "learning_rate": 1.9917197045328197e-05, + "loss": 2.3101, + "mean_token_accuracy": 0.514134407043457, + "num_tokens": 680772614.0, + "step": 1331 + }, + { + "epoch": 0.36019469983775015, + "grad_norm": 2.066520929336548, + "learning_rate": 1.9916983551748662e-05, + "loss": 2.5018, + "mean_token_accuracy": 0.49415504932403564, + "num_tokens": 681296777.0, + "step": 1332 + }, + { + "epoch": 0.36046511627906974, + "grad_norm": 1.5810426473617554, + "learning_rate": 1.991676978456985e-05, + "loss": 2.3901, + "mean_token_accuracy": 0.48405933380126953, + "num_tokens": 681820957.0, + "step": 1333 + }, + { + "epoch": 0.3607355327203894, + "grad_norm": 1.611004114151001, + "learning_rate": 1.9916555743798315e-05, + "loss": 2.4853, + "mean_token_accuracy": 0.4710841774940491, + "num_tokens": 682345140.0, + "step": 1334 + }, + { + "epoch": 0.361005949161709, + "grad_norm": 1.436521291732788, + "learning_rate": 1.991634142944063e-05, + "loss": 2.3978, + "mean_token_accuracy": 0.4798339605331421, + "num_tokens": 682869333.0, + "step": 1335 + }, + { + "epoch": 0.36127636560302867, + "grad_norm": 1.3601466417312622, + "learning_rate": 1.9916126841503365e-05, + "loss": 2.3203, + "mean_token_accuracy": 0.5065519213676453, + "num_tokens": 683289839.0, + "step": 1336 + }, + { + "epoch": 0.3615467820443483, + "grad_norm": 1.4087756872177124, + "learning_rate": 1.9915911979993115e-05, + "loss": 2.3419, + "mean_token_accuracy": 0.5066505074501038, + "num_tokens": 683814060.0, + "step": 1337 + }, + { + "epoch": 0.3618171984856679, + "grad_norm": 1.6808340549468994, + "learning_rate": 1.9915696844916465e-05, + "loss": 2.3796, + "mean_token_accuracy": 0.47673672437667847, + "num_tokens": 684306362.0, + "step": 1338 + }, + { + "epoch": 0.36208761492698754, + "grad_norm": 1.9050136804580688, + "learning_rate": 1.991548143628001e-05, + "loss": 2.6001, + "mean_token_accuracy": 0.4828876852989197, + "num_tokens": 684788956.0, + "step": 1339 + }, + { + "epoch": 0.3623580313683072, + "grad_norm": 2.102975845336914, + "learning_rate": 1.991526575409037e-05, + "loss": 2.638, + "mean_token_accuracy": 0.4635613262653351, + "num_tokens": 685313132.0, + "step": 1340 + }, + { + "epoch": 0.36262844780962683, + "grad_norm": 1.0164637565612793, + "learning_rate": 1.9915049798354156e-05, + "loss": 1.2171, + "mean_token_accuracy": 0.685657262802124, + "num_tokens": 685837331.0, + "step": 1341 + }, + { + "epoch": 0.3628988642509465, + "grad_norm": 3.18550443649292, + "learning_rate": 1.9914833569078e-05, + "loss": 2.4705, + "mean_token_accuracy": 0.5154510736465454, + "num_tokens": 686361558.0, + "step": 1342 + }, + { + "epoch": 0.3631692806922661, + "grad_norm": 2.571228504180908, + "learning_rate": 1.991461706626853e-05, + "loss": 2.5339, + "mean_token_accuracy": 0.46510758996009827, + "num_tokens": 686885738.0, + "step": 1343 + }, + { + "epoch": 0.3634396971335857, + "grad_norm": 1.5122127532958984, + "learning_rate": 1.9914400289932395e-05, + "loss": 2.4858, + "mean_token_accuracy": 0.4888424873352051, + "num_tokens": 687409909.0, + "step": 1344 + }, + { + "epoch": 0.36371011357490535, + "grad_norm": 2.200660228729248, + "learning_rate": 1.9914183240076242e-05, + "loss": 2.3406, + "mean_token_accuracy": 0.4996360242366791, + "num_tokens": 687934151.0, + "step": 1345 + }, + { + "epoch": 0.363980530016225, + "grad_norm": 1.8444960117340088, + "learning_rate": 1.9913965916706734e-05, + "loss": 2.3581, + "mean_token_accuracy": 0.49326616525650024, + "num_tokens": 688458431.0, + "step": 1346 + }, + { + "epoch": 0.36425094645754463, + "grad_norm": 2.2006735801696777, + "learning_rate": 1.9913748319830537e-05, + "loss": 2.4879, + "mean_token_accuracy": 0.48446783423423767, + "num_tokens": 688982647.0, + "step": 1347 + }, + { + "epoch": 0.3645213628988643, + "grad_norm": 1.980364441871643, + "learning_rate": 1.9913530449454324e-05, + "loss": 2.4927, + "mean_token_accuracy": 0.47993189096450806, + "num_tokens": 689506783.0, + "step": 1348 + }, + { + "epoch": 0.36479177934018386, + "grad_norm": 1.5819168090820312, + "learning_rate": 1.991331230558479e-05, + "loss": 2.4043, + "mean_token_accuracy": 0.4770817160606384, + "num_tokens": 690030866.0, + "step": 1349 + }, + { + "epoch": 0.3650621957815035, + "grad_norm": 2.0321316719055176, + "learning_rate": 1.991309388822862e-05, + "loss": 2.447, + "mean_token_accuracy": 0.4781062602996826, + "num_tokens": 690555004.0, + "step": 1350 + }, + { + "epoch": 0.36533261222282315, + "grad_norm": 2.1908340454101562, + "learning_rate": 1.9912875197392513e-05, + "loss": 2.419, + "mean_token_accuracy": 0.48576003313064575, + "num_tokens": 691045095.0, + "step": 1351 + }, + { + "epoch": 0.3656030286641428, + "grad_norm": 1.5494036674499512, + "learning_rate": 1.991265623308319e-05, + "loss": 2.4407, + "mean_token_accuracy": 0.49260175228118896, + "num_tokens": 691569260.0, + "step": 1352 + }, + { + "epoch": 0.36587344510546244, + "grad_norm": 1.8434593677520752, + "learning_rate": 1.991243699530736e-05, + "loss": 2.5423, + "mean_token_accuracy": 0.47943606972694397, + "num_tokens": 692062294.0, + "step": 1353 + }, + { + "epoch": 0.366143861546782, + "grad_norm": 2.146144390106201, + "learning_rate": 1.9912217484071756e-05, + "loss": 2.4107, + "mean_token_accuracy": 0.4876396059989929, + "num_tokens": 692586468.0, + "step": 1354 + }, + { + "epoch": 0.36641427798810167, + "grad_norm": 1.681550145149231, + "learning_rate": 1.9911997699383107e-05, + "loss": 2.2327, + "mean_token_accuracy": 0.47792214155197144, + "num_tokens": 693110599.0, + "step": 1355 + }, + { + "epoch": 0.3666846944294213, + "grad_norm": 2.3832461833953857, + "learning_rate": 1.991177764124816e-05, + "loss": 2.4801, + "mean_token_accuracy": 0.4904767870903015, + "num_tokens": 693634753.0, + "step": 1356 + }, + { + "epoch": 0.36695511087074095, + "grad_norm": 2.193222761154175, + "learning_rate": 1.9911557309673673e-05, + "loss": 2.5576, + "mean_token_accuracy": 0.49283522367477417, + "num_tokens": 694120320.0, + "step": 1357 + }, + { + "epoch": 0.3672255273120606, + "grad_norm": 1.89323091506958, + "learning_rate": 1.9911336704666396e-05, + "loss": 2.369, + "mean_token_accuracy": 0.4656599164009094, + "num_tokens": 694644526.0, + "step": 1358 + }, + { + "epoch": 0.3674959437533802, + "grad_norm": 1.9773210287094116, + "learning_rate": 1.9911115826233102e-05, + "loss": 2.3241, + "mean_token_accuracy": 0.4899287223815918, + "num_tokens": 695168543.0, + "step": 1359 + }, + { + "epoch": 0.3677663601946998, + "grad_norm": 2.309678792953491, + "learning_rate": 1.991089467438057e-05, + "loss": 2.2135, + "mean_token_accuracy": 0.5205296277999878, + "num_tokens": 695663738.0, + "step": 1360 + }, + { + "epoch": 0.36803677663601947, + "grad_norm": 0.9641743898391724, + "learning_rate": 1.9910673249115583e-05, + "loss": 1.2428, + "mean_token_accuracy": 0.6804561614990234, + "num_tokens": 696187978.0, + "step": 1361 + }, + { + "epoch": 0.3683071930773391, + "grad_norm": 3.1439104080200195, + "learning_rate": 1.9910451550444938e-05, + "loss": 2.5145, + "mean_token_accuracy": 0.4729337990283966, + "num_tokens": 696712186.0, + "step": 1362 + }, + { + "epoch": 0.36857760951865876, + "grad_norm": 2.901932716369629, + "learning_rate": 1.9910229578375432e-05, + "loss": 2.5741, + "mean_token_accuracy": 0.4655648469924927, + "num_tokens": 697236457.0, + "step": 1363 + }, + { + "epoch": 0.36884802595997834, + "grad_norm": 2.0612690448760986, + "learning_rate": 1.9910007332913882e-05, + "loss": 2.2536, + "mean_token_accuracy": 0.5011166334152222, + "num_tokens": 697712093.0, + "step": 1364 + }, + { + "epoch": 0.369118442401298, + "grad_norm": 1.8455878496170044, + "learning_rate": 1.990978481406711e-05, + "loss": 2.2821, + "mean_token_accuracy": 0.487280011177063, + "num_tokens": 698236297.0, + "step": 1365 + }, + { + "epoch": 0.36938885884261763, + "grad_norm": 2.34257435798645, + "learning_rate": 1.9909562021841926e-05, + "loss": 2.4385, + "mean_token_accuracy": 0.5131963491439819, + "num_tokens": 698642779.0, + "step": 1366 + }, + { + "epoch": 0.36965927528393727, + "grad_norm": 2.1138787269592285, + "learning_rate": 1.9909338956245184e-05, + "loss": 2.406, + "mean_token_accuracy": 0.5008454322814941, + "num_tokens": 699166979.0, + "step": 1367 + }, + { + "epoch": 0.3699296917252569, + "grad_norm": 2.2999730110168457, + "learning_rate": 1.9909115617283724e-05, + "loss": 2.5833, + "mean_token_accuracy": 0.4705413281917572, + "num_tokens": 699691247.0, + "step": 1368 + }, + { + "epoch": 0.3702001081665765, + "grad_norm": 6.303678512573242, + "learning_rate": 1.9908892004964396e-05, + "loss": 2.0952, + "mean_token_accuracy": 0.5295449495315552, + "num_tokens": 700156098.0, + "step": 1369 + }, + { + "epoch": 0.37047052460789615, + "grad_norm": 2.746171474456787, + "learning_rate": 1.9908668119294057e-05, + "loss": 2.4556, + "mean_token_accuracy": 0.49235081672668457, + "num_tokens": 700680250.0, + "step": 1370 + }, + { + "epoch": 0.3707409410492158, + "grad_norm": 2.1550748348236084, + "learning_rate": 1.9908443960279587e-05, + "loss": 2.6053, + "mean_token_accuracy": 0.47967690229415894, + "num_tokens": 701154117.0, + "step": 1371 + }, + { + "epoch": 0.37101135749053543, + "grad_norm": 2.8477039337158203, + "learning_rate": 1.9908219527927856e-05, + "loss": 2.407, + "mean_token_accuracy": 0.4871048331260681, + "num_tokens": 701678323.0, + "step": 1372 + }, + { + "epoch": 0.3712817739318551, + "grad_norm": 3.1135599613189697, + "learning_rate": 1.9907994822245755e-05, + "loss": 2.4377, + "mean_token_accuracy": 0.4911220073699951, + "num_tokens": 702202587.0, + "step": 1373 + }, + { + "epoch": 0.37155219037317466, + "grad_norm": 1.9764927625656128, + "learning_rate": 1.9907769843240172e-05, + "loss": 2.5647, + "mean_token_accuracy": 0.47620242834091187, + "num_tokens": 702644138.0, + "step": 1374 + }, + { + "epoch": 0.3718226068144943, + "grad_norm": 2.706568956375122, + "learning_rate": 1.9907544590918017e-05, + "loss": 2.4828, + "mean_token_accuracy": 0.4730013310909271, + "num_tokens": 703128397.0, + "step": 1375 + }, + { + "epoch": 0.37209302325581395, + "grad_norm": 2.696242570877075, + "learning_rate": 1.99073190652862e-05, + "loss": 2.4285, + "mean_token_accuracy": 0.4845448136329651, + "num_tokens": 703588793.0, + "step": 1376 + }, + { + "epoch": 0.3723634396971336, + "grad_norm": 2.0532779693603516, + "learning_rate": 1.9907093266351637e-05, + "loss": 2.5457, + "mean_token_accuracy": 0.4720498323440552, + "num_tokens": 704112848.0, + "step": 1377 + }, + { + "epoch": 0.37263385613845323, + "grad_norm": 2.680865526199341, + "learning_rate": 1.9906867194121256e-05, + "loss": 2.474, + "mean_token_accuracy": 0.4868641793727875, + "num_tokens": 704600608.0, + "step": 1378 + }, + { + "epoch": 0.3729042725797729, + "grad_norm": 2.3452484607696533, + "learning_rate": 1.9906640848601997e-05, + "loss": 2.4012, + "mean_token_accuracy": 0.4807250201702118, + "num_tokens": 705124888.0, + "step": 1379 + }, + { + "epoch": 0.37317468902109246, + "grad_norm": 1.8586701154708862, + "learning_rate": 1.990641422980081e-05, + "loss": 2.4487, + "mean_token_accuracy": 0.4764178991317749, + "num_tokens": 705648937.0, + "step": 1380 + }, + { + "epoch": 0.3734451054624121, + "grad_norm": 0.9700098037719727, + "learning_rate": 1.9906187337724637e-05, + "loss": 1.177, + "mean_token_accuracy": 0.6664687395095825, + "num_tokens": 706159306.0, + "step": 1381 + }, + { + "epoch": 0.37371552190373175, + "grad_norm": 3.719928503036499, + "learning_rate": 1.9905960172380447e-05, + "loss": 2.4159, + "mean_token_accuracy": 0.4836757481098175, + "num_tokens": 706683467.0, + "step": 1382 + }, + { + "epoch": 0.3739859383450514, + "grad_norm": 3.5205795764923096, + "learning_rate": 1.9905732733775205e-05, + "loss": 2.4745, + "mean_token_accuracy": 0.46824830770492554, + "num_tokens": 707207735.0, + "step": 1383 + }, + { + "epoch": 0.37425635478637104, + "grad_norm": 1.7341140508651733, + "learning_rate": 1.9905505021915895e-05, + "loss": 2.5821, + "mean_token_accuracy": 0.46979793906211853, + "num_tokens": 707731812.0, + "step": 1384 + }, + { + "epoch": 0.3745267712276906, + "grad_norm": 2.588181495666504, + "learning_rate": 1.9905277036809502e-05, + "loss": 2.4667, + "mean_token_accuracy": 0.4840034246444702, + "num_tokens": 708256085.0, + "step": 1385 + }, + { + "epoch": 0.37479718766901027, + "grad_norm": 2.7268171310424805, + "learning_rate": 1.9905048778463023e-05, + "loss": 2.4065, + "mean_token_accuracy": 0.4889025390148163, + "num_tokens": 708780310.0, + "step": 1386 + }, + { + "epoch": 0.3750676041103299, + "grad_norm": 2.0968127250671387, + "learning_rate": 1.9904820246883457e-05, + "loss": 2.3856, + "mean_token_accuracy": 0.49122196435928345, + "num_tokens": 709259529.0, + "step": 1387 + }, + { + "epoch": 0.37533802055164955, + "grad_norm": 1.928870439529419, + "learning_rate": 1.9904591442077816e-05, + "loss": 2.4283, + "mean_token_accuracy": 0.5131143927574158, + "num_tokens": 709719614.0, + "step": 1388 + }, + { + "epoch": 0.3756084369929692, + "grad_norm": 2.699613332748413, + "learning_rate": 1.9904362364053126e-05, + "loss": 2.4896, + "mean_token_accuracy": 0.47307685017585754, + "num_tokens": 710243877.0, + "step": 1389 + }, + { + "epoch": 0.3758788534342888, + "grad_norm": 1.863867163658142, + "learning_rate": 1.9904133012816413e-05, + "loss": 2.4938, + "mean_token_accuracy": 0.47990119457244873, + "num_tokens": 710768115.0, + "step": 1390 + }, + { + "epoch": 0.3761492698756084, + "grad_norm": 2.1561806201934814, + "learning_rate": 1.9903903388374712e-05, + "loss": 2.4699, + "mean_token_accuracy": 0.5115053653717041, + "num_tokens": 711226964.0, + "step": 1391 + }, + { + "epoch": 0.37641968631692807, + "grad_norm": 1.8804322481155396, + "learning_rate": 1.990367349073507e-05, + "loss": 2.3303, + "mean_token_accuracy": 0.48931780457496643, + "num_tokens": 711736456.0, + "step": 1392 + }, + { + "epoch": 0.3766901027582477, + "grad_norm": 2.1571457386016846, + "learning_rate": 1.990344331990454e-05, + "loss": 2.3261, + "mean_token_accuracy": 0.4885706603527069, + "num_tokens": 712254496.0, + "step": 1393 + }, + { + "epoch": 0.37696051919956736, + "grad_norm": 2.207106351852417, + "learning_rate": 1.9903212875890193e-05, + "loss": 2.4914, + "mean_token_accuracy": 0.4786354899406433, + "num_tokens": 712778766.0, + "step": 1394 + }, + { + "epoch": 0.37723093564088694, + "grad_norm": 2.0692496299743652, + "learning_rate": 1.9902982158699087e-05, + "loss": 2.3224, + "mean_token_accuracy": 0.49740302562713623, + "num_tokens": 713302819.0, + "step": 1395 + }, + { + "epoch": 0.3775013520822066, + "grad_norm": 1.9471994638442993, + "learning_rate": 1.9902751168338306e-05, + "loss": 2.4889, + "mean_token_accuracy": 0.5005873441696167, + "num_tokens": 713826971.0, + "step": 1396 + }, + { + "epoch": 0.37777176852352623, + "grad_norm": 1.9003571271896362, + "learning_rate": 1.990251990481494e-05, + "loss": 2.3378, + "mean_token_accuracy": 0.5032593011856079, + "num_tokens": 714351225.0, + "step": 1397 + }, + { + "epoch": 0.3780421849648459, + "grad_norm": 2.1954150199890137, + "learning_rate": 1.990228836813608e-05, + "loss": 2.4378, + "mean_token_accuracy": 0.4846784472465515, + "num_tokens": 714875500.0, + "step": 1398 + }, + { + "epoch": 0.3783126014061655, + "grad_norm": 1.7004821300506592, + "learning_rate": 1.9902056558308835e-05, + "loss": 2.5276, + "mean_token_accuracy": 0.4700794816017151, + "num_tokens": 715355388.0, + "step": 1399 + }, + { + "epoch": 0.3785830178474851, + "grad_norm": 2.132324457168579, + "learning_rate": 1.9901824475340316e-05, + "loss": 2.6211, + "mean_token_accuracy": 0.4512079060077667, + "num_tokens": 715879671.0, + "step": 1400 + }, + { + "epoch": 0.37885343428880475, + "grad_norm": 0.9348033666610718, + "learning_rate": 1.9901592119237642e-05, + "loss": 1.0797, + "mean_token_accuracy": 0.7093971967697144, + "num_tokens": 716403866.0, + "step": 1401 + }, + { + "epoch": 0.3791238507301244, + "grad_norm": 2.36806058883667, + "learning_rate": 1.9901359490007943e-05, + "loss": 2.3412, + "mean_token_accuracy": 0.5089192986488342, + "num_tokens": 716928012.0, + "step": 1402 + }, + { + "epoch": 0.37939426717144403, + "grad_norm": 1.8821347951889038, + "learning_rate": 1.990112658765836e-05, + "loss": 2.3624, + "mean_token_accuracy": 0.4908658266067505, + "num_tokens": 717452109.0, + "step": 1403 + }, + { + "epoch": 0.3796646836127637, + "grad_norm": 1.823987364768982, + "learning_rate": 1.9900893412196036e-05, + "loss": 2.4355, + "mean_token_accuracy": 0.4963071942329407, + "num_tokens": 717913056.0, + "step": 1404 + }, + { + "epoch": 0.37993510005408326, + "grad_norm": 2.014268159866333, + "learning_rate": 1.9900659963628125e-05, + "loss": 2.4783, + "mean_token_accuracy": 0.4786670207977295, + "num_tokens": 718437171.0, + "step": 1405 + }, + { + "epoch": 0.3802055164954029, + "grad_norm": 2.286689043045044, + "learning_rate": 1.9900426241961793e-05, + "loss": 2.588, + "mean_token_accuracy": 0.48387274146080017, + "num_tokens": 718898418.0, + "step": 1406 + }, + { + "epoch": 0.38047593293672255, + "grad_norm": 1.6170272827148438, + "learning_rate": 1.9900192247204212e-05, + "loss": 2.4067, + "mean_token_accuracy": 0.4807029366493225, + "num_tokens": 719422700.0, + "step": 1407 + }, + { + "epoch": 0.3807463493780422, + "grad_norm": 1.9299594163894653, + "learning_rate": 1.989995797936255e-05, + "loss": 2.48, + "mean_token_accuracy": 0.47391873598098755, + "num_tokens": 719946987.0, + "step": 1408 + }, + { + "epoch": 0.38101676581936184, + "grad_norm": 2.1096620559692383, + "learning_rate": 1.9899723438444015e-05, + "loss": 2.4117, + "mean_token_accuracy": 0.48805221915245056, + "num_tokens": 720471249.0, + "step": 1409 + }, + { + "epoch": 0.3812871822606814, + "grad_norm": 1.7654037475585938, + "learning_rate": 1.9899488624455784e-05, + "loss": 2.4097, + "mean_token_accuracy": 0.4868181347846985, + "num_tokens": 720995454.0, + "step": 1410 + }, + { + "epoch": 0.38155759870200107, + "grad_norm": 1.8865327835083008, + "learning_rate": 1.9899253537405074e-05, + "loss": 2.4391, + "mean_token_accuracy": 0.47502100467681885, + "num_tokens": 721519536.0, + "step": 1411 + }, + { + "epoch": 0.3818280151433207, + "grad_norm": 1.9199696779251099, + "learning_rate": 1.9899018177299093e-05, + "loss": 2.3063, + "mean_token_accuracy": 0.4758608341217041, + "num_tokens": 722043729.0, + "step": 1412 + }, + { + "epoch": 0.38209843158464035, + "grad_norm": 1.903106927871704, + "learning_rate": 1.989878254414507e-05, + "loss": 2.4151, + "mean_token_accuracy": 0.49488672614097595, + "num_tokens": 722567873.0, + "step": 1413 + }, + { + "epoch": 0.38236884802596, + "grad_norm": 2.0083558559417725, + "learning_rate": 1.9898546637950222e-05, + "loss": 2.3026, + "mean_token_accuracy": 0.5151956677436829, + "num_tokens": 723092035.0, + "step": 1414 + }, + { + "epoch": 0.3826392644672796, + "grad_norm": 1.6286870241165161, + "learning_rate": 1.9898310458721795e-05, + "loss": 2.5057, + "mean_token_accuracy": 0.49053359031677246, + "num_tokens": 723611857.0, + "step": 1415 + }, + { + "epoch": 0.3829096809085992, + "grad_norm": 1.6116676330566406, + "learning_rate": 1.9898074006467036e-05, + "loss": 2.438, + "mean_token_accuracy": 0.4800403118133545, + "num_tokens": 724123438.0, + "step": 1416 + }, + { + "epoch": 0.38318009734991887, + "grad_norm": 1.3735885620117188, + "learning_rate": 1.98978372811932e-05, + "loss": 2.3806, + "mean_token_accuracy": 0.4887582063674927, + "num_tokens": 724647574.0, + "step": 1417 + }, + { + "epoch": 0.3834505137912385, + "grad_norm": 1.8534255027770996, + "learning_rate": 1.9897600282907548e-05, + "loss": 2.5395, + "mean_token_accuracy": 0.47706741094589233, + "num_tokens": 725147077.0, + "step": 1418 + }, + { + "epoch": 0.38372093023255816, + "grad_norm": 1.5569961071014404, + "learning_rate": 1.9897363011617354e-05, + "loss": 2.5735, + "mean_token_accuracy": 0.4568495750427246, + "num_tokens": 725665083.0, + "step": 1419 + }, + { + "epoch": 0.3839913466738778, + "grad_norm": 1.410066843032837, + "learning_rate": 1.9897125467329898e-05, + "loss": 2.4954, + "mean_token_accuracy": 0.4735792875289917, + "num_tokens": 726189282.0, + "step": 1420 + }, + { + "epoch": 0.3842617631151974, + "grad_norm": 1.0750442743301392, + "learning_rate": 1.9896887650052468e-05, + "loss": 1.1952, + "mean_token_accuracy": 0.681831955909729, + "num_tokens": 726686395.0, + "step": 1421 + }, + { + "epoch": 0.38453217955651703, + "grad_norm": 1.878326416015625, + "learning_rate": 1.9896649559792358e-05, + "loss": 2.2464, + "mean_token_accuracy": 0.5003941059112549, + "num_tokens": 727210570.0, + "step": 1422 + }, + { + "epoch": 0.38480259599783667, + "grad_norm": 2.1833250522613525, + "learning_rate": 1.989641119655688e-05, + "loss": 2.5304, + "mean_token_accuracy": 0.48783978819847107, + "num_tokens": 727675706.0, + "step": 1423 + }, + { + "epoch": 0.3850730124391563, + "grad_norm": 1.2284302711486816, + "learning_rate": 1.9896172560353343e-05, + "loss": 2.4014, + "mean_token_accuracy": 0.4917038679122925, + "num_tokens": 728199807.0, + "step": 1424 + }, + { + "epoch": 0.38534342888047596, + "grad_norm": 1.5539323091506958, + "learning_rate": 1.9895933651189064e-05, + "loss": 2.2382, + "mean_token_accuracy": 0.5218170881271362, + "num_tokens": 728714117.0, + "step": 1425 + }, + { + "epoch": 0.38561384532179555, + "grad_norm": 1.902405858039856, + "learning_rate": 1.9895694469071386e-05, + "loss": 2.4362, + "mean_token_accuracy": 0.4587203860282898, + "num_tokens": 729238067.0, + "step": 1426 + }, + { + "epoch": 0.3858842617631152, + "grad_norm": 1.8469401597976685, + "learning_rate": 1.989545501400764e-05, + "loss": 2.5709, + "mean_token_accuracy": 0.4776883125305176, + "num_tokens": 729762167.0, + "step": 1427 + }, + { + "epoch": 0.38615467820443483, + "grad_norm": 1.562450885772705, + "learning_rate": 1.9895215286005174e-05, + "loss": 2.5254, + "mean_token_accuracy": 0.45879608392715454, + "num_tokens": 730286446.0, + "step": 1428 + }, + { + "epoch": 0.3864250946457545, + "grad_norm": 1.389794945716858, + "learning_rate": 1.9894975285071344e-05, + "loss": 2.37, + "mean_token_accuracy": 0.49780935049057007, + "num_tokens": 730810644.0, + "step": 1429 + }, + { + "epoch": 0.3866955110870741, + "grad_norm": 1.4811097383499146, + "learning_rate": 1.989473501121351e-05, + "loss": 2.4246, + "mean_token_accuracy": 0.491183340549469, + "num_tokens": 731334685.0, + "step": 1430 + }, + { + "epoch": 0.3869659275283937, + "grad_norm": 1.3935256004333496, + "learning_rate": 1.9894494464439052e-05, + "loss": 2.4578, + "mean_token_accuracy": 0.478386789560318, + "num_tokens": 731847979.0, + "step": 1431 + }, + { + "epoch": 0.38723634396971335, + "grad_norm": 1.3033446073532104, + "learning_rate": 1.989425364475535e-05, + "loss": 2.5531, + "mean_token_accuracy": 0.47447311878204346, + "num_tokens": 732315331.0, + "step": 1432 + }, + { + "epoch": 0.387506760411033, + "grad_norm": 2.711300849914551, + "learning_rate": 1.9894012552169786e-05, + "loss": 2.3005, + "mean_token_accuracy": 0.5170555114746094, + "num_tokens": 732814345.0, + "step": 1433 + }, + { + "epoch": 0.38777717685235263, + "grad_norm": 2.301368236541748, + "learning_rate": 1.9893771186689758e-05, + "loss": 2.5551, + "mean_token_accuracy": 0.4727986454963684, + "num_tokens": 733328121.0, + "step": 1434 + }, + { + "epoch": 0.3880475932936723, + "grad_norm": 2.1913418769836426, + "learning_rate": 1.989352954832268e-05, + "loss": 2.3881, + "mean_token_accuracy": 0.4843217134475708, + "num_tokens": 733852256.0, + "step": 1435 + }, + { + "epoch": 0.38831800973499186, + "grad_norm": 1.5904505252838135, + "learning_rate": 1.9893287637075962e-05, + "loss": 2.3714, + "mean_token_accuracy": 0.4763653874397278, + "num_tokens": 734376403.0, + "step": 1436 + }, + { + "epoch": 0.3885884261763115, + "grad_norm": 2.0208723545074463, + "learning_rate": 1.9893045452957022e-05, + "loss": 2.5643, + "mean_token_accuracy": 0.47592848539352417, + "num_tokens": 734886494.0, + "step": 1437 + }, + { + "epoch": 0.38885884261763115, + "grad_norm": 1.8275834321975708, + "learning_rate": 1.98928029959733e-05, + "loss": 2.2069, + "mean_token_accuracy": 0.5100820064544678, + "num_tokens": 735363739.0, + "step": 1438 + }, + { + "epoch": 0.3891292590589508, + "grad_norm": 1.6584135293960571, + "learning_rate": 1.9892560266132226e-05, + "loss": 2.2275, + "mean_token_accuracy": 0.5056284070014954, + "num_tokens": 735887924.0, + "step": 1439 + }, + { + "epoch": 0.38939967550027044, + "grad_norm": 2.220780611038208, + "learning_rate": 1.9892317263441252e-05, + "loss": 2.4104, + "mean_token_accuracy": 0.49569910764694214, + "num_tokens": 736370313.0, + "step": 1440 + }, + { + "epoch": 0.38967009194159, + "grad_norm": 1.0151476860046387, + "learning_rate": 1.9892073987907833e-05, + "loss": 1.215, + "mean_token_accuracy": 0.6791099309921265, + "num_tokens": 736894570.0, + "step": 1441 + }, + { + "epoch": 0.38994050838290967, + "grad_norm": 3.135591745376587, + "learning_rate": 1.9891830439539434e-05, + "loss": 2.3733, + "mean_token_accuracy": 0.480124831199646, + "num_tokens": 737418844.0, + "step": 1442 + }, + { + "epoch": 0.3902109248242293, + "grad_norm": 2.9859066009521484, + "learning_rate": 1.989158661834353e-05, + "loss": 2.5991, + "mean_token_accuracy": 0.4790472388267517, + "num_tokens": 737907592.0, + "step": 1443 + }, + { + "epoch": 0.39048134126554895, + "grad_norm": 1.7702534198760986, + "learning_rate": 1.98913425243276e-05, + "loss": 2.2497, + "mean_token_accuracy": 0.5035253763198853, + "num_tokens": 738431761.0, + "step": 1444 + }, + { + "epoch": 0.3907517577068686, + "grad_norm": 2.706820249557495, + "learning_rate": 1.989109815749913e-05, + "loss": 2.526, + "mean_token_accuracy": 0.47718536853790283, + "num_tokens": 738943244.0, + "step": 1445 + }, + { + "epoch": 0.3910221741481882, + "grad_norm": 2.200378656387329, + "learning_rate": 1.9890853517865628e-05, + "loss": 2.535, + "mean_token_accuracy": 0.47225669026374817, + "num_tokens": 739467349.0, + "step": 1446 + }, + { + "epoch": 0.3912925905895078, + "grad_norm": 2.387084722518921, + "learning_rate": 1.9890608605434588e-05, + "loss": 2.4166, + "mean_token_accuracy": 0.48294946551322937, + "num_tokens": 739987227.0, + "step": 1447 + }, + { + "epoch": 0.39156300703082747, + "grad_norm": 1.874860405921936, + "learning_rate": 1.9890363420213534e-05, + "loss": 2.2368, + "mean_token_accuracy": 0.5084484815597534, + "num_tokens": 740511213.0, + "step": 1448 + }, + { + "epoch": 0.3918334234721471, + "grad_norm": 2.0770485401153564, + "learning_rate": 1.9890117962209985e-05, + "loss": 2.3652, + "mean_token_accuracy": 0.5033928751945496, + "num_tokens": 740975710.0, + "step": 1449 + }, + { + "epoch": 0.39210383991346676, + "grad_norm": 2.1575982570648193, + "learning_rate": 1.9889872231431472e-05, + "loss": 2.5737, + "mean_token_accuracy": 0.47116416692733765, + "num_tokens": 741489983.0, + "step": 1450 + }, + { + "epoch": 0.39237425635478634, + "grad_norm": 1.9878456592559814, + "learning_rate": 1.9889626227885535e-05, + "loss": 2.5505, + "mean_token_accuracy": 0.4865528345108032, + "num_tokens": 741954053.0, + "step": 1451 + }, + { + "epoch": 0.392644672796106, + "grad_norm": 2.3252577781677246, + "learning_rate": 1.9889379951579722e-05, + "loss": 2.2775, + "mean_token_accuracy": 0.4906599521636963, + "num_tokens": 742478124.0, + "step": 1452 + }, + { + "epoch": 0.39291508923742563, + "grad_norm": 1.5998395681381226, + "learning_rate": 1.9889133402521592e-05, + "loss": 2.4645, + "mean_token_accuracy": 0.47684553265571594, + "num_tokens": 743002309.0, + "step": 1453 + }, + { + "epoch": 0.3931855056787453, + "grad_norm": 1.9989784955978394, + "learning_rate": 1.9888886580718708e-05, + "loss": 2.4448, + "mean_token_accuracy": 0.4993084967136383, + "num_tokens": 743526551.0, + "step": 1454 + }, + { + "epoch": 0.3934559221200649, + "grad_norm": 1.829702615737915, + "learning_rate": 1.988863948617864e-05, + "loss": 2.2375, + "mean_token_accuracy": 0.49237024784088135, + "num_tokens": 744050829.0, + "step": 1455 + }, + { + "epoch": 0.39372633856138456, + "grad_norm": 1.9087220430374146, + "learning_rate": 1.9888392118908978e-05, + "loss": 2.3893, + "mean_token_accuracy": 0.4891654849052429, + "num_tokens": 744575112.0, + "step": 1456 + }, + { + "epoch": 0.39399675500270415, + "grad_norm": 2.1299211978912354, + "learning_rate": 1.98881444789173e-05, + "loss": 2.4741, + "mean_token_accuracy": 0.47849228978157043, + "num_tokens": 745099141.0, + "step": 1457 + }, + { + "epoch": 0.3942671714440238, + "grad_norm": 1.463080644607544, + "learning_rate": 1.9887896566211217e-05, + "loss": 2.2856, + "mean_token_accuracy": 0.5096280574798584, + "num_tokens": 745623405.0, + "step": 1458 + }, + { + "epoch": 0.39453758788534343, + "grad_norm": 1.8959389925003052, + "learning_rate": 1.988764838079833e-05, + "loss": 2.3166, + "mean_token_accuracy": 0.4721486270427704, + "num_tokens": 746147571.0, + "step": 1459 + }, + { + "epoch": 0.3948080043266631, + "grad_norm": 1.7738114595413208, + "learning_rate": 1.9887399922686246e-05, + "loss": 2.5648, + "mean_token_accuracy": 0.4818136394023895, + "num_tokens": 746617213.0, + "step": 1460 + }, + { + "epoch": 0.3950784207679827, + "grad_norm": 0.9711384773254395, + "learning_rate": 1.9887151191882606e-05, + "loss": 1.1894, + "mean_token_accuracy": 0.706847071647644, + "num_tokens": 747141433.0, + "step": 1461 + }, + { + "epoch": 0.3953488372093023, + "grad_norm": 3.361802101135254, + "learning_rate": 1.9886902188395028e-05, + "loss": 2.371, + "mean_token_accuracy": 0.4894932210445404, + "num_tokens": 747621056.0, + "step": 1462 + }, + { + "epoch": 0.39561925365062195, + "grad_norm": 2.700660467147827, + "learning_rate": 1.9886652912231156e-05, + "loss": 2.4395, + "mean_token_accuracy": 0.4803393483161926, + "num_tokens": 748145328.0, + "step": 1463 + }, + { + "epoch": 0.3958896700919416, + "grad_norm": 1.6906371116638184, + "learning_rate": 1.9886403363398637e-05, + "loss": 2.4394, + "mean_token_accuracy": 0.5000825524330139, + "num_tokens": 748639856.0, + "step": 1464 + }, + { + "epoch": 0.39616008653326124, + "grad_norm": 2.9298384189605713, + "learning_rate": 1.9886153541905135e-05, + "loss": 2.5172, + "mean_token_accuracy": 0.47475865483283997, + "num_tokens": 749087637.0, + "step": 1465 + }, + { + "epoch": 0.3964305029745809, + "grad_norm": 3.158020496368408, + "learning_rate": 1.9885903447758303e-05, + "loss": 2.3841, + "mean_token_accuracy": 0.4721035361289978, + "num_tokens": 749611760.0, + "step": 1466 + }, + { + "epoch": 0.39670091941590047, + "grad_norm": 2.5872914791107178, + "learning_rate": 1.9885653080965823e-05, + "loss": 2.4373, + "mean_token_accuracy": 0.4760352075099945, + "num_tokens": 750135932.0, + "step": 1467 + }, + { + "epoch": 0.3969713358572201, + "grad_norm": 2.067929744720459, + "learning_rate": 1.9885402441535378e-05, + "loss": 2.5467, + "mean_token_accuracy": 0.46773475408554077, + "num_tokens": 750660132.0, + "step": 1468 + }, + { + "epoch": 0.39724175229853975, + "grad_norm": 2.1767382621765137, + "learning_rate": 1.9885151529474654e-05, + "loss": 2.3089, + "mean_token_accuracy": 0.5169572830200195, + "num_tokens": 751184350.0, + "step": 1469 + }, + { + "epoch": 0.3975121687398594, + "grad_norm": 2.711374044418335, + "learning_rate": 1.9884900344791354e-05, + "loss": 2.4476, + "mean_token_accuracy": 0.47895511984825134, + "num_tokens": 751708378.0, + "step": 1470 + }, + { + "epoch": 0.39778258518117904, + "grad_norm": 2.282693862915039, + "learning_rate": 1.988464888749318e-05, + "loss": 2.3391, + "mean_token_accuracy": 0.4914144277572632, + "num_tokens": 752206726.0, + "step": 1471 + }, + { + "epoch": 0.3980530016224986, + "grad_norm": 2.1455228328704834, + "learning_rate": 1.9884397157587848e-05, + "loss": 2.4956, + "mean_token_accuracy": 0.4914281368255615, + "num_tokens": 752731007.0, + "step": 1472 + }, + { + "epoch": 0.39832341806381827, + "grad_norm": 2.804234027862549, + "learning_rate": 1.988414515508309e-05, + "loss": 2.4267, + "mean_token_accuracy": 0.4891984164714813, + "num_tokens": 753255191.0, + "step": 1473 + }, + { + "epoch": 0.3985938345051379, + "grad_norm": 1.9150549173355103, + "learning_rate": 1.988389287998663e-05, + "loss": 2.4518, + "mean_token_accuracy": 0.4876656234264374, + "num_tokens": 753779354.0, + "step": 1474 + }, + { + "epoch": 0.39886425094645755, + "grad_norm": 2.349522829055786, + "learning_rate": 1.9883640332306207e-05, + "loss": 2.3048, + "mean_token_accuracy": 0.5064656734466553, + "num_tokens": 754303624.0, + "step": 1475 + }, + { + "epoch": 0.3991346673877772, + "grad_norm": 2.168693780899048, + "learning_rate": 1.9883387512049578e-05, + "loss": 2.3581, + "mean_token_accuracy": 0.4988313615322113, + "num_tokens": 754827797.0, + "step": 1476 + }, + { + "epoch": 0.3994050838290968, + "grad_norm": 1.677569031715393, + "learning_rate": 1.9883134419224496e-05, + "loss": 2.458, + "mean_token_accuracy": 0.4812631607055664, + "num_tokens": 755351856.0, + "step": 1477 + }, + { + "epoch": 0.39967550027041643, + "grad_norm": 1.9326107501983643, + "learning_rate": 1.9882881053838722e-05, + "loss": 2.3549, + "mean_token_accuracy": 0.4880017638206482, + "num_tokens": 755876054.0, + "step": 1478 + }, + { + "epoch": 0.39994591671173607, + "grad_norm": 1.415482521057129, + "learning_rate": 1.9882627415900037e-05, + "loss": 2.4248, + "mean_token_accuracy": 0.4696331322193146, + "num_tokens": 756400136.0, + "step": 1479 + }, + { + "epoch": 0.4002163331530557, + "grad_norm": 1.8812226057052612, + "learning_rate": 1.9882373505416222e-05, + "loss": 2.4609, + "mean_token_accuracy": 0.4804319739341736, + "num_tokens": 756924184.0, + "step": 1480 + }, + { + "epoch": 0.40048674959437536, + "grad_norm": 1.1935325860977173, + "learning_rate": 1.9882119322395064e-05, + "loss": 1.3407, + "mean_token_accuracy": 0.667701005935669, + "num_tokens": 757379561.0, + "step": 1481 + }, + { + "epoch": 0.40075716603569495, + "grad_norm": 2.1799347400665283, + "learning_rate": 1.988186486684437e-05, + "loss": 2.3865, + "mean_token_accuracy": 0.5029711723327637, + "num_tokens": 757903707.0, + "step": 1482 + }, + { + "epoch": 0.4010275824770146, + "grad_norm": 1.6191788911819458, + "learning_rate": 1.9881610138771936e-05, + "loss": 2.2218, + "mean_token_accuracy": 0.498365581035614, + "num_tokens": 758427893.0, + "step": 1483 + }, + { + "epoch": 0.40129799891833423, + "grad_norm": 1.5568767786026, + "learning_rate": 1.9881355138185585e-05, + "loss": 2.5286, + "mean_token_accuracy": 0.478273868560791, + "num_tokens": 758952168.0, + "step": 1484 + }, + { + "epoch": 0.4015684153596539, + "grad_norm": 1.8174961805343628, + "learning_rate": 1.988109986509314e-05, + "loss": 2.404, + "mean_token_accuracy": 0.482735812664032, + "num_tokens": 759476361.0, + "step": 1485 + }, + { + "epoch": 0.4018388318009735, + "grad_norm": 1.9070360660552979, + "learning_rate": 1.9880844319502432e-05, + "loss": 2.2923, + "mean_token_accuracy": 0.49362051486968994, + "num_tokens": 760000522.0, + "step": 1486 + }, + { + "epoch": 0.4021092482422931, + "grad_norm": 1.7040033340454102, + "learning_rate": 1.9880588501421304e-05, + "loss": 2.2933, + "mean_token_accuracy": 0.4974954128265381, + "num_tokens": 760478294.0, + "step": 1487 + }, + { + "epoch": 0.40237966468361275, + "grad_norm": 2.026482582092285, + "learning_rate": 1.9880332410857608e-05, + "loss": 2.4095, + "mean_token_accuracy": 0.4935852289199829, + "num_tokens": 761002489.0, + "step": 1488 + }, + { + "epoch": 0.4026500811249324, + "grad_norm": 1.733527421951294, + "learning_rate": 1.9880076047819198e-05, + "loss": 2.3202, + "mean_token_accuracy": 0.4876459836959839, + "num_tokens": 761526520.0, + "step": 1489 + }, + { + "epoch": 0.40292049756625203, + "grad_norm": 7.574014663696289, + "learning_rate": 1.987981941231394e-05, + "loss": 2.0888, + "mean_token_accuracy": 0.540627121925354, + "num_tokens": 762050767.0, + "step": 1490 + }, + { + "epoch": 0.4031909140075717, + "grad_norm": 2.4967832565307617, + "learning_rate": 1.9879562504349703e-05, + "loss": 2.3583, + "mean_token_accuracy": 0.48778098821640015, + "num_tokens": 762551556.0, + "step": 1491 + }, + { + "epoch": 0.4034613304488913, + "grad_norm": 2.2785706520080566, + "learning_rate": 1.9879305323934383e-05, + "loss": 2.4323, + "mean_token_accuracy": 0.4965423345565796, + "num_tokens": 763074209.0, + "step": 1492 + }, + { + "epoch": 0.4037317468902109, + "grad_norm": 1.677049160003662, + "learning_rate": 1.9879047871075858e-05, + "loss": 2.2711, + "mean_token_accuracy": 0.4919687509536743, + "num_tokens": 763598468.0, + "step": 1493 + }, + { + "epoch": 0.40400216333153055, + "grad_norm": 2.0998620986938477, + "learning_rate": 1.9878790145782034e-05, + "loss": 2.4022, + "mean_token_accuracy": 0.4834098815917969, + "num_tokens": 764122701.0, + "step": 1494 + }, + { + "epoch": 0.4042725797728502, + "grad_norm": 2.676194190979004, + "learning_rate": 1.9878532148060817e-05, + "loss": 2.3949, + "mean_token_accuracy": 0.5067750215530396, + "num_tokens": 764586038.0, + "step": 1495 + }, + { + "epoch": 0.40454299621416984, + "grad_norm": 1.840446949005127, + "learning_rate": 1.9878273877920128e-05, + "loss": 2.2724, + "mean_token_accuracy": 0.49962764978408813, + "num_tokens": 765110235.0, + "step": 1496 + }, + { + "epoch": 0.4048134126554895, + "grad_norm": 1.5419931411743164, + "learning_rate": 1.9878015335367882e-05, + "loss": 2.3818, + "mean_token_accuracy": 0.4930818974971771, + "num_tokens": 765634352.0, + "step": 1497 + }, + { + "epoch": 0.40508382909680907, + "grad_norm": 2.225480794906616, + "learning_rate": 1.987775652041202e-05, + "loss": 2.3806, + "mean_token_accuracy": 0.4896799325942993, + "num_tokens": 766158603.0, + "step": 1498 + }, + { + "epoch": 0.4053542455381287, + "grad_norm": 1.6366374492645264, + "learning_rate": 1.9877497433060477e-05, + "loss": 2.4127, + "mean_token_accuracy": 0.4780726432800293, + "num_tokens": 766682806.0, + "step": 1499 + }, + { + "epoch": 0.40562466197944835, + "grad_norm": 2.3018746376037598, + "learning_rate": 1.987723807332121e-05, + "loss": 2.3067, + "mean_token_accuracy": 0.5339062213897705, + "num_tokens": 767207088.0, + "step": 1500 + }, + { + "epoch": 0.405895078420768, + "grad_norm": 0.9047436714172363, + "learning_rate": 1.987697844120217e-05, + "loss": 1.1945, + "mean_token_accuracy": 0.676152229309082, + "num_tokens": 767731350.0, + "step": 1501 + }, + { + "epoch": 0.40616549486208764, + "grad_norm": 3.17917799949646, + "learning_rate": 1.9876718536711326e-05, + "loss": 2.5227, + "mean_token_accuracy": 0.47573399543762207, + "num_tokens": 768255610.0, + "step": 1502 + }, + { + "epoch": 0.4064359113034072, + "grad_norm": 2.416168689727783, + "learning_rate": 1.9876458359856652e-05, + "loss": 2.3414, + "mean_token_accuracy": 0.4998161494731903, + "num_tokens": 768753345.0, + "step": 1503 + }, + { + "epoch": 0.40670632774472687, + "grad_norm": 1.5589183568954468, + "learning_rate": 1.987619791064613e-05, + "loss": 2.2985, + "mean_token_accuracy": 0.5016249418258667, + "num_tokens": 769277600.0, + "step": 1504 + }, + { + "epoch": 0.4069767441860465, + "grad_norm": 2.0083513259887695, + "learning_rate": 1.9875937189087758e-05, + "loss": 2.2899, + "mean_token_accuracy": 0.5078930854797363, + "num_tokens": 769704216.0, + "step": 1505 + }, + { + "epoch": 0.40724716062736616, + "grad_norm": 2.1901464462280273, + "learning_rate": 1.9875676195189528e-05, + "loss": 2.3757, + "mean_token_accuracy": 0.49520695209503174, + "num_tokens": 770228231.0, + "step": 1506 + }, + { + "epoch": 0.4075175770686858, + "grad_norm": 1.5826436281204224, + "learning_rate": 1.987541492895945e-05, + "loss": 2.1867, + "mean_token_accuracy": 0.5172996520996094, + "num_tokens": 770752380.0, + "step": 1507 + }, + { + "epoch": 0.4077879935100054, + "grad_norm": 2.1929845809936523, + "learning_rate": 1.9875153390405543e-05, + "loss": 2.4012, + "mean_token_accuracy": 0.4956890940666199, + "num_tokens": 771276626.0, + "step": 1508 + }, + { + "epoch": 0.40805840995132503, + "grad_norm": 1.9947062730789185, + "learning_rate": 1.9874891579535826e-05, + "loss": 2.5137, + "mean_token_accuracy": 0.47997891902923584, + "num_tokens": 771800832.0, + "step": 1509 + }, + { + "epoch": 0.4083288263926447, + "grad_norm": 1.801741361618042, + "learning_rate": 1.987462949635834e-05, + "loss": 2.2802, + "mean_token_accuracy": 0.5187973976135254, + "num_tokens": 772315331.0, + "step": 1510 + }, + { + "epoch": 0.4085992428339643, + "grad_norm": 2.219327688217163, + "learning_rate": 1.9874367140881122e-05, + "loss": 2.3693, + "mean_token_accuracy": 0.5179867744445801, + "num_tokens": 772777356.0, + "step": 1511 + }, + { + "epoch": 0.40886965927528396, + "grad_norm": 2.075981378555298, + "learning_rate": 1.987410451311222e-05, + "loss": 2.2083, + "mean_token_accuracy": 0.5166034698486328, + "num_tokens": 773301562.0, + "step": 1512 + }, + { + "epoch": 0.40914007571660355, + "grad_norm": 1.9247198104858398, + "learning_rate": 1.9873841613059697e-05, + "loss": 2.4085, + "mean_token_accuracy": 0.4705837070941925, + "num_tokens": 773825836.0, + "step": 1513 + }, + { + "epoch": 0.4094104921579232, + "grad_norm": 1.892741322517395, + "learning_rate": 1.9873578440731614e-05, + "loss": 2.4185, + "mean_token_accuracy": 0.49045324325561523, + "num_tokens": 774298148.0, + "step": 1514 + }, + { + "epoch": 0.40968090859924283, + "grad_norm": 1.470799446105957, + "learning_rate": 1.9873314996136052e-05, + "loss": 2.4491, + "mean_token_accuracy": 0.4842093586921692, + "num_tokens": 774811736.0, + "step": 1515 + }, + { + "epoch": 0.4099513250405625, + "grad_norm": 1.8280034065246582, + "learning_rate": 1.987305127928109e-05, + "loss": 2.3185, + "mean_token_accuracy": 0.4934113025665283, + "num_tokens": 775335934.0, + "step": 1516 + }, + { + "epoch": 0.4102217414818821, + "grad_norm": 1.9535061120986938, + "learning_rate": 1.987278729017482e-05, + "loss": 2.501, + "mean_token_accuracy": 0.4949776530265808, + "num_tokens": 775852943.0, + "step": 1517 + }, + { + "epoch": 0.4104921579232017, + "grad_norm": 1.5345458984375, + "learning_rate": 1.9872523028825344e-05, + "loss": 2.4163, + "mean_token_accuracy": 0.4654303193092346, + "num_tokens": 776377156.0, + "step": 1518 + }, + { + "epoch": 0.41076257436452135, + "grad_norm": 1.6696854829788208, + "learning_rate": 1.987225849524077e-05, + "loss": 2.3886, + "mean_token_accuracy": 0.4897821247577667, + "num_tokens": 776901333.0, + "step": 1519 + }, + { + "epoch": 0.411032990805841, + "grad_norm": 1.5970351696014404, + "learning_rate": 1.9871993689429213e-05, + "loss": 2.3771, + "mean_token_accuracy": 0.501751720905304, + "num_tokens": 777425208.0, + "step": 1520 + }, + { + "epoch": 0.41130340724716064, + "grad_norm": 1.1412190198898315, + "learning_rate": 1.9871728611398798e-05, + "loss": 1.2318, + "mean_token_accuracy": 0.6747081279754639, + "num_tokens": 777886211.0, + "step": 1521 + }, + { + "epoch": 0.4115738236884803, + "grad_norm": 2.5889198780059814, + "learning_rate": 1.987146326115766e-05, + "loss": 2.4428, + "mean_token_accuracy": 0.49467596411705017, + "num_tokens": 778387604.0, + "step": 1522 + }, + { + "epoch": 0.41184424012979987, + "grad_norm": 2.0176844596862793, + "learning_rate": 1.987119763871394e-05, + "loss": 2.384, + "mean_token_accuracy": 0.48892658948898315, + "num_tokens": 778882741.0, + "step": 1523 + }, + { + "epoch": 0.4121146565711195, + "grad_norm": 1.5438991785049438, + "learning_rate": 1.9870931744075782e-05, + "loss": 2.4127, + "mean_token_accuracy": 0.490383118391037, + "num_tokens": 779406965.0, + "step": 1524 + }, + { + "epoch": 0.41238507301243915, + "grad_norm": 2.146122694015503, + "learning_rate": 1.9870665577251358e-05, + "loss": 2.4747, + "mean_token_accuracy": 0.47544676065444946, + "num_tokens": 779931209.0, + "step": 1525 + }, + { + "epoch": 0.4126554894537588, + "grad_norm": 1.8918441534042358, + "learning_rate": 1.9870399138248827e-05, + "loss": 2.4062, + "mean_token_accuracy": 0.4614180624485016, + "num_tokens": 780436066.0, + "step": 1526 + }, + { + "epoch": 0.41292590589507844, + "grad_norm": 2.4307737350463867, + "learning_rate": 1.9870132427076362e-05, + "loss": 2.4687, + "mean_token_accuracy": 0.48064684867858887, + "num_tokens": 780960335.0, + "step": 1527 + }, + { + "epoch": 0.413196322336398, + "grad_norm": 2.443211555480957, + "learning_rate": 1.9869865443742145e-05, + "loss": 2.7025, + "mean_token_accuracy": 0.45909738540649414, + "num_tokens": 781437680.0, + "step": 1528 + }, + { + "epoch": 0.41346673877771767, + "grad_norm": 1.892126441001892, + "learning_rate": 1.986959818825438e-05, + "loss": 2.2363, + "mean_token_accuracy": 0.5254320502281189, + "num_tokens": 781961833.0, + "step": 1529 + }, + { + "epoch": 0.4137371552190373, + "grad_norm": 2.1113009452819824, + "learning_rate": 1.986933066062125e-05, + "loss": 2.2826, + "mean_token_accuracy": 0.5209131240844727, + "num_tokens": 782428303.0, + "step": 1530 + }, + { + "epoch": 0.41400757166035695, + "grad_norm": 2.1760547161102295, + "learning_rate": 1.9869062860850977e-05, + "loss": 2.4396, + "mean_token_accuracy": 0.4878217875957489, + "num_tokens": 782952584.0, + "step": 1531 + }, + { + "epoch": 0.4142779881016766, + "grad_norm": 1.5789692401885986, + "learning_rate": 1.986879478895177e-05, + "loss": 2.2514, + "mean_token_accuracy": 0.4974552392959595, + "num_tokens": 783468298.0, + "step": 1532 + }, + { + "epoch": 0.41454840454299624, + "grad_norm": 4.067779541015625, + "learning_rate": 1.9868526444931864e-05, + "loss": 1.994, + "mean_token_accuracy": 0.5779571533203125, + "num_tokens": 783992573.0, + "step": 1533 + }, + { + "epoch": 0.41481882098431583, + "grad_norm": 2.863403558731079, + "learning_rate": 1.9868257828799482e-05, + "loss": 2.3089, + "mean_token_accuracy": 0.4845895767211914, + "num_tokens": 784516662.0, + "step": 1534 + }, + { + "epoch": 0.41508923742563547, + "grad_norm": 2.289487361907959, + "learning_rate": 1.986798894056287e-05, + "loss": 2.3844, + "mean_token_accuracy": 0.48991596698760986, + "num_tokens": 785003148.0, + "step": 1535 + }, + { + "epoch": 0.4153596538669551, + "grad_norm": 2.0399343967437744, + "learning_rate": 1.986771978023028e-05, + "loss": 2.4428, + "mean_token_accuracy": 0.5280691385269165, + "num_tokens": 785464656.0, + "step": 1536 + }, + { + "epoch": 0.41563007030827476, + "grad_norm": 2.546003580093384, + "learning_rate": 1.9867450347809965e-05, + "loss": 2.5109, + "mean_token_accuracy": 0.4850637912750244, + "num_tokens": 785988900.0, + "step": 1537 + }, + { + "epoch": 0.4159004867495944, + "grad_norm": 3.9514708518981934, + "learning_rate": 1.98671806433102e-05, + "loss": 2.3179, + "mean_token_accuracy": 0.5248183012008667, + "num_tokens": 786513024.0, + "step": 1538 + }, + { + "epoch": 0.416170903190914, + "grad_norm": 2.2301268577575684, + "learning_rate": 1.9866910666739252e-05, + "loss": 2.5216, + "mean_token_accuracy": 0.469613254070282, + "num_tokens": 787037207.0, + "step": 1539 + }, + { + "epoch": 0.41644131963223363, + "grad_norm": 1.8143391609191895, + "learning_rate": 1.9866640418105414e-05, + "loss": 2.3437, + "mean_token_accuracy": 0.49376416206359863, + "num_tokens": 787561342.0, + "step": 1540 + }, + { + "epoch": 0.4167117360735533, + "grad_norm": 0.7645040154457092, + "learning_rate": 1.986636989741697e-05, + "loss": 1.1326, + "mean_token_accuracy": 0.7039350271224976, + "num_tokens": 788000768.0, + "step": 1541 + }, + { + "epoch": 0.4169821525148729, + "grad_norm": 2.5613229274749756, + "learning_rate": 1.986609910468222e-05, + "loss": 2.4072, + "mean_token_accuracy": 0.5201317071914673, + "num_tokens": 788484973.0, + "step": 1542 + }, + { + "epoch": 0.41725256895619256, + "grad_norm": 2.0904901027679443, + "learning_rate": 1.986582803990948e-05, + "loss": 2.3932, + "mean_token_accuracy": 0.5121880769729614, + "num_tokens": 788916439.0, + "step": 1543 + }, + { + "epoch": 0.41752298539751215, + "grad_norm": 1.5936524868011475, + "learning_rate": 1.9865556703107062e-05, + "loss": 2.428, + "mean_token_accuracy": 0.47721272706985474, + "num_tokens": 789440705.0, + "step": 1544 + }, + { + "epoch": 0.4177934018388318, + "grad_norm": 1.7470813989639282, + "learning_rate": 1.986528509428329e-05, + "loss": 2.3413, + "mean_token_accuracy": 0.5086788535118103, + "num_tokens": 789964870.0, + "step": 1545 + }, + { + "epoch": 0.41806381828015143, + "grad_norm": 1.6245934963226318, + "learning_rate": 1.9865013213446508e-05, + "loss": 2.4879, + "mean_token_accuracy": 0.4911026358604431, + "num_tokens": 790489053.0, + "step": 1546 + }, + { + "epoch": 0.4183342347214711, + "grad_norm": 1.5841273069381714, + "learning_rate": 1.9864741060605043e-05, + "loss": 2.3977, + "mean_token_accuracy": 0.5164617300033569, + "num_tokens": 791013243.0, + "step": 1547 + }, + { + "epoch": 0.4186046511627907, + "grad_norm": 1.4501166343688965, + "learning_rate": 1.9864468635767255e-05, + "loss": 2.324, + "mean_token_accuracy": 0.5105125904083252, + "num_tokens": 791537500.0, + "step": 1548 + }, + { + "epoch": 0.4188750676041103, + "grad_norm": 1.542193055152893, + "learning_rate": 1.9864195938941502e-05, + "loss": 2.4391, + "mean_token_accuracy": 0.4874729812145233, + "num_tokens": 792061671.0, + "step": 1549 + }, + { + "epoch": 0.41914548404542995, + "grad_norm": 1.4408745765686035, + "learning_rate": 1.986392297013615e-05, + "loss": 2.328, + "mean_token_accuracy": 0.5153874158859253, + "num_tokens": 792542203.0, + "step": 1550 + }, + { + "epoch": 0.4194159004867496, + "grad_norm": 1.604811429977417, + "learning_rate": 1.9863649729359567e-05, + "loss": 2.4034, + "mean_token_accuracy": 0.46299195289611816, + "num_tokens": 793066421.0, + "step": 1551 + }, + { + "epoch": 0.41968631692806924, + "grad_norm": 1.4225786924362183, + "learning_rate": 1.9863376216620153e-05, + "loss": 2.4438, + "mean_token_accuracy": 0.48112186789512634, + "num_tokens": 793590600.0, + "step": 1552 + }, + { + "epoch": 0.4199567333693889, + "grad_norm": 1.4170401096343994, + "learning_rate": 1.986310243192629e-05, + "loss": 2.472, + "mean_token_accuracy": 0.48132336139678955, + "num_tokens": 794114867.0, + "step": 1553 + }, + { + "epoch": 0.42022714981070847, + "grad_norm": 1.4402663707733154, + "learning_rate": 1.9862828375286372e-05, + "loss": 2.3665, + "mean_token_accuracy": 0.5082546472549438, + "num_tokens": 794639027.0, + "step": 1554 + }, + { + "epoch": 0.4204975662520281, + "grad_norm": 1.3828775882720947, + "learning_rate": 1.9862554046708825e-05, + "loss": 2.2512, + "mean_token_accuracy": 0.49848419427871704, + "num_tokens": 795163277.0, + "step": 1555 + }, + { + "epoch": 0.42076798269334775, + "grad_norm": 1.248295783996582, + "learning_rate": 1.986227944620205e-05, + "loss": 2.3359, + "mean_token_accuracy": 0.5027672052383423, + "num_tokens": 795687484.0, + "step": 1556 + }, + { + "epoch": 0.4210383991346674, + "grad_norm": 17.210573196411133, + "learning_rate": 1.9862004573774486e-05, + "loss": 1.9196, + "mean_token_accuracy": 0.6037278175354004, + "num_tokens": 796211712.0, + "step": 1557 + }, + { + "epoch": 0.42130881557598704, + "grad_norm": 2.694570302963257, + "learning_rate": 1.9861729429434553e-05, + "loss": 2.4517, + "mean_token_accuracy": 0.4798309803009033, + "num_tokens": 796735866.0, + "step": 1558 + }, + { + "epoch": 0.4215792320173066, + "grad_norm": 1.701303243637085, + "learning_rate": 1.9861454013190706e-05, + "loss": 2.3966, + "mean_token_accuracy": 0.48195570707321167, + "num_tokens": 797260104.0, + "step": 1559 + }, + { + "epoch": 0.42184964845862627, + "grad_norm": 1.604691982269287, + "learning_rate": 1.9861178325051386e-05, + "loss": 2.3178, + "mean_token_accuracy": 0.507247805595398, + "num_tokens": 797749971.0, + "step": 1560 + }, + { + "epoch": 0.4221200648999459, + "grad_norm": 1.0199893712997437, + "learning_rate": 1.9860902365025058e-05, + "loss": 1.2043, + "mean_token_accuracy": 0.6873761415481567, + "num_tokens": 798274250.0, + "step": 1561 + }, + { + "epoch": 0.42239048134126556, + "grad_norm": 2.4983158111572266, + "learning_rate": 1.9860626133120188e-05, + "loss": 2.4483, + "mean_token_accuracy": 0.49894261360168457, + "num_tokens": 798759055.0, + "step": 1562 + }, + { + "epoch": 0.4226608977825852, + "grad_norm": 1.9986495971679688, + "learning_rate": 1.9860349629345247e-05, + "loss": 2.3443, + "mean_token_accuracy": 0.4935830533504486, + "num_tokens": 799283214.0, + "step": 1563 + }, + { + "epoch": 0.4229313142239048, + "grad_norm": 1.3043283224105835, + "learning_rate": 1.986007285370873e-05, + "loss": 2.3978, + "mean_token_accuracy": 0.4868719279766083, + "num_tokens": 799807420.0, + "step": 1564 + }, + { + "epoch": 0.42320173066522443, + "grad_norm": 1.7744388580322266, + "learning_rate": 1.9859795806219114e-05, + "loss": 2.4674, + "mean_token_accuracy": 0.4847182631492615, + "num_tokens": 800331443.0, + "step": 1565 + }, + { + "epoch": 0.4234721471065441, + "grad_norm": 1.6478118896484375, + "learning_rate": 1.9859518486884914e-05, + "loss": 2.4266, + "mean_token_accuracy": 0.47976455092430115, + "num_tokens": 800826848.0, + "step": 1566 + }, + { + "epoch": 0.4237425635478637, + "grad_norm": 1.7913234233856201, + "learning_rate": 1.985924089571463e-05, + "loss": 2.319, + "mean_token_accuracy": 0.515108585357666, + "num_tokens": 801339589.0, + "step": 1567 + }, + { + "epoch": 0.42401297998918336, + "grad_norm": 1.8324071168899536, + "learning_rate": 1.985896303271678e-05, + "loss": 2.3952, + "mean_token_accuracy": 0.4958881139755249, + "num_tokens": 801863840.0, + "step": 1568 + }, + { + "epoch": 0.424283396430503, + "grad_norm": 1.7947977781295776, + "learning_rate": 1.9858684897899894e-05, + "loss": 2.4207, + "mean_token_accuracy": 0.4751932621002197, + "num_tokens": 802388000.0, + "step": 1569 + }, + { + "epoch": 0.4245538128718226, + "grad_norm": 1.8268848657608032, + "learning_rate": 1.9858406491272505e-05, + "loss": 2.3206, + "mean_token_accuracy": 0.5052655935287476, + "num_tokens": 802912122.0, + "step": 1570 + }, + { + "epoch": 0.42482422931314223, + "grad_norm": 1.5203883647918701, + "learning_rate": 1.985812781284315e-05, + "loss": 2.2884, + "mean_token_accuracy": 0.4862734079360962, + "num_tokens": 803436325.0, + "step": 1571 + }, + { + "epoch": 0.4250946457544619, + "grad_norm": 1.7647552490234375, + "learning_rate": 1.985784886262039e-05, + "loss": 2.4445, + "mean_token_accuracy": 0.4933818578720093, + "num_tokens": 803958381.0, + "step": 1572 + }, + { + "epoch": 0.4253650621957815, + "grad_norm": 1.8679115772247314, + "learning_rate": 1.985756964061277e-05, + "loss": 2.5248, + "mean_token_accuracy": 0.49196794629096985, + "num_tokens": 804482590.0, + "step": 1573 + }, + { + "epoch": 0.42563547863710116, + "grad_norm": 1.9386149644851685, + "learning_rate": 1.985729014682887e-05, + "loss": 2.3476, + "mean_token_accuracy": 0.5050064325332642, + "num_tokens": 805006831.0, + "step": 1574 + }, + { + "epoch": 0.42590589507842075, + "grad_norm": 1.6627907752990723, + "learning_rate": 1.985701038127726e-05, + "loss": 2.4005, + "mean_token_accuracy": 0.48655858635902405, + "num_tokens": 805530992.0, + "step": 1575 + }, + { + "epoch": 0.4261763115197404, + "grad_norm": 1.901206374168396, + "learning_rate": 1.9856730343966522e-05, + "loss": 2.5667, + "mean_token_accuracy": 0.47864967584609985, + "num_tokens": 806055221.0, + "step": 1576 + }, + { + "epoch": 0.42644672796106003, + "grad_norm": 1.6812810897827148, + "learning_rate": 1.9856450034905254e-05, + "loss": 2.2642, + "mean_token_accuracy": 0.5062351226806641, + "num_tokens": 806546086.0, + "step": 1577 + }, + { + "epoch": 0.4267171444023797, + "grad_norm": 1.6069613695144653, + "learning_rate": 1.985616945410205e-05, + "loss": 2.4368, + "mean_token_accuracy": 0.4793426990509033, + "num_tokens": 807070280.0, + "step": 1578 + }, + { + "epoch": 0.4269875608436993, + "grad_norm": 1.7328635454177856, + "learning_rate": 1.9855888601565528e-05, + "loss": 2.3409, + "mean_token_accuracy": 0.4775872230529785, + "num_tokens": 807594552.0, + "step": 1579 + }, + { + "epoch": 0.4272579772850189, + "grad_norm": 1.7681350708007812, + "learning_rate": 1.9855607477304298e-05, + "loss": 2.4062, + "mean_token_accuracy": 0.48677170276641846, + "num_tokens": 808118775.0, + "step": 1580 + }, + { + "epoch": 0.42752839372633855, + "grad_norm": 0.8179428577423096, + "learning_rate": 1.9855326081326988e-05, + "loss": 1.2142, + "mean_token_accuracy": 0.6704235076904297, + "num_tokens": 808642897.0, + "step": 1581 + }, + { + "epoch": 0.4277988101676582, + "grad_norm": 4.256215572357178, + "learning_rate": 1.985504441364223e-05, + "loss": 2.2582, + "mean_token_accuracy": 0.5030946731567383, + "num_tokens": 809163617.0, + "step": 1582 + }, + { + "epoch": 0.42806922660897784, + "grad_norm": 3.196939706802368, + "learning_rate": 1.985476247425867e-05, + "loss": 2.4861, + "mean_token_accuracy": 0.47660842537879944, + "num_tokens": 809679941.0, + "step": 1583 + }, + { + "epoch": 0.4283396430502975, + "grad_norm": 2.1477112770080566, + "learning_rate": 1.9854480263184958e-05, + "loss": 2.4002, + "mean_token_accuracy": 0.4969020187854767, + "num_tokens": 810204174.0, + "step": 1584 + }, + { + "epoch": 0.42861005949161707, + "grad_norm": 2.340684175491333, + "learning_rate": 1.9854197780429753e-05, + "loss": 2.5703, + "mean_token_accuracy": 0.4642125964164734, + "num_tokens": 810728437.0, + "step": 1585 + }, + { + "epoch": 0.4288804759329367, + "grad_norm": 2.3896117210388184, + "learning_rate": 1.9853915026001717e-05, + "loss": 2.2869, + "mean_token_accuracy": 0.5068140625953674, + "num_tokens": 811239782.0, + "step": 1586 + }, + { + "epoch": 0.42915089237425635, + "grad_norm": 2.3225836753845215, + "learning_rate": 1.9853631999909533e-05, + "loss": 2.4253, + "mean_token_accuracy": 0.48599469661712646, + "num_tokens": 811750877.0, + "step": 1587 + }, + { + "epoch": 0.429421308815576, + "grad_norm": 1.6740336418151855, + "learning_rate": 1.9853348702161887e-05, + "loss": 2.388, + "mean_token_accuracy": 0.48387235403060913, + "num_tokens": 812275047.0, + "step": 1588 + }, + { + "epoch": 0.42969172525689564, + "grad_norm": 1.9557820558547974, + "learning_rate": 1.9853065132767463e-05, + "loss": 2.4087, + "mean_token_accuracy": 0.5182390213012695, + "num_tokens": 812725862.0, + "step": 1589 + }, + { + "epoch": 0.42996214169821523, + "grad_norm": 1.89043128490448, + "learning_rate": 1.9852781291734964e-05, + "loss": 2.4205, + "mean_token_accuracy": 0.49101370573043823, + "num_tokens": 813250029.0, + "step": 1590 + }, + { + "epoch": 0.43023255813953487, + "grad_norm": 1.9523024559020996, + "learning_rate": 1.9852497179073104e-05, + "loss": 2.3719, + "mean_token_accuracy": 0.5099279284477234, + "num_tokens": 813774311.0, + "step": 1591 + }, + { + "epoch": 0.4305029745808545, + "grad_norm": 2.030973196029663, + "learning_rate": 1.9852212794790593e-05, + "loss": 2.4355, + "mean_token_accuracy": 0.48188117146492004, + "num_tokens": 814298447.0, + "step": 1592 + }, + { + "epoch": 0.43077339102217416, + "grad_norm": 1.9746899604797363, + "learning_rate": 1.9851928138896163e-05, + "loss": 2.171, + "mean_token_accuracy": 0.5465145707130432, + "num_tokens": 814822662.0, + "step": 1593 + }, + { + "epoch": 0.4310438074634938, + "grad_norm": 1.9844608306884766, + "learning_rate": 1.9851643211398545e-05, + "loss": 2.3916, + "mean_token_accuracy": 0.4910687208175659, + "num_tokens": 815346824.0, + "step": 1594 + }, + { + "epoch": 0.4313142239048134, + "grad_norm": 1.6828131675720215, + "learning_rate": 1.9851358012306487e-05, + "loss": 2.2619, + "mean_token_accuracy": 0.5041022300720215, + "num_tokens": 815870965.0, + "step": 1595 + }, + { + "epoch": 0.43158464034613303, + "grad_norm": 1.8124197721481323, + "learning_rate": 1.985107254162873e-05, + "loss": 2.2846, + "mean_token_accuracy": 0.5163733959197998, + "num_tokens": 816276308.0, + "step": 1596 + }, + { + "epoch": 0.4318550567874527, + "grad_norm": 1.5610013008117676, + "learning_rate": 1.9850786799374037e-05, + "loss": 2.4749, + "mean_token_accuracy": 0.47964024543762207, + "num_tokens": 816744648.0, + "step": 1597 + }, + { + "epoch": 0.4321254732287723, + "grad_norm": 1.9760574102401733, + "learning_rate": 1.985050078555118e-05, + "loss": 2.319, + "mean_token_accuracy": 0.5058733224868774, + "num_tokens": 817214495.0, + "step": 1598 + }, + { + "epoch": 0.43239588967009196, + "grad_norm": 1.4955593347549438, + "learning_rate": 1.985021450016893e-05, + "loss": 2.4437, + "mean_token_accuracy": 0.5032685995101929, + "num_tokens": 817738770.0, + "step": 1599 + }, + { + "epoch": 0.43266630611141155, + "grad_norm": 1.9614695310592651, + "learning_rate": 1.984992794323607e-05, + "loss": 2.4116, + "mean_token_accuracy": 0.4981076121330261, + "num_tokens": 818262994.0, + "step": 1600 + }, + { + "epoch": 0.4329367225527312, + "grad_norm": 0.9456242918968201, + "learning_rate": 1.98496411147614e-05, + "loss": 1.2928, + "mean_token_accuracy": 0.6635696887969971, + "num_tokens": 818787252.0, + "step": 1601 + }, + { + "epoch": 0.43320713899405083, + "grad_norm": 2.5171656608581543, + "learning_rate": 1.9849354014753712e-05, + "loss": 2.3448, + "mean_token_accuracy": 0.5120751857757568, + "num_tokens": 819249822.0, + "step": 1602 + }, + { + "epoch": 0.4334775554353705, + "grad_norm": 2.0969114303588867, + "learning_rate": 1.9849066643221817e-05, + "loss": 2.4554, + "mean_token_accuracy": 0.4783908426761627, + "num_tokens": 819773990.0, + "step": 1603 + }, + { + "epoch": 0.4337479718766901, + "grad_norm": 1.8155839443206787, + "learning_rate": 1.9848779000174538e-05, + "loss": 2.2572, + "mean_token_accuracy": 0.4954547882080078, + "num_tokens": 820260271.0, + "step": 1604 + }, + { + "epoch": 0.43401838831800976, + "grad_norm": 1.8482130765914917, + "learning_rate": 1.9848491085620693e-05, + "loss": 2.3895, + "mean_token_accuracy": 0.507729172706604, + "num_tokens": 820783436.0, + "step": 1605 + }, + { + "epoch": 0.43428880475932935, + "grad_norm": 1.981575608253479, + "learning_rate": 1.9848202899569122e-05, + "loss": 2.3801, + "mean_token_accuracy": 0.5031902194023132, + "num_tokens": 821264671.0, + "step": 1606 + }, + { + "epoch": 0.434559221200649, + "grad_norm": 1.711276650428772, + "learning_rate": 1.9847914442028665e-05, + "loss": 2.3139, + "mean_token_accuracy": 0.4862903356552124, + "num_tokens": 821788809.0, + "step": 1607 + }, + { + "epoch": 0.43482963764196864, + "grad_norm": 1.61264967918396, + "learning_rate": 1.9847625713008175e-05, + "loss": 2.5045, + "mean_token_accuracy": 0.4819161295890808, + "num_tokens": 822313048.0, + "step": 1608 + }, + { + "epoch": 0.4351000540832883, + "grad_norm": 2.0292770862579346, + "learning_rate": 1.9847336712516503e-05, + "loss": 2.4282, + "mean_token_accuracy": 0.509305477142334, + "num_tokens": 822807269.0, + "step": 1609 + }, + { + "epoch": 0.4353704705246079, + "grad_norm": 1.6393375396728516, + "learning_rate": 1.9847047440562527e-05, + "loss": 2.3213, + "mean_token_accuracy": 0.5113201141357422, + "num_tokens": 823300535.0, + "step": 1610 + }, + { + "epoch": 0.4356408869659275, + "grad_norm": 1.624848484992981, + "learning_rate": 1.984675789715512e-05, + "loss": 2.308, + "mean_token_accuracy": 0.5030292272567749, + "num_tokens": 823824697.0, + "step": 1611 + }, + { + "epoch": 0.43591130340724715, + "grad_norm": 1.560818076133728, + "learning_rate": 1.984646808230316e-05, + "loss": 2.3373, + "mean_token_accuracy": 0.501541793346405, + "num_tokens": 824330212.0, + "step": 1612 + }, + { + "epoch": 0.4361817198485668, + "grad_norm": 2.1847329139709473, + "learning_rate": 1.9846177996015543e-05, + "loss": 2.4254, + "mean_token_accuracy": 0.4954491853713989, + "num_tokens": 824854425.0, + "step": 1613 + }, + { + "epoch": 0.43645213628988644, + "grad_norm": 1.4615072011947632, + "learning_rate": 1.984588763830117e-05, + "loss": 2.3108, + "mean_token_accuracy": 0.5021845698356628, + "num_tokens": 825378628.0, + "step": 1614 + }, + { + "epoch": 0.4367225527312061, + "grad_norm": 1.6207436323165894, + "learning_rate": 1.9845597009168956e-05, + "loss": 2.5089, + "mean_token_accuracy": 0.5131390690803528, + "num_tokens": 825839881.0, + "step": 1615 + }, + { + "epoch": 0.43699296917252567, + "grad_norm": 1.4549152851104736, + "learning_rate": 1.9845306108627806e-05, + "loss": 2.3737, + "mean_token_accuracy": 0.4928216338157654, + "num_tokens": 826350719.0, + "step": 1616 + }, + { + "epoch": 0.4372633856138453, + "grad_norm": 1.3967525959014893, + "learning_rate": 1.9845014936686654e-05, + "loss": 2.3227, + "mean_token_accuracy": 0.5075348615646362, + "num_tokens": 826874964.0, + "step": 1617 + }, + { + "epoch": 0.43753380205516496, + "grad_norm": 1.5419175624847412, + "learning_rate": 1.9844723493354434e-05, + "loss": 2.3974, + "mean_token_accuracy": 0.5024611949920654, + "num_tokens": 827362375.0, + "step": 1618 + }, + { + "epoch": 0.4378042184964846, + "grad_norm": 1.3164137601852417, + "learning_rate": 1.9844431778640086e-05, + "loss": 2.2824, + "mean_token_accuracy": 0.5025132298469543, + "num_tokens": 827886544.0, + "step": 1619 + }, + { + "epoch": 0.43807463493780424, + "grad_norm": 1.880985140800476, + "learning_rate": 1.9844139792552563e-05, + "loss": 2.3224, + "mean_token_accuracy": 0.5035176873207092, + "num_tokens": 828410775.0, + "step": 1620 + }, + { + "epoch": 0.43834505137912383, + "grad_norm": 1.4049522876739502, + "learning_rate": 1.9843847535100816e-05, + "loss": 1.3385, + "mean_token_accuracy": 0.6532000303268433, + "num_tokens": 828934912.0, + "step": 1621 + }, + { + "epoch": 0.43861546782044347, + "grad_norm": 1.8866645097732544, + "learning_rate": 1.9843555006293825e-05, + "loss": 2.3307, + "mean_token_accuracy": 0.5122679471969604, + "num_tokens": 829394947.0, + "step": 1622 + }, + { + "epoch": 0.4388858842617631, + "grad_norm": 1.4751715660095215, + "learning_rate": 1.9843262206140558e-05, + "loss": 2.3923, + "mean_token_accuracy": 0.5140177607536316, + "num_tokens": 829854931.0, + "step": 1623 + }, + { + "epoch": 0.43915630070308276, + "grad_norm": 1.7864881753921509, + "learning_rate": 1.9842969134649997e-05, + "loss": 2.4609, + "mean_token_accuracy": 0.4906575381755829, + "num_tokens": 830379078.0, + "step": 1624 + }, + { + "epoch": 0.4394267171444024, + "grad_norm": 1.4695637226104736, + "learning_rate": 1.984267579183114e-05, + "loss": 2.4371, + "mean_token_accuracy": 0.5027257204055786, + "num_tokens": 830850478.0, + "step": 1625 + }, + { + "epoch": 0.439697133585722, + "grad_norm": 1.6939936876296997, + "learning_rate": 1.9842382177692986e-05, + "loss": 2.3722, + "mean_token_accuracy": 0.5055364966392517, + "num_tokens": 831311099.0, + "step": 1626 + }, + { + "epoch": 0.43996755002704163, + "grad_norm": 2.1680006980895996, + "learning_rate": 1.984208829224454e-05, + "loss": 2.4961, + "mean_token_accuracy": 0.48026785254478455, + "num_tokens": 831835256.0, + "step": 1627 + }, + { + "epoch": 0.4402379664683613, + "grad_norm": 1.7334681749343872, + "learning_rate": 1.9841794135494823e-05, + "loss": 2.2055, + "mean_token_accuracy": 0.5147911310195923, + "num_tokens": 832321299.0, + "step": 1628 + }, + { + "epoch": 0.4405083829096809, + "grad_norm": 1.7409250736236572, + "learning_rate": 1.984149970745286e-05, + "loss": 2.4496, + "mean_token_accuracy": 0.47122809290885925, + "num_tokens": 832845503.0, + "step": 1629 + }, + { + "epoch": 0.44077879935100056, + "grad_norm": 1.932552456855774, + "learning_rate": 1.9841205008127687e-05, + "loss": 2.4892, + "mean_token_accuracy": 0.49176496267318726, + "num_tokens": 833369590.0, + "step": 1630 + }, + { + "epoch": 0.44104921579232015, + "grad_norm": 1.775981068611145, + "learning_rate": 1.9840910037528343e-05, + "loss": 2.3557, + "mean_token_accuracy": 0.4887159466743469, + "num_tokens": 833893791.0, + "step": 1631 + }, + { + "epoch": 0.4413196322336398, + "grad_norm": 1.236440896987915, + "learning_rate": 1.9840614795663878e-05, + "loss": 2.2844, + "mean_token_accuracy": 0.5099225044250488, + "num_tokens": 834418025.0, + "step": 1632 + }, + { + "epoch": 0.44159004867495943, + "grad_norm": 1.9196988344192505, + "learning_rate": 1.9840319282543353e-05, + "loss": 2.3829, + "mean_token_accuracy": 0.4875686764717102, + "num_tokens": 834942185.0, + "step": 1633 + }, + { + "epoch": 0.4418604651162791, + "grad_norm": 1.571706771850586, + "learning_rate": 1.9840023498175837e-05, + "loss": 2.3561, + "mean_token_accuracy": 0.49871575832366943, + "num_tokens": 835466209.0, + "step": 1634 + }, + { + "epoch": 0.4421308815575987, + "grad_norm": 1.7500075101852417, + "learning_rate": 1.98397274425704e-05, + "loss": 2.3926, + "mean_token_accuracy": 0.49827849864959717, + "num_tokens": 835990422.0, + "step": 1635 + }, + { + "epoch": 0.4424012979989183, + "grad_norm": 1.4783744812011719, + "learning_rate": 1.9839431115736132e-05, + "loss": 2.3739, + "mean_token_accuracy": 0.49282151460647583, + "num_tokens": 836514607.0, + "step": 1636 + }, + { + "epoch": 0.44267171444023795, + "grad_norm": 1.4724724292755127, + "learning_rate": 1.9839134517682123e-05, + "loss": 2.3745, + "mean_token_accuracy": 0.49694880843162537, + "num_tokens": 837038843.0, + "step": 1637 + }, + { + "epoch": 0.4429421308815576, + "grad_norm": 1.5462321043014526, + "learning_rate": 1.9838837648417472e-05, + "loss": 2.2492, + "mean_token_accuracy": 0.5195533037185669, + "num_tokens": 837563056.0, + "step": 1638 + }, + { + "epoch": 0.44321254732287724, + "grad_norm": 1.6845179796218872, + "learning_rate": 1.9838540507951285e-05, + "loss": 2.3287, + "mean_token_accuracy": 0.5013333559036255, + "num_tokens": 838087139.0, + "step": 1639 + }, + { + "epoch": 0.4434829637641969, + "grad_norm": 1.7343324422836304, + "learning_rate": 1.9838243096292688e-05, + "loss": 2.3289, + "mean_token_accuracy": 0.4901735782623291, + "num_tokens": 838611320.0, + "step": 1640 + }, + { + "epoch": 0.44375338020551647, + "grad_norm": 0.8566964268684387, + "learning_rate": 1.9837945413450797e-05, + "loss": 1.2256, + "mean_token_accuracy": 0.6976065635681152, + "num_tokens": 839052909.0, + "step": 1641 + }, + { + "epoch": 0.4440237966468361, + "grad_norm": 3.1478991508483887, + "learning_rate": 1.983764745943475e-05, + "loss": 2.5967, + "mean_token_accuracy": 0.45760101079940796, + "num_tokens": 839577094.0, + "step": 1642 + }, + { + "epoch": 0.44429421308815575, + "grad_norm": 3.6416165828704834, + "learning_rate": 1.9837349234253692e-05, + "loss": 2.2214, + "mean_token_accuracy": 0.5234031677246094, + "num_tokens": 840101349.0, + "step": 1643 + }, + { + "epoch": 0.4445646295294754, + "grad_norm": 1.946839690208435, + "learning_rate": 1.983705073791677e-05, + "loss": 2.3996, + "mean_token_accuracy": 0.48943740129470825, + "num_tokens": 840599141.0, + "step": 1644 + }, + { + "epoch": 0.44483504597079504, + "grad_norm": 1.759357213973999, + "learning_rate": 1.983675197043314e-05, + "loss": 2.249, + "mean_token_accuracy": 0.49490803480148315, + "num_tokens": 841123308.0, + "step": 1645 + }, + { + "epoch": 0.4451054624121147, + "grad_norm": 2.174914598464966, + "learning_rate": 1.9836452931811978e-05, + "loss": 2.3782, + "mean_token_accuracy": 0.5043075680732727, + "num_tokens": 841593267.0, + "step": 1646 + }, + { + "epoch": 0.44537587885343427, + "grad_norm": 1.9854156970977783, + "learning_rate": 1.983615362206245e-05, + "loss": 2.3163, + "mean_token_accuracy": 0.49089789390563965, + "num_tokens": 842117359.0, + "step": 1647 + }, + { + "epoch": 0.4456462952947539, + "grad_norm": 1.7129119634628296, + "learning_rate": 1.9835854041193745e-05, + "loss": 2.3352, + "mean_token_accuracy": 0.4971337914466858, + "num_tokens": 842616464.0, + "step": 1648 + }, + { + "epoch": 0.44591671173607356, + "grad_norm": 1.9317681789398193, + "learning_rate": 1.983555418921505e-05, + "loss": 2.338, + "mean_token_accuracy": 0.48948314785957336, + "num_tokens": 843140523.0, + "step": 1649 + }, + { + "epoch": 0.4461871281773932, + "grad_norm": 1.9372894763946533, + "learning_rate": 1.9835254066135574e-05, + "loss": 2.337, + "mean_token_accuracy": 0.4812142848968506, + "num_tokens": 843664774.0, + "step": 1650 + }, + { + "epoch": 0.44645754461871284, + "grad_norm": 1.2550606727600098, + "learning_rate": 1.9834953671964516e-05, + "loss": 2.2842, + "mean_token_accuracy": 0.5142459869384766, + "num_tokens": 844188976.0, + "step": 1651 + }, + { + "epoch": 0.44672796106003243, + "grad_norm": 1.8159162998199463, + "learning_rate": 1.9834653006711103e-05, + "loss": 2.4258, + "mean_token_accuracy": 0.47198179364204407, + "num_tokens": 844713232.0, + "step": 1652 + }, + { + "epoch": 0.4469983775013521, + "grad_norm": 1.7658368349075317, + "learning_rate": 1.9834352070384548e-05, + "loss": 2.2362, + "mean_token_accuracy": 0.509075939655304, + "num_tokens": 845211689.0, + "step": 1653 + }, + { + "epoch": 0.4472687939426717, + "grad_norm": 1.5720014572143555, + "learning_rate": 1.9834050862994094e-05, + "loss": 2.3844, + "mean_token_accuracy": 0.49150601029396057, + "num_tokens": 845735952.0, + "step": 1654 + }, + { + "epoch": 0.44753921038399136, + "grad_norm": 1.4671231508255005, + "learning_rate": 1.983374938454898e-05, + "loss": 2.2626, + "mean_token_accuracy": 0.498778760433197, + "num_tokens": 846260101.0, + "step": 1655 + }, + { + "epoch": 0.447809626825311, + "grad_norm": 1.7562189102172852, + "learning_rate": 1.9833447635058457e-05, + "loss": 2.4247, + "mean_token_accuracy": 0.4984366297721863, + "num_tokens": 846784315.0, + "step": 1656 + }, + { + "epoch": 0.4480800432666306, + "grad_norm": 1.3624659776687622, + "learning_rate": 1.9833145614531783e-05, + "loss": 2.3387, + "mean_token_accuracy": 0.5035749077796936, + "num_tokens": 847308511.0, + "step": 1657 + }, + { + "epoch": 0.44835045970795023, + "grad_norm": 1.6996604204177856, + "learning_rate": 1.9832843322978225e-05, + "loss": 2.1982, + "mean_token_accuracy": 0.5010300874710083, + "num_tokens": 847832738.0, + "step": 1658 + }, + { + "epoch": 0.4486208761492699, + "grad_norm": 1.8168416023254395, + "learning_rate": 1.983254076040706e-05, + "loss": 2.3046, + "mean_token_accuracy": 0.4924180507659912, + "num_tokens": 848357003.0, + "step": 1659 + }, + { + "epoch": 0.4488912925905895, + "grad_norm": 1.452141523361206, + "learning_rate": 1.983223792682757e-05, + "loss": 2.3366, + "mean_token_accuracy": 0.5115615129470825, + "num_tokens": 848881261.0, + "step": 1660 + }, + { + "epoch": 0.44916170903190916, + "grad_norm": 0.9876728653907776, + "learning_rate": 1.9831934822249043e-05, + "loss": 1.1488, + "mean_token_accuracy": 0.6979063749313354, + "num_tokens": 849389217.0, + "step": 1661 + }, + { + "epoch": 0.44943212547322875, + "grad_norm": 3.109041690826416, + "learning_rate": 1.9831631446680785e-05, + "loss": 2.4043, + "mean_token_accuracy": 0.4939132034778595, + "num_tokens": 849913416.0, + "step": 1662 + }, + { + "epoch": 0.4497025419145484, + "grad_norm": 2.683967113494873, + "learning_rate": 1.98313278001321e-05, + "loss": 2.2909, + "mean_token_accuracy": 0.5108497738838196, + "num_tokens": 850437686.0, + "step": 1663 + }, + { + "epoch": 0.44997295835586804, + "grad_norm": 1.7133145332336426, + "learning_rate": 1.983102388261231e-05, + "loss": 2.556, + "mean_token_accuracy": 0.4568818211555481, + "num_tokens": 850961860.0, + "step": 1664 + }, + { + "epoch": 0.4502433747971877, + "grad_norm": 2.4242491722106934, + "learning_rate": 1.9830719694130732e-05, + "loss": 2.5418, + "mean_token_accuracy": 0.47621995210647583, + "num_tokens": 851424259.0, + "step": 1665 + }, + { + "epoch": 0.4505137912385073, + "grad_norm": 2.0388360023498535, + "learning_rate": 1.9830415234696706e-05, + "loss": 2.1316, + "mean_token_accuracy": 0.5040064454078674, + "num_tokens": 851941252.0, + "step": 1666 + }, + { + "epoch": 0.4507842076798269, + "grad_norm": 1.908827781677246, + "learning_rate": 1.9830110504319574e-05, + "loss": 2.2416, + "mean_token_accuracy": 0.5099035501480103, + "num_tokens": 852465482.0, + "step": 1667 + }, + { + "epoch": 0.45105462412114655, + "grad_norm": 1.861911654472351, + "learning_rate": 1.9829805503008684e-05, + "loss": 2.2228, + "mean_token_accuracy": 0.4890851080417633, + "num_tokens": 852989671.0, + "step": 1668 + }, + { + "epoch": 0.4513250405624662, + "grad_norm": 2.1010799407958984, + "learning_rate": 1.9829500230773395e-05, + "loss": 2.4024, + "mean_token_accuracy": 0.4930694103240967, + "num_tokens": 853455020.0, + "step": 1669 + }, + { + "epoch": 0.45159545700378584, + "grad_norm": 1.7147736549377441, + "learning_rate": 1.982919468762307e-05, + "loss": 2.3518, + "mean_token_accuracy": 0.4835212230682373, + "num_tokens": 853979267.0, + "step": 1670 + }, + { + "epoch": 0.4518658734451055, + "grad_norm": 1.965742826461792, + "learning_rate": 1.982888887356709e-05, + "loss": 2.4473, + "mean_token_accuracy": 0.5006893873214722, + "num_tokens": 854503471.0, + "step": 1671 + }, + { + "epoch": 0.45213628988642507, + "grad_norm": 1.486236333847046, + "learning_rate": 1.9828582788614832e-05, + "loss": 2.2321, + "mean_token_accuracy": 0.5093939304351807, + "num_tokens": 855027517.0, + "step": 1672 + }, + { + "epoch": 0.4524067063277447, + "grad_norm": 1.6014484167099, + "learning_rate": 1.9828276432775695e-05, + "loss": 2.3464, + "mean_token_accuracy": 0.5057798624038696, + "num_tokens": 855517601.0, + "step": 1673 + }, + { + "epoch": 0.45267712276906436, + "grad_norm": 1.7834352254867554, + "learning_rate": 1.9827969806059076e-05, + "loss": 2.2073, + "mean_token_accuracy": 0.508568286895752, + "num_tokens": 855986416.0, + "step": 1674 + }, + { + "epoch": 0.452947539210384, + "grad_norm": 1.3900532722473145, + "learning_rate": 1.9827662908474375e-05, + "loss": 2.2046, + "mean_token_accuracy": 0.5101585984230042, + "num_tokens": 856510650.0, + "step": 1675 + }, + { + "epoch": 0.45321795565170364, + "grad_norm": 1.5197662115097046, + "learning_rate": 1.9827355740031022e-05, + "loss": 2.3595, + "mean_token_accuracy": 0.5038032531738281, + "num_tokens": 857029768.0, + "step": 1676 + }, + { + "epoch": 0.45348837209302323, + "grad_norm": 1.4734785556793213, + "learning_rate": 1.982704830073843e-05, + "loss": 2.3546, + "mean_token_accuracy": 0.504706084728241, + "num_tokens": 857553971.0, + "step": 1677 + }, + { + "epoch": 0.45375878853434287, + "grad_norm": 1.8791780471801758, + "learning_rate": 1.9826740590606042e-05, + "loss": 2.2579, + "mean_token_accuracy": 0.5080627799034119, + "num_tokens": 858078245.0, + "step": 1678 + }, + { + "epoch": 0.4540292049756625, + "grad_norm": 1.7239434719085693, + "learning_rate": 1.9826432609643295e-05, + "loss": 2.3397, + "mean_token_accuracy": 0.5108177661895752, + "num_tokens": 858579976.0, + "step": 1679 + }, + { + "epoch": 0.45429962141698216, + "grad_norm": 2.009592294692993, + "learning_rate": 1.9826124357859635e-05, + "loss": 2.2978, + "mean_token_accuracy": 0.5069150328636169, + "num_tokens": 859104168.0, + "step": 1680 + }, + { + "epoch": 0.4545700378583018, + "grad_norm": 1.1002202033996582, + "learning_rate": 1.9825815835264522e-05, + "loss": 1.2772, + "mean_token_accuracy": 0.6590325832366943, + "num_tokens": 859592696.0, + "step": 1681 + }, + { + "epoch": 0.45484045429962144, + "grad_norm": 3.397482395172119, + "learning_rate": 1.982550704186743e-05, + "loss": 2.4044, + "mean_token_accuracy": 0.48586297035217285, + "num_tokens": 860085234.0, + "step": 1682 + }, + { + "epoch": 0.45511087074094103, + "grad_norm": 1.8098663091659546, + "learning_rate": 1.9825197977677825e-05, + "loss": 2.3467, + "mean_token_accuracy": 0.48419398069381714, + "num_tokens": 860609390.0, + "step": 1683 + }, + { + "epoch": 0.4553812871822607, + "grad_norm": 1.5323156118392944, + "learning_rate": 1.9824888642705193e-05, + "loss": 2.3858, + "mean_token_accuracy": 0.5008026957511902, + "num_tokens": 861132478.0, + "step": 1684 + }, + { + "epoch": 0.4556517036235803, + "grad_norm": 1.8369063138961792, + "learning_rate": 1.9824579036959025e-05, + "loss": 2.4859, + "mean_token_accuracy": 0.49036186933517456, + "num_tokens": 861656611.0, + "step": 1685 + }, + { + "epoch": 0.45592212006489996, + "grad_norm": 1.477738618850708, + "learning_rate": 1.982426916044882e-05, + "loss": 2.2554, + "mean_token_accuracy": 0.4972376823425293, + "num_tokens": 862180885.0, + "step": 1686 + }, + { + "epoch": 0.4561925365062196, + "grad_norm": 1.7304049730300903, + "learning_rate": 1.982395901318409e-05, + "loss": 2.4938, + "mean_token_accuracy": 0.47394704818725586, + "num_tokens": 862704963.0, + "step": 1687 + }, + { + "epoch": 0.4564629529475392, + "grad_norm": 1.6308283805847168, + "learning_rate": 1.9823648595174344e-05, + "loss": 2.3258, + "mean_token_accuracy": 0.4854554533958435, + "num_tokens": 863229081.0, + "step": 1688 + }, + { + "epoch": 0.45673336938885883, + "grad_norm": 1.829442024230957, + "learning_rate": 1.9823337906429115e-05, + "loss": 2.5383, + "mean_token_accuracy": 0.4700135290622711, + "num_tokens": 863753352.0, + "step": 1689 + }, + { + "epoch": 0.4570037858301785, + "grad_norm": 1.6266818046569824, + "learning_rate": 1.9823026946957926e-05, + "loss": 2.1677, + "mean_token_accuracy": 0.5210749506950378, + "num_tokens": 864197221.0, + "step": 1690 + }, + { + "epoch": 0.4572742022714981, + "grad_norm": 1.5000113248825073, + "learning_rate": 1.982271571677033e-05, + "loss": 2.3349, + "mean_token_accuracy": 0.49272018671035767, + "num_tokens": 864721369.0, + "step": 1691 + }, + { + "epoch": 0.45754461871281776, + "grad_norm": 2.120631456375122, + "learning_rate": 1.9822404215875863e-05, + "loss": 2.4639, + "mean_token_accuracy": 0.49073341488838196, + "num_tokens": 865190014.0, + "step": 1692 + }, + { + "epoch": 0.45781503515413735, + "grad_norm": 1.7907218933105469, + "learning_rate": 1.982209244428409e-05, + "loss": 2.4805, + "mean_token_accuracy": 0.4775974750518799, + "num_tokens": 865714197.0, + "step": 1693 + }, + { + "epoch": 0.458085451595457, + "grad_norm": 2.0561866760253906, + "learning_rate": 1.9821780402004584e-05, + "loss": 2.2061, + "mean_token_accuracy": 0.519152820110321, + "num_tokens": 866181237.0, + "step": 1694 + }, + { + "epoch": 0.45835586803677664, + "grad_norm": 1.946363925933838, + "learning_rate": 1.9821468089046906e-05, + "loss": 2.4872, + "mean_token_accuracy": 0.49279364943504333, + "num_tokens": 866705513.0, + "step": 1695 + }, + { + "epoch": 0.4586262844780963, + "grad_norm": 1.5347498655319214, + "learning_rate": 1.9821155505420642e-05, + "loss": 2.2411, + "mean_token_accuracy": 0.5036203861236572, + "num_tokens": 867229708.0, + "step": 1696 + }, + { + "epoch": 0.4588967009194159, + "grad_norm": 1.5928092002868652, + "learning_rate": 1.9820842651135396e-05, + "loss": 2.2649, + "mean_token_accuracy": 0.5126684904098511, + "num_tokens": 867753924.0, + "step": 1697 + }, + { + "epoch": 0.4591671173607355, + "grad_norm": 1.7293293476104736, + "learning_rate": 1.982052952620075e-05, + "loss": 2.5296, + "mean_token_accuracy": 0.46318480372428894, + "num_tokens": 868278050.0, + "step": 1698 + }, + { + "epoch": 0.45943753380205515, + "grad_norm": 1.5809205770492554, + "learning_rate": 1.982021613062632e-05, + "loss": 2.2109, + "mean_token_accuracy": 0.5164775848388672, + "num_tokens": 868802330.0, + "step": 1699 + }, + { + "epoch": 0.4597079502433748, + "grad_norm": 1.7323577404022217, + "learning_rate": 1.9819902464421716e-05, + "loss": 2.4207, + "mean_token_accuracy": 0.4892738163471222, + "num_tokens": 869270808.0, + "step": 1700 + }, + { + "epoch": 0.45997836668469444, + "grad_norm": 2.1888465881347656, + "learning_rate": 1.981958852759657e-05, + "loss": 1.3042, + "mean_token_accuracy": 0.6609374284744263, + "num_tokens": 869795019.0, + "step": 1701 + }, + { + "epoch": 0.4602487831260141, + "grad_norm": 3.3555402755737305, + "learning_rate": 1.9819274320160515e-05, + "loss": 2.3396, + "mean_token_accuracy": 0.515528678894043, + "num_tokens": 870259295.0, + "step": 1702 + }, + { + "epoch": 0.46051919956733367, + "grad_norm": 2.86444354057312, + "learning_rate": 1.9818959842123184e-05, + "loss": 2.4937, + "mean_token_accuracy": 0.5017523765563965, + "num_tokens": 870744432.0, + "step": 1703 + }, + { + "epoch": 0.4607896160086533, + "grad_norm": 2.3627123832702637, + "learning_rate": 1.981864509349423e-05, + "loss": 2.3871, + "mean_token_accuracy": 0.49659061431884766, + "num_tokens": 871268697.0, + "step": 1704 + }, + { + "epoch": 0.46106003244997296, + "grad_norm": 2.5336389541625977, + "learning_rate": 1.9818330074283314e-05, + "loss": 2.4398, + "mean_token_accuracy": 0.49837449193000793, + "num_tokens": 871792840.0, + "step": 1705 + }, + { + "epoch": 0.4613304488912926, + "grad_norm": 2.5708043575286865, + "learning_rate": 1.9818014784500097e-05, + "loss": 2.3212, + "mean_token_accuracy": 0.5139374732971191, + "num_tokens": 872304735.0, + "step": 1706 + }, + { + "epoch": 0.46160086533261224, + "grad_norm": 2.6034998893737793, + "learning_rate": 1.981769922415425e-05, + "loss": 2.3331, + "mean_token_accuracy": 0.48153650760650635, + "num_tokens": 872828997.0, + "step": 1707 + }, + { + "epoch": 0.46187128177393183, + "grad_norm": 1.8831329345703125, + "learning_rate": 1.9817383393255464e-05, + "loss": 2.3974, + "mean_token_accuracy": 0.48877963423728943, + "num_tokens": 873353186.0, + "step": 1708 + }, + { + "epoch": 0.4621416982152515, + "grad_norm": 1.9853432178497314, + "learning_rate": 1.9817067291813428e-05, + "loss": 2.4468, + "mean_token_accuracy": 0.4693445861339569, + "num_tokens": 873877459.0, + "step": 1709 + }, + { + "epoch": 0.4624121146565711, + "grad_norm": 1.985938310623169, + "learning_rate": 1.9816750919837834e-05, + "loss": 2.4078, + "mean_token_accuracy": 0.4968075156211853, + "num_tokens": 874401646.0, + "step": 1710 + }, + { + "epoch": 0.46268253109789076, + "grad_norm": 1.7204492092132568, + "learning_rate": 1.9816434277338396e-05, + "loss": 2.2342, + "mean_token_accuracy": 0.5152949094772339, + "num_tokens": 874925918.0, + "step": 1711 + }, + { + "epoch": 0.4629529475392104, + "grad_norm": 1.917206048965454, + "learning_rate": 1.9816117364324823e-05, + "loss": 2.4132, + "mean_token_accuracy": 0.496183305978775, + "num_tokens": 875394481.0, + "step": 1712 + }, + { + "epoch": 0.46322336398053, + "grad_norm": 1.7797491550445557, + "learning_rate": 1.9815800180806844e-05, + "loss": 2.2467, + "mean_token_accuracy": 0.5107963681221008, + "num_tokens": 875918726.0, + "step": 1713 + }, + { + "epoch": 0.46349378042184963, + "grad_norm": 1.9260529279708862, + "learning_rate": 1.9815482726794192e-05, + "loss": 2.3339, + "mean_token_accuracy": 0.4929620623588562, + "num_tokens": 876442823.0, + "step": 1714 + }, + { + "epoch": 0.4637641968631693, + "grad_norm": 1.6264008283615112, + "learning_rate": 1.9815165002296605e-05, + "loss": 2.3195, + "mean_token_accuracy": 0.514434278011322, + "num_tokens": 876913999.0, + "step": 1715 + }, + { + "epoch": 0.4640346133044889, + "grad_norm": 1.8530218601226807, + "learning_rate": 1.981484700732383e-05, + "loss": 2.3669, + "mean_token_accuracy": 0.5110722184181213, + "num_tokens": 877380345.0, + "step": 1716 + }, + { + "epoch": 0.46430502974580856, + "grad_norm": 1.8836880922317505, + "learning_rate": 1.9814528741885626e-05, + "loss": 2.5052, + "mean_token_accuracy": 0.47955894470214844, + "num_tokens": 877904622.0, + "step": 1717 + }, + { + "epoch": 0.4645754461871282, + "grad_norm": 2.281597852706909, + "learning_rate": 1.981421020599176e-05, + "loss": 2.2944, + "mean_token_accuracy": 0.5099891424179077, + "num_tokens": 878428863.0, + "step": 1718 + }, + { + "epoch": 0.4648458626284478, + "grad_norm": 2.0110256671905518, + "learning_rate": 1.9813891399652e-05, + "loss": 2.2121, + "mean_token_accuracy": 0.5171778202056885, + "num_tokens": 878920005.0, + "step": 1719 + }, + { + "epoch": 0.46511627906976744, + "grad_norm": 5.744056224822998, + "learning_rate": 1.9813572322876135e-05, + "loss": 2.1767, + "mean_token_accuracy": 0.5062112808227539, + "num_tokens": 879444057.0, + "step": 1720 + }, + { + "epoch": 0.4653866955110871, + "grad_norm": 2.8470492362976074, + "learning_rate": 1.9813252975673953e-05, + "loss": 1.2725, + "mean_token_accuracy": 0.6650217771530151, + "num_tokens": 879968067.0, + "step": 1721 + }, + { + "epoch": 0.4656571119524067, + "grad_norm": 4.216518878936768, + "learning_rate": 1.9812933358055252e-05, + "loss": 2.2827, + "mean_token_accuracy": 0.5169744491577148, + "num_tokens": 880492341.0, + "step": 1722 + }, + { + "epoch": 0.46592752839372636, + "grad_norm": 3.396787166595459, + "learning_rate": 1.9812613470029834e-05, + "loss": 2.4124, + "mean_token_accuracy": 0.4835171103477478, + "num_tokens": 880989764.0, + "step": 1723 + }, + { + "epoch": 0.46619794483504595, + "grad_norm": 1.8172228336334229, + "learning_rate": 1.981229331160752e-05, + "loss": 2.2121, + "mean_token_accuracy": 0.4959483742713928, + "num_tokens": 881514002.0, + "step": 1724 + }, + { + "epoch": 0.4664683612763656, + "grad_norm": 2.301685333251953, + "learning_rate": 1.9811972882798137e-05, + "loss": 2.3032, + "mean_token_accuracy": 0.5054715275764465, + "num_tokens": 881980682.0, + "step": 1725 + }, + { + "epoch": 0.46673877771768524, + "grad_norm": 2.4840586185455322, + "learning_rate": 1.981165218361151e-05, + "loss": 2.3897, + "mean_token_accuracy": 0.4916086792945862, + "num_tokens": 882504787.0, + "step": 1726 + }, + { + "epoch": 0.4670091941590049, + "grad_norm": 1.8361239433288574, + "learning_rate": 1.981133121405748e-05, + "loss": 2.3407, + "mean_token_accuracy": 0.49407482147216797, + "num_tokens": 883028880.0, + "step": 1727 + }, + { + "epoch": 0.4672796106003245, + "grad_norm": 2.0133719444274902, + "learning_rate": 1.9811009974145893e-05, + "loss": 2.4914, + "mean_token_accuracy": 0.5021611452102661, + "num_tokens": 883472462.0, + "step": 1728 + }, + { + "epoch": 0.4675500270416441, + "grad_norm": 1.6857038736343384, + "learning_rate": 1.981068846388661e-05, + "loss": 2.3088, + "mean_token_accuracy": 0.5064984560012817, + "num_tokens": 883996735.0, + "step": 1729 + }, + { + "epoch": 0.46782044348296375, + "grad_norm": 1.7454007863998413, + "learning_rate": 1.9810366683289498e-05, + "loss": 2.1183, + "mean_token_accuracy": 0.525471568107605, + "num_tokens": 884520828.0, + "step": 1730 + }, + { + "epoch": 0.4680908599242834, + "grad_norm": 1.700032353401184, + "learning_rate": 1.981004463236442e-05, + "loss": 2.378, + "mean_token_accuracy": 0.48438626527786255, + "num_tokens": 884993471.0, + "step": 1731 + }, + { + "epoch": 0.46836127636560304, + "grad_norm": 1.8354653120040894, + "learning_rate": 1.9809722311121268e-05, + "loss": 2.4635, + "mean_token_accuracy": 0.4962981939315796, + "num_tokens": 885513624.0, + "step": 1732 + }, + { + "epoch": 0.4686316928069227, + "grad_norm": 2.0442776679992676, + "learning_rate": 1.980939971956993e-05, + "loss": 2.3481, + "mean_token_accuracy": 0.5030238628387451, + "num_tokens": 885983424.0, + "step": 1733 + }, + { + "epoch": 0.46890210924824227, + "grad_norm": 1.9044724702835083, + "learning_rate": 1.9809076857720298e-05, + "loss": 2.3754, + "mean_token_accuracy": 0.49294814467430115, + "num_tokens": 886507706.0, + "step": 1734 + }, + { + "epoch": 0.4691725256895619, + "grad_norm": 1.6935739517211914, + "learning_rate": 1.980875372558228e-05, + "loss": 2.3959, + "mean_token_accuracy": 0.5036360025405884, + "num_tokens": 887001942.0, + "step": 1735 + }, + { + "epoch": 0.46944294213088156, + "grad_norm": 2.069868803024292, + "learning_rate": 1.9808430323165797e-05, + "loss": 2.3707, + "mean_token_accuracy": 0.49724096059799194, + "num_tokens": 887526085.0, + "step": 1736 + }, + { + "epoch": 0.4697133585722012, + "grad_norm": 1.9357599020004272, + "learning_rate": 1.980810665048077e-05, + "loss": 2.1806, + "mean_token_accuracy": 0.5173506736755371, + "num_tokens": 888004814.0, + "step": 1737 + }, + { + "epoch": 0.46998377501352084, + "grad_norm": 1.6494450569152832, + "learning_rate": 1.9807782707537125e-05, + "loss": 2.1633, + "mean_token_accuracy": 0.5141805410385132, + "num_tokens": 888529071.0, + "step": 1738 + }, + { + "epoch": 0.47025419145484043, + "grad_norm": 1.689610242843628, + "learning_rate": 1.98074584943448e-05, + "loss": 2.4035, + "mean_token_accuracy": 0.49736711382865906, + "num_tokens": 889053269.0, + "step": 1739 + }, + { + "epoch": 0.4705246078961601, + "grad_norm": 1.7247893810272217, + "learning_rate": 1.9807134010913754e-05, + "loss": 2.1381, + "mean_token_accuracy": 0.5166451930999756, + "num_tokens": 889577394.0, + "step": 1740 + }, + { + "epoch": 0.4707950243374797, + "grad_norm": 1.12993586063385, + "learning_rate": 1.9806809257253934e-05, + "loss": 1.3224, + "mean_token_accuracy": 0.6471790075302124, + "num_tokens": 890101613.0, + "step": 1741 + }, + { + "epoch": 0.47106544077879936, + "grad_norm": 2.2014784812927246, + "learning_rate": 1.9806484233375307e-05, + "loss": 2.2393, + "mean_token_accuracy": 0.5078772306442261, + "num_tokens": 890625787.0, + "step": 1742 + }, + { + "epoch": 0.471335857220119, + "grad_norm": 1.945487380027771, + "learning_rate": 1.9806158939287844e-05, + "loss": 2.4719, + "mean_token_accuracy": 0.4838862419128418, + "num_tokens": 891138141.0, + "step": 1743 + }, + { + "epoch": 0.4716062736614386, + "grad_norm": 1.6601157188415527, + "learning_rate": 1.980583337500153e-05, + "loss": 2.3475, + "mean_token_accuracy": 0.4833526611328125, + "num_tokens": 891662412.0, + "step": 1744 + }, + { + "epoch": 0.47187669010275823, + "grad_norm": 1.569892168045044, + "learning_rate": 1.980550754052635e-05, + "loss": 2.2797, + "mean_token_accuracy": 0.5053259134292603, + "num_tokens": 892159381.0, + "step": 1745 + }, + { + "epoch": 0.4721471065440779, + "grad_norm": 2.0224416255950928, + "learning_rate": 1.9805181435872305e-05, + "loss": 2.1271, + "mean_token_accuracy": 0.5566741228103638, + "num_tokens": 892646855.0, + "step": 1746 + }, + { + "epoch": 0.4724175229853975, + "grad_norm": 1.8192272186279297, + "learning_rate": 1.9804855061049396e-05, + "loss": 2.4427, + "mean_token_accuracy": 0.4925841987133026, + "num_tokens": 893171064.0, + "step": 1747 + }, + { + "epoch": 0.47268793942671716, + "grad_norm": 1.5255050659179688, + "learning_rate": 1.980452841606765e-05, + "loss": 2.3535, + "mean_token_accuracy": 0.5089452266693115, + "num_tokens": 893625723.0, + "step": 1748 + }, + { + "epoch": 0.47295835586803675, + "grad_norm": 1.6570618152618408, + "learning_rate": 1.980420150093707e-05, + "loss": 2.3204, + "mean_token_accuracy": 0.5078034400939941, + "num_tokens": 894146303.0, + "step": 1749 + }, + { + "epoch": 0.4732287723093564, + "grad_norm": 1.9061113595962524, + "learning_rate": 1.9803874315667696e-05, + "loss": 2.3522, + "mean_token_accuracy": 0.5003045797348022, + "num_tokens": 894608531.0, + "step": 1750 + }, + { + "epoch": 0.47349918875067604, + "grad_norm": 1.6140810251235962, + "learning_rate": 1.9803546860269572e-05, + "loss": 2.3056, + "mean_token_accuracy": 0.4951857924461365, + "num_tokens": 895132787.0, + "step": 1751 + }, + { + "epoch": 0.4737696051919957, + "grad_norm": 1.5819600820541382, + "learning_rate": 1.980321913475274e-05, + "loss": 2.2412, + "mean_token_accuracy": 0.4893532693386078, + "num_tokens": 895656923.0, + "step": 1752 + }, + { + "epoch": 0.4740400216333153, + "grad_norm": 1.8409804105758667, + "learning_rate": 1.9802891139127254e-05, + "loss": 2.4134, + "mean_token_accuracy": 0.49737632274627686, + "num_tokens": 896181085.0, + "step": 1753 + }, + { + "epoch": 0.4743104380746349, + "grad_norm": 1.6936511993408203, + "learning_rate": 1.9802562873403182e-05, + "loss": 2.327, + "mean_token_accuracy": 0.4994058310985565, + "num_tokens": 896705320.0, + "step": 1754 + }, + { + "epoch": 0.47458085451595455, + "grad_norm": 1.453834056854248, + "learning_rate": 1.9802234337590597e-05, + "loss": 2.2342, + "mean_token_accuracy": 0.5266329646110535, + "num_tokens": 897229558.0, + "step": 1755 + }, + { + "epoch": 0.4748512709572742, + "grad_norm": 1.6278352737426758, + "learning_rate": 1.9801905531699574e-05, + "loss": 2.4603, + "mean_token_accuracy": 0.4951798617839813, + "num_tokens": 897730732.0, + "step": 1756 + }, + { + "epoch": 0.47512168739859384, + "grad_norm": 1.4929771423339844, + "learning_rate": 1.9801576455740204e-05, + "loss": 2.4154, + "mean_token_accuracy": 0.4935230612754822, + "num_tokens": 898254996.0, + "step": 1757 + }, + { + "epoch": 0.4753921038399135, + "grad_norm": 1.683853030204773, + "learning_rate": 1.9801247109722586e-05, + "loss": 2.3915, + "mean_token_accuracy": 0.4930976331233978, + "num_tokens": 898779202.0, + "step": 1758 + }, + { + "epoch": 0.4756625202812331, + "grad_norm": 1.3737208843231201, + "learning_rate": 1.980091749365682e-05, + "loss": 2.1943, + "mean_token_accuracy": 0.5215469598770142, + "num_tokens": 899303313.0, + "step": 1759 + }, + { + "epoch": 0.4759329367225527, + "grad_norm": 1.8367888927459717, + "learning_rate": 1.980058760755303e-05, + "loss": 2.3972, + "mean_token_accuracy": 0.491641640663147, + "num_tokens": 899827535.0, + "step": 1760 + }, + { + "epoch": 0.47620335316387236, + "grad_norm": 0.685337483882904, + "learning_rate": 1.9800257451421327e-05, + "loss": 1.2142, + "mean_token_accuracy": 0.6811874508857727, + "num_tokens": 900351767.0, + "step": 1761 + }, + { + "epoch": 0.476473769605192, + "grad_norm": 2.2962687015533447, + "learning_rate": 1.9799927025271845e-05, + "loss": 2.5033, + "mean_token_accuracy": 0.4920644164085388, + "num_tokens": 900782640.0, + "step": 1762 + }, + { + "epoch": 0.47674418604651164, + "grad_norm": 1.6561601161956787, + "learning_rate": 1.9799596329114725e-05, + "loss": 2.2779, + "mean_token_accuracy": 0.49914342164993286, + "num_tokens": 901306826.0, + "step": 1763 + }, + { + "epoch": 0.4770146024878313, + "grad_norm": 1.6443969011306763, + "learning_rate": 1.979926536296011e-05, + "loss": 2.3938, + "mean_token_accuracy": 0.49669766426086426, + "num_tokens": 901831069.0, + "step": 1764 + }, + { + "epoch": 0.4772850189291509, + "grad_norm": 1.738100528717041, + "learning_rate": 1.979893412681816e-05, + "loss": 2.5332, + "mean_token_accuracy": 0.479867160320282, + "num_tokens": 902317494.0, + "step": 1765 + }, + { + "epoch": 0.4775554353704705, + "grad_norm": 1.6106330156326294, + "learning_rate": 1.979860262069903e-05, + "loss": 2.3377, + "mean_token_accuracy": 0.4969741106033325, + "num_tokens": 902818017.0, + "step": 1766 + }, + { + "epoch": 0.47782585181179016, + "grad_norm": 1.5847584009170532, + "learning_rate": 1.97982708446129e-05, + "loss": 2.2825, + "mean_token_accuracy": 0.5084721446037292, + "num_tokens": 903342174.0, + "step": 1767 + }, + { + "epoch": 0.4780962682531098, + "grad_norm": 1.6559374332427979, + "learning_rate": 1.9797938798569943e-05, + "loss": 2.3336, + "mean_token_accuracy": 0.5133031606674194, + "num_tokens": 903866400.0, + "step": 1768 + }, + { + "epoch": 0.47836668469442944, + "grad_norm": 1.752196192741394, + "learning_rate": 1.9797606482580357e-05, + "loss": 2.3167, + "mean_token_accuracy": 0.504928469657898, + "num_tokens": 904390608.0, + "step": 1769 + }, + { + "epoch": 0.47863710113574903, + "grad_norm": 1.4507060050964355, + "learning_rate": 1.9797273896654327e-05, + "loss": 2.2043, + "mean_token_accuracy": 0.515440821647644, + "num_tokens": 904898025.0, + "step": 1770 + }, + { + "epoch": 0.4789075175770687, + "grad_norm": 1.5376465320587158, + "learning_rate": 1.9796941040802067e-05, + "loss": 2.2189, + "mean_token_accuracy": 0.5062826871871948, + "num_tokens": 905422191.0, + "step": 1771 + }, + { + "epoch": 0.4791779340183883, + "grad_norm": 1.4690489768981934, + "learning_rate": 1.9796607915033784e-05, + "loss": 2.3462, + "mean_token_accuracy": 0.4922754168510437, + "num_tokens": 905913738.0, + "step": 1772 + }, + { + "epoch": 0.47944835045970796, + "grad_norm": 1.8546056747436523, + "learning_rate": 1.9796274519359696e-05, + "loss": 2.3481, + "mean_token_accuracy": 0.4782950282096863, + "num_tokens": 906437878.0, + "step": 1773 + }, + { + "epoch": 0.4797187669010276, + "grad_norm": 1.8102624416351318, + "learning_rate": 1.9795940853790045e-05, + "loss": 2.3535, + "mean_token_accuracy": 0.4979785978794098, + "num_tokens": 906961995.0, + "step": 1774 + }, + { + "epoch": 0.4799891833423472, + "grad_norm": 1.408160924911499, + "learning_rate": 1.9795606918335056e-05, + "loss": 2.3406, + "mean_token_accuracy": 0.4868524670600891, + "num_tokens": 907486100.0, + "step": 1775 + }, + { + "epoch": 0.48025959978366684, + "grad_norm": 1.9421530961990356, + "learning_rate": 1.9795272713004986e-05, + "loss": 2.5225, + "mean_token_accuracy": 0.4678182005882263, + "num_tokens": 908010361.0, + "step": 1776 + }, + { + "epoch": 0.4805300162249865, + "grad_norm": 1.828474760055542, + "learning_rate": 1.979493823781008e-05, + "loss": 2.4608, + "mean_token_accuracy": 0.49081623554229736, + "num_tokens": 908534516.0, + "step": 1777 + }, + { + "epoch": 0.4808004326663061, + "grad_norm": 1.3760972023010254, + "learning_rate": 1.9794603492760604e-05, + "loss": 2.2724, + "mean_token_accuracy": 0.5040514469146729, + "num_tokens": 909058713.0, + "step": 1778 + }, + { + "epoch": 0.48107084910762576, + "grad_norm": 1.6014702320098877, + "learning_rate": 1.979426847786683e-05, + "loss": 2.4106, + "mean_token_accuracy": 0.468433678150177, + "num_tokens": 909582969.0, + "step": 1779 + }, + { + "epoch": 0.48134126554894535, + "grad_norm": 1.7552112340927124, + "learning_rate": 1.979393319313904e-05, + "loss": 2.3094, + "mean_token_accuracy": 0.4849015772342682, + "num_tokens": 910086296.0, + "step": 1780 + }, + { + "epoch": 0.481611681990265, + "grad_norm": 2.203630208969116, + "learning_rate": 1.9793597638587515e-05, + "loss": 1.2648, + "mean_token_accuracy": 0.6846590042114258, + "num_tokens": 910571969.0, + "step": 1781 + }, + { + "epoch": 0.48188209843158464, + "grad_norm": 3.05954909324646, + "learning_rate": 1.9793261814222556e-05, + "loss": 2.4854, + "mean_token_accuracy": 0.46640315651893616, + "num_tokens": 911096229.0, + "step": 1782 + }, + { + "epoch": 0.4821525148729043, + "grad_norm": 2.197732925415039, + "learning_rate": 1.979292572005446e-05, + "loss": 2.2156, + "mean_token_accuracy": 0.5253029465675354, + "num_tokens": 911561471.0, + "step": 1783 + }, + { + "epoch": 0.4824229313142239, + "grad_norm": 1.9219915866851807, + "learning_rate": 1.9792589356093553e-05, + "loss": 2.4931, + "mean_token_accuracy": 0.4791140854358673, + "num_tokens": 912041823.0, + "step": 1784 + }, + { + "epoch": 0.4826933477555435, + "grad_norm": 1.8437992334365845, + "learning_rate": 1.9792252722350144e-05, + "loss": 2.1765, + "mean_token_accuracy": 0.5173226594924927, + "num_tokens": 912565950.0, + "step": 1785 + }, + { + "epoch": 0.48296376419686315, + "grad_norm": 1.5299607515335083, + "learning_rate": 1.9791915818834564e-05, + "loss": 2.3399, + "mean_token_accuracy": 0.496563583612442, + "num_tokens": 913051358.0, + "step": 1786 + }, + { + "epoch": 0.4832341806381828, + "grad_norm": 1.665890097618103, + "learning_rate": 1.979157864555715e-05, + "loss": 2.2935, + "mean_token_accuracy": 0.5108053684234619, + "num_tokens": 913554945.0, + "step": 1787 + }, + { + "epoch": 0.48350459707950244, + "grad_norm": 1.8869757652282715, + "learning_rate": 1.979124120252825e-05, + "loss": 2.2735, + "mean_token_accuracy": 0.4991205334663391, + "num_tokens": 914079116.0, + "step": 1788 + }, + { + "epoch": 0.4837750135208221, + "grad_norm": 2.554299831390381, + "learning_rate": 1.979090348975822e-05, + "loss": 2.3898, + "mean_token_accuracy": 0.5013232231140137, + "num_tokens": 914603380.0, + "step": 1789 + }, + { + "epoch": 0.48404542996214167, + "grad_norm": 2.254486083984375, + "learning_rate": 1.9790565507257414e-05, + "loss": 2.4507, + "mean_token_accuracy": 0.47657907009124756, + "num_tokens": 915127630.0, + "step": 1790 + }, + { + "epoch": 0.4843158464034613, + "grad_norm": 1.6454342603683472, + "learning_rate": 1.979022725503621e-05, + "loss": 2.4551, + "mean_token_accuracy": 0.4831443428993225, + "num_tokens": 915651848.0, + "step": 1791 + }, + { + "epoch": 0.48458626284478096, + "grad_norm": 1.5333205461502075, + "learning_rate": 1.9789888733104982e-05, + "loss": 2.2499, + "mean_token_accuracy": 0.515984296798706, + "num_tokens": 916121677.0, + "step": 1792 + }, + { + "epoch": 0.4848566792861006, + "grad_norm": 1.6822574138641357, + "learning_rate": 1.9789549941474117e-05, + "loss": 2.4206, + "mean_token_accuracy": 0.4830409288406372, + "num_tokens": 916645852.0, + "step": 1793 + }, + { + "epoch": 0.48512709572742024, + "grad_norm": 1.5936121940612793, + "learning_rate": 1.9789210880154013e-05, + "loss": 2.3079, + "mean_token_accuracy": 0.48413076996803284, + "num_tokens": 917170092.0, + "step": 1794 + }, + { + "epoch": 0.4853975121687399, + "grad_norm": 5.065690517425537, + "learning_rate": 1.9788871549155072e-05, + "loss": 2.1262, + "mean_token_accuracy": 0.5415754318237305, + "num_tokens": 917694152.0, + "step": 1795 + }, + { + "epoch": 0.4856679286100595, + "grad_norm": 2.408201217651367, + "learning_rate": 1.9788531948487704e-05, + "loss": 2.279, + "mean_token_accuracy": 0.5177244544029236, + "num_tokens": 918218354.0, + "step": 1796 + }, + { + "epoch": 0.4859383450513791, + "grad_norm": 1.8444890975952148, + "learning_rate": 1.9788192078162333e-05, + "loss": 2.2872, + "mean_token_accuracy": 0.5048216581344604, + "num_tokens": 918728961.0, + "step": 1797 + }, + { + "epoch": 0.48620876149269876, + "grad_norm": 1.576583743095398, + "learning_rate": 1.9787851938189383e-05, + "loss": 2.0392, + "mean_token_accuracy": 0.5598021149635315, + "num_tokens": 919253140.0, + "step": 1798 + }, + { + "epoch": 0.4864791779340184, + "grad_norm": 2.2410717010498047, + "learning_rate": 1.978751152857929e-05, + "loss": 2.3815, + "mean_token_accuracy": 0.48758482933044434, + "num_tokens": 919777342.0, + "step": 1799 + }, + { + "epoch": 0.48674959437533805, + "grad_norm": 1.90419602394104, + "learning_rate": 1.9787170849342505e-05, + "loss": 2.284, + "mean_token_accuracy": 0.5190035104751587, + "num_tokens": 920301520.0, + "step": 1800 + }, + { + "epoch": 0.48702001081665763, + "grad_norm": 1.837210774421692, + "learning_rate": 1.9786829900489475e-05, + "loss": 1.3318, + "mean_token_accuracy": 0.6680508852005005, + "num_tokens": 920825804.0, + "step": 1801 + }, + { + "epoch": 0.4872904272579773, + "grad_norm": 3.4574246406555176, + "learning_rate": 1.9786488682030663e-05, + "loss": 2.3967, + "mean_token_accuracy": 0.49116331338882446, + "num_tokens": 921350041.0, + "step": 1802 + }, + { + "epoch": 0.4875608436992969, + "grad_norm": 2.644455909729004, + "learning_rate": 1.978614719397654e-05, + "loss": 2.3952, + "mean_token_accuracy": 0.5032081604003906, + "num_tokens": 921819708.0, + "step": 1803 + }, + { + "epoch": 0.48783126014061656, + "grad_norm": 1.9413917064666748, + "learning_rate": 1.978580543633758e-05, + "loss": 2.264, + "mean_token_accuracy": 0.5066073536872864, + "num_tokens": 922339084.0, + "step": 1804 + }, + { + "epoch": 0.4881016765819362, + "grad_norm": 2.230689287185669, + "learning_rate": 1.9785463409124274e-05, + "loss": 2.4261, + "mean_token_accuracy": 0.49041736125946045, + "num_tokens": 922863340.0, + "step": 1805 + }, + { + "epoch": 0.4883720930232558, + "grad_norm": 2.2385964393615723, + "learning_rate": 1.9785121112347115e-05, + "loss": 2.3434, + "mean_token_accuracy": 0.4762139618396759, + "num_tokens": 923387438.0, + "step": 1806 + }, + { + "epoch": 0.48864250946457544, + "grad_norm": 1.9064174890518188, + "learning_rate": 1.9784778546016605e-05, + "loss": 2.2124, + "mean_token_accuracy": 0.5215499401092529, + "num_tokens": 923911425.0, + "step": 1807 + }, + { + "epoch": 0.4889129259058951, + "grad_norm": 1.8957113027572632, + "learning_rate": 1.9784435710143254e-05, + "loss": 2.1299, + "mean_token_accuracy": 0.5551459789276123, + "num_tokens": 924435326.0, + "step": 1808 + }, + { + "epoch": 0.4891833423472147, + "grad_norm": 2.415924310684204, + "learning_rate": 1.978409260473758e-05, + "loss": 2.2438, + "mean_token_accuracy": 0.5221145749092102, + "num_tokens": 924951697.0, + "step": 1809 + }, + { + "epoch": 0.48945375878853437, + "grad_norm": 2.1211233139038086, + "learning_rate": 1.9783749229810117e-05, + "loss": 2.4056, + "mean_token_accuracy": 0.4930794835090637, + "num_tokens": 925475954.0, + "step": 1810 + }, + { + "epoch": 0.48972417522985395, + "grad_norm": 1.7492629289627075, + "learning_rate": 1.9783405585371393e-05, + "loss": 2.3073, + "mean_token_accuracy": 0.5051624774932861, + "num_tokens": 925951225.0, + "step": 1811 + }, + { + "epoch": 0.4899945916711736, + "grad_norm": 2.0918407440185547, + "learning_rate": 1.9783061671431955e-05, + "loss": 2.4517, + "mean_token_accuracy": 0.5074170231819153, + "num_tokens": 926423681.0, + "step": 1812 + }, + { + "epoch": 0.49026500811249324, + "grad_norm": 1.8831075429916382, + "learning_rate": 1.9782717488002355e-05, + "loss": 2.3934, + "mean_token_accuracy": 0.500749409198761, + "num_tokens": 926947909.0, + "step": 1813 + }, + { + "epoch": 0.4905354245538129, + "grad_norm": 2.5576956272125244, + "learning_rate": 1.978237303509316e-05, + "loss": 2.2989, + "mean_token_accuracy": 0.5131826996803284, + "num_tokens": 927472098.0, + "step": 1814 + }, + { + "epoch": 0.4908058409951325, + "grad_norm": 2.0532214641571045, + "learning_rate": 1.9782028312714926e-05, + "loss": 2.4153, + "mean_token_accuracy": 0.4872078001499176, + "num_tokens": 927973232.0, + "step": 1815 + }, + { + "epoch": 0.4910762574364521, + "grad_norm": 2.0229289531707764, + "learning_rate": 1.9781683320878243e-05, + "loss": 2.3166, + "mean_token_accuracy": 0.5143499374389648, + "num_tokens": 928497383.0, + "step": 1816 + }, + { + "epoch": 0.49134667387777176, + "grad_norm": 1.7068822383880615, + "learning_rate": 1.9781338059593684e-05, + "loss": 2.2642, + "mean_token_accuracy": 0.5011531114578247, + "num_tokens": 929021635.0, + "step": 1817 + }, + { + "epoch": 0.4916170903190914, + "grad_norm": 1.7208791971206665, + "learning_rate": 1.9780992528871855e-05, + "loss": 2.3213, + "mean_token_accuracy": 0.509723961353302, + "num_tokens": 929545815.0, + "step": 1818 + }, + { + "epoch": 0.49188750676041104, + "grad_norm": 3.432126045227051, + "learning_rate": 1.978064672872335e-05, + "loss": 2.2201, + "mean_token_accuracy": 0.5340398550033569, + "num_tokens": 930069884.0, + "step": 1819 + }, + { + "epoch": 0.4921579232017307, + "grad_norm": 2.3972482681274414, + "learning_rate": 1.978030065915878e-05, + "loss": 2.3363, + "mean_token_accuracy": 0.5128003358840942, + "num_tokens": 930593967.0, + "step": 1820 + }, + { + "epoch": 0.4924283396430503, + "grad_norm": 0.6600515842437744, + "learning_rate": 1.9779954320188765e-05, + "loss": 1.181, + "mean_token_accuracy": 0.7024133205413818, + "num_tokens": 931055281.0, + "step": 1821 + }, + { + "epoch": 0.4926987560843699, + "grad_norm": 2.7547607421875, + "learning_rate": 1.9779607711823933e-05, + "loss": 2.4035, + "mean_token_accuracy": 0.49428191781044006, + "num_tokens": 931579527.0, + "step": 1822 + }, + { + "epoch": 0.49296917252568956, + "grad_norm": 1.7145719528198242, + "learning_rate": 1.9779260834074915e-05, + "loss": 2.3613, + "mean_token_accuracy": 0.5046997666358948, + "num_tokens": 932077762.0, + "step": 1823 + }, + { + "epoch": 0.4932395889670092, + "grad_norm": 2.029191732406616, + "learning_rate": 1.9778913686952353e-05, + "loss": 2.3474, + "mean_token_accuracy": 0.5080069303512573, + "num_tokens": 932543726.0, + "step": 1824 + }, + { + "epoch": 0.49351000540832884, + "grad_norm": 2.016066789627075, + "learning_rate": 1.977856627046691e-05, + "loss": 2.3053, + "mean_token_accuracy": 0.5092359781265259, + "num_tokens": 933067985.0, + "step": 1825 + }, + { + "epoch": 0.49378042184964843, + "grad_norm": 1.6289188861846924, + "learning_rate": 1.9778218584629234e-05, + "loss": 2.3323, + "mean_token_accuracy": 0.4812999367713928, + "num_tokens": 933592169.0, + "step": 1826 + }, + { + "epoch": 0.4940508382909681, + "grad_norm": 2.5263147354125977, + "learning_rate": 1.9777870629449997e-05, + "loss": 2.4125, + "mean_token_accuracy": 0.49600374698638916, + "num_tokens": 934116345.0, + "step": 1827 + }, + { + "epoch": 0.4943212547322877, + "grad_norm": 2.2564549446105957, + "learning_rate": 1.9777522404939874e-05, + "loss": 2.2834, + "mean_token_accuracy": 0.49206480383872986, + "num_tokens": 934640623.0, + "step": 1828 + }, + { + "epoch": 0.49459167117360736, + "grad_norm": 2.0704102516174316, + "learning_rate": 1.9777173911109554e-05, + "loss": 2.3518, + "mean_token_accuracy": 0.463570237159729, + "num_tokens": 935164792.0, + "step": 1829 + }, + { + "epoch": 0.494862087614927, + "grad_norm": 2.1911861896514893, + "learning_rate": 1.977682514796972e-05, + "loss": 2.2345, + "mean_token_accuracy": 0.5166687965393066, + "num_tokens": 935689061.0, + "step": 1830 + }, + { + "epoch": 0.49513250405624665, + "grad_norm": 2.879399061203003, + "learning_rate": 1.9776476115531083e-05, + "loss": 2.4463, + "mean_token_accuracy": 0.5279144048690796, + "num_tokens": 936150897.0, + "step": 1831 + }, + { + "epoch": 0.49540292049756623, + "grad_norm": 1.8791364431381226, + "learning_rate": 1.977612681380435e-05, + "loss": 2.3401, + "mean_token_accuracy": 0.49986201524734497, + "num_tokens": 936675073.0, + "step": 1832 + }, + { + "epoch": 0.4956733369388859, + "grad_norm": 2.1979880332946777, + "learning_rate": 1.9775777242800238e-05, + "loss": 2.3133, + "mean_token_accuracy": 0.503791093826294, + "num_tokens": 937199284.0, + "step": 1833 + }, + { + "epoch": 0.4959437533802055, + "grad_norm": 2.3026251792907715, + "learning_rate": 1.977542740252947e-05, + "loss": 2.3291, + "mean_token_accuracy": 0.4992716610431671, + "num_tokens": 937723453.0, + "step": 1834 + }, + { + "epoch": 0.49621416982152516, + "grad_norm": 1.6644645929336548, + "learning_rate": 1.9775077293002785e-05, + "loss": 2.3023, + "mean_token_accuracy": 0.513201117515564, + "num_tokens": 938247649.0, + "step": 1835 + }, + { + "epoch": 0.4964845862628448, + "grad_norm": 1.8244550228118896, + "learning_rate": 1.977472691423092e-05, + "loss": 2.3036, + "mean_token_accuracy": 0.5071980953216553, + "num_tokens": 938771916.0, + "step": 1836 + }, + { + "epoch": 0.4967550027041644, + "grad_norm": 1.4275153875350952, + "learning_rate": 1.977437626622463e-05, + "loss": 2.3262, + "mean_token_accuracy": 0.4907095730304718, + "num_tokens": 939296197.0, + "step": 1837 + }, + { + "epoch": 0.49702541914548404, + "grad_norm": 1.8481613397598267, + "learning_rate": 1.9774025348994673e-05, + "loss": 2.4194, + "mean_token_accuracy": 0.5008498430252075, + "num_tokens": 939792392.0, + "step": 1838 + }, + { + "epoch": 0.4972958355868037, + "grad_norm": 1.440727949142456, + "learning_rate": 1.9773674162551815e-05, + "loss": 2.2564, + "mean_token_accuracy": 0.5073797702789307, + "num_tokens": 940316592.0, + "step": 1839 + }, + { + "epoch": 0.4975662520281233, + "grad_norm": 2.371264934539795, + "learning_rate": 1.9773322706906833e-05, + "loss": 2.3324, + "mean_token_accuracy": 0.527229905128479, + "num_tokens": 940829445.0, + "step": 1840 + }, + { + "epoch": 0.49783666846944297, + "grad_norm": 1.0305486917495728, + "learning_rate": 1.9772970982070504e-05, + "loss": 1.1898, + "mean_token_accuracy": 0.6836770176887512, + "num_tokens": 941294035.0, + "step": 1841 + }, + { + "epoch": 0.49810708491076255, + "grad_norm": 2.470059633255005, + "learning_rate": 1.977261898805363e-05, + "loss": 2.3653, + "mean_token_accuracy": 0.4839942753314972, + "num_tokens": 941817731.0, + "step": 1842 + }, + { + "epoch": 0.4983775013520822, + "grad_norm": 1.9247231483459473, + "learning_rate": 1.9772266724867012e-05, + "loss": 2.3048, + "mean_token_accuracy": 0.5039564371109009, + "num_tokens": 942325871.0, + "step": 1843 + }, + { + "epoch": 0.49864791779340184, + "grad_norm": 1.6508820056915283, + "learning_rate": 1.9771914192521446e-05, + "loss": 2.1775, + "mean_token_accuracy": 0.5192325115203857, + "num_tokens": 942813451.0, + "step": 1844 + }, + { + "epoch": 0.4989183342347215, + "grad_norm": 1.7131221294403076, + "learning_rate": 1.9771561391027758e-05, + "loss": 2.1851, + "mean_token_accuracy": 0.5405622720718384, + "num_tokens": 943337727.0, + "step": 1845 + }, + { + "epoch": 0.4991887506760411, + "grad_norm": 1.809905767440796, + "learning_rate": 1.977120832039677e-05, + "loss": 2.3277, + "mean_token_accuracy": 0.48398256301879883, + "num_tokens": 943861829.0, + "step": 1846 + }, + { + "epoch": 0.4994591671173607, + "grad_norm": 1.638169527053833, + "learning_rate": 1.977085498063932e-05, + "loss": 2.3285, + "mean_token_accuracy": 0.48454469442367554, + "num_tokens": 944386110.0, + "step": 1847 + }, + { + "epoch": 0.49972958355868036, + "grad_norm": 1.6664283275604248, + "learning_rate": 1.9770501371766243e-05, + "loss": 2.4054, + "mean_token_accuracy": 0.5075442790985107, + "num_tokens": 944884648.0, + "step": 1848 + }, + { + "epoch": 0.5, + "grad_norm": 1.6837011575698853, + "learning_rate": 1.977014749378839e-05, + "loss": 2.3939, + "mean_token_accuracy": 0.5088481903076172, + "num_tokens": 945373079.0, + "step": 1849 + }, + { + "epoch": 0.5002704164413196, + "grad_norm": 1.5540485382080078, + "learning_rate": 1.9769793346716622e-05, + "loss": 2.218, + "mean_token_accuracy": 0.4840265214443207, + "num_tokens": 945897166.0, + "step": 1850 + }, + { + "epoch": 0.5005408328826393, + "grad_norm": 2.0522549152374268, + "learning_rate": 1.976943893056181e-05, + "loss": 2.3323, + "mean_token_accuracy": 0.5030641555786133, + "num_tokens": 946421439.0, + "step": 1851 + }, + { + "epoch": 0.5008112493239589, + "grad_norm": 1.5615288019180298, + "learning_rate": 1.9769084245334816e-05, + "loss": 2.3925, + "mean_token_accuracy": 0.49304455518722534, + "num_tokens": 946921073.0, + "step": 1852 + }, + { + "epoch": 0.5010816657652786, + "grad_norm": 1.9145482778549194, + "learning_rate": 1.9768729291046526e-05, + "loss": 2.2084, + "mean_token_accuracy": 0.5133252143859863, + "num_tokens": 947445350.0, + "step": 1853 + }, + { + "epoch": 0.5013520822065982, + "grad_norm": 1.8909828662872314, + "learning_rate": 1.9768374067707837e-05, + "loss": 2.3765, + "mean_token_accuracy": 0.498935341835022, + "num_tokens": 947969549.0, + "step": 1854 + }, + { + "epoch": 0.5016224986479177, + "grad_norm": 1.5839184522628784, + "learning_rate": 1.9768018575329647e-05, + "loss": 2.3419, + "mean_token_accuracy": 0.49245205521583557, + "num_tokens": 948493723.0, + "step": 1855 + }, + { + "epoch": 0.5018929150892374, + "grad_norm": 1.5646048784255981, + "learning_rate": 1.9767662813922866e-05, + "loss": 2.5035, + "mean_token_accuracy": 0.47367170453071594, + "num_tokens": 949017999.0, + "step": 1856 + }, + { + "epoch": 0.502163331530557, + "grad_norm": 1.974769115447998, + "learning_rate": 1.9767306783498398e-05, + "loss": 2.3906, + "mean_token_accuracy": 0.47893720865249634, + "num_tokens": 949542239.0, + "step": 1857 + }, + { + "epoch": 0.5024337479718767, + "grad_norm": 2.198057174682617, + "learning_rate": 1.976695048406718e-05, + "loss": 2.4475, + "mean_token_accuracy": 0.4908677935600281, + "num_tokens": 950066385.0, + "step": 1858 + }, + { + "epoch": 0.5027041644131963, + "grad_norm": 1.7624768018722534, + "learning_rate": 1.9766593915640138e-05, + "loss": 2.4479, + "mean_token_accuracy": 0.4687836170196533, + "num_tokens": 950590462.0, + "step": 1859 + }, + { + "epoch": 0.502974580854516, + "grad_norm": 2.610370635986328, + "learning_rate": 1.976623707822821e-05, + "loss": 2.3805, + "mean_token_accuracy": 0.5060676336288452, + "num_tokens": 951064320.0, + "step": 1860 + }, + { + "epoch": 0.5032449972958356, + "grad_norm": 1.1181970834732056, + "learning_rate": 1.9765879971842353e-05, + "loss": 1.2928, + "mean_token_accuracy": 0.6610571146011353, + "num_tokens": 951588587.0, + "step": 1861 + }, + { + "epoch": 0.5035154137371552, + "grad_norm": 2.713813066482544, + "learning_rate": 1.976552259649352e-05, + "loss": 2.4074, + "mean_token_accuracy": 0.5055984258651733, + "num_tokens": 952112823.0, + "step": 1862 + }, + { + "epoch": 0.5037858301784749, + "grad_norm": 2.394524097442627, + "learning_rate": 1.9765164952192674e-05, + "loss": 2.4236, + "mean_token_accuracy": 0.4944789707660675, + "num_tokens": 952636935.0, + "step": 1863 + }, + { + "epoch": 0.5040562466197945, + "grad_norm": 1.7154291868209839, + "learning_rate": 1.9764807038950793e-05, + "loss": 2.2581, + "mean_token_accuracy": 0.5094190835952759, + "num_tokens": 953161104.0, + "step": 1864 + }, + { + "epoch": 0.5043266630611141, + "grad_norm": 2.065701723098755, + "learning_rate": 1.9764448856778855e-05, + "loss": 2.3576, + "mean_token_accuracy": 0.5065184831619263, + "num_tokens": 953624550.0, + "step": 1865 + }, + { + "epoch": 0.5045970795024337, + "grad_norm": 1.6353591680526733, + "learning_rate": 1.9764090405687852e-05, + "loss": 2.4415, + "mean_token_accuracy": 0.47708749771118164, + "num_tokens": 954148826.0, + "step": 1866 + }, + { + "epoch": 0.5048674959437534, + "grad_norm": 1.9507813453674316, + "learning_rate": 1.976373168568878e-05, + "loss": 2.2571, + "mean_token_accuracy": 0.4782401919364929, + "num_tokens": 954673088.0, + "step": 1867 + }, + { + "epoch": 0.505137912385073, + "grad_norm": 1.7680015563964844, + "learning_rate": 1.9763372696792648e-05, + "loss": 2.262, + "mean_token_accuracy": 0.508700966835022, + "num_tokens": 955197205.0, + "step": 1868 + }, + { + "epoch": 0.5054083288263926, + "grad_norm": 1.7121220827102661, + "learning_rate": 1.976301343901047e-05, + "loss": 2.1354, + "mean_token_accuracy": 0.5269330739974976, + "num_tokens": 955721328.0, + "step": 1869 + }, + { + "epoch": 0.5056787452677123, + "grad_norm": 1.603342890739441, + "learning_rate": 1.976265391235327e-05, + "loss": 2.2563, + "mean_token_accuracy": 0.4968118369579315, + "num_tokens": 956245546.0, + "step": 1870 + }, + { + "epoch": 0.5059491617090319, + "grad_norm": 1.9019050598144531, + "learning_rate": 1.976229411683208e-05, + "loss": 2.2319, + "mean_token_accuracy": 0.5060437321662903, + "num_tokens": 956691364.0, + "step": 1871 + }, + { + "epoch": 0.5062195781503516, + "grad_norm": 1.5087634325027466, + "learning_rate": 1.9761934052457937e-05, + "loss": 2.3774, + "mean_token_accuracy": 0.49781307578086853, + "num_tokens": 957166142.0, + "step": 1872 + }, + { + "epoch": 0.5064899945916712, + "grad_norm": 1.6266690492630005, + "learning_rate": 1.976157371924189e-05, + "loss": 2.2798, + "mean_token_accuracy": 0.5137122273445129, + "num_tokens": 957662170.0, + "step": 1873 + }, + { + "epoch": 0.5067604110329909, + "grad_norm": 1.6908061504364014, + "learning_rate": 1.9761213117194996e-05, + "loss": 2.2007, + "mean_token_accuracy": 0.5224575400352478, + "num_tokens": 958186208.0, + "step": 1874 + }, + { + "epoch": 0.5070308274743104, + "grad_norm": 1.3479856252670288, + "learning_rate": 1.9760852246328316e-05, + "loss": 2.3159, + "mean_token_accuracy": 0.49091851711273193, + "num_tokens": 958710416.0, + "step": 1875 + }, + { + "epoch": 0.50730124391563, + "grad_norm": 1.6050575971603394, + "learning_rate": 1.9760491106652927e-05, + "loss": 2.2102, + "mean_token_accuracy": 0.49698421359062195, + "num_tokens": 959230279.0, + "step": 1876 + }, + { + "epoch": 0.5075716603569497, + "grad_norm": 1.7740936279296875, + "learning_rate": 1.976012969817991e-05, + "loss": 2.524, + "mean_token_accuracy": 0.4794348478317261, + "num_tokens": 959754537.0, + "step": 1877 + }, + { + "epoch": 0.5078420767982693, + "grad_norm": 1.5053271055221558, + "learning_rate": 1.975976802092035e-05, + "loss": 2.0992, + "mean_token_accuracy": 0.5227400660514832, + "num_tokens": 960270560.0, + "step": 1878 + }, + { + "epoch": 0.508112493239589, + "grad_norm": 1.597364068031311, + "learning_rate": 1.975940607488535e-05, + "loss": 2.2584, + "mean_token_accuracy": 0.5087730288505554, + "num_tokens": 960756152.0, + "step": 1879 + }, + { + "epoch": 0.5083829096809086, + "grad_norm": 1.770432949066162, + "learning_rate": 1.9759043860086006e-05, + "loss": 2.3349, + "mean_token_accuracy": 0.5095980167388916, + "num_tokens": 961280325.0, + "step": 1880 + }, + { + "epoch": 0.5086533261222282, + "grad_norm": 1.4255518913269043, + "learning_rate": 1.975868137653344e-05, + "loss": 1.2456, + "mean_token_accuracy": 0.6780881881713867, + "num_tokens": 961804434.0, + "step": 1881 + }, + { + "epoch": 0.5089237425635479, + "grad_norm": 2.453429937362671, + "learning_rate": 1.9758318624238772e-05, + "loss": 2.2863, + "mean_token_accuracy": 0.5114002227783203, + "num_tokens": 962291583.0, + "step": 1882 + }, + { + "epoch": 0.5091941590048675, + "grad_norm": 2.4753541946411133, + "learning_rate": 1.975795560321313e-05, + "loss": 2.3957, + "mean_token_accuracy": 0.5005765557289124, + "num_tokens": 962756823.0, + "step": 1883 + }, + { + "epoch": 0.5094645754461872, + "grad_norm": 1.4882595539093018, + "learning_rate": 1.9757592313467656e-05, + "loss": 2.2855, + "mean_token_accuracy": 0.4955517053604126, + "num_tokens": 963281084.0, + "step": 1884 + }, + { + "epoch": 0.5097349918875067, + "grad_norm": 1.9123926162719727, + "learning_rate": 1.9757228755013495e-05, + "loss": 2.3589, + "mean_token_accuracy": 0.5159559845924377, + "num_tokens": 963805327.0, + "step": 1885 + }, + { + "epoch": 0.5100054083288263, + "grad_norm": 1.8054755926132202, + "learning_rate": 1.9756864927861804e-05, + "loss": 2.1416, + "mean_token_accuracy": 0.5263910293579102, + "num_tokens": 964319631.0, + "step": 1886 + }, + { + "epoch": 0.510275824770146, + "grad_norm": 1.8657907247543335, + "learning_rate": 1.9756500832023743e-05, + "loss": 2.3848, + "mean_token_accuracy": 0.50776207447052, + "num_tokens": 964798262.0, + "step": 1887 + }, + { + "epoch": 0.5105462412114656, + "grad_norm": 2.021942138671875, + "learning_rate": 1.9756136467510485e-05, + "loss": 2.3628, + "mean_token_accuracy": 0.4982050955295563, + "num_tokens": 965264098.0, + "step": 1888 + }, + { + "epoch": 0.5108166576527853, + "grad_norm": 1.9550217390060425, + "learning_rate": 1.975577183433321e-05, + "loss": 2.3399, + "mean_token_accuracy": 0.4956992268562317, + "num_tokens": 965788356.0, + "step": 1889 + }, + { + "epoch": 0.5110870740941049, + "grad_norm": 1.7367305755615234, + "learning_rate": 1.975540693250311e-05, + "loss": 2.3902, + "mean_token_accuracy": 0.4861968755722046, + "num_tokens": 966276058.0, + "step": 1890 + }, + { + "epoch": 0.5113574905354246, + "grad_norm": 2.0334787368774414, + "learning_rate": 1.9755041762031377e-05, + "loss": 2.2364, + "mean_token_accuracy": 0.5222094058990479, + "num_tokens": 966800196.0, + "step": 1891 + }, + { + "epoch": 0.5116279069767442, + "grad_norm": 1.7513408660888672, + "learning_rate": 1.9754676322929214e-05, + "loss": 2.171, + "mean_token_accuracy": 0.511504054069519, + "num_tokens": 967324368.0, + "step": 1892 + }, + { + "epoch": 0.5118983234180638, + "grad_norm": 2.1716079711914062, + "learning_rate": 1.975431061520783e-05, + "loss": 2.2675, + "mean_token_accuracy": 0.49886587262153625, + "num_tokens": 967808159.0, + "step": 1893 + }, + { + "epoch": 0.5121687398593835, + "grad_norm": 2.107969045639038, + "learning_rate": 1.975394463887846e-05, + "loss": 2.2734, + "mean_token_accuracy": 0.510108232498169, + "num_tokens": 968332369.0, + "step": 1894 + }, + { + "epoch": 0.5124391563007031, + "grad_norm": 1.7206780910491943, + "learning_rate": 1.9753578393952323e-05, + "loss": 2.3053, + "mean_token_accuracy": 0.5076207518577576, + "num_tokens": 968843392.0, + "step": 1895 + }, + { + "epoch": 0.5127095727420227, + "grad_norm": 2.0046067237854004, + "learning_rate": 1.9753211880440654e-05, + "loss": 2.2891, + "mean_token_accuracy": 0.5289430022239685, + "num_tokens": 969303900.0, + "step": 1896 + }, + { + "epoch": 0.5129799891833423, + "grad_norm": 1.6058969497680664, + "learning_rate": 1.9752845098354706e-05, + "loss": 2.1724, + "mean_token_accuracy": 0.5288940668106079, + "num_tokens": 969792807.0, + "step": 1897 + }, + { + "epoch": 0.513250405624662, + "grad_norm": 1.8923275470733643, + "learning_rate": 1.975247804770573e-05, + "loss": 2.2654, + "mean_token_accuracy": 0.5041218996047974, + "num_tokens": 970317088.0, + "step": 1898 + }, + { + "epoch": 0.5135208220659816, + "grad_norm": 1.8371447324752808, + "learning_rate": 1.975211072850499e-05, + "loss": 2.139, + "mean_token_accuracy": 0.5278885364532471, + "num_tokens": 970841194.0, + "step": 1899 + }, + { + "epoch": 0.5137912385073012, + "grad_norm": 1.5912606716156006, + "learning_rate": 1.975174314076375e-05, + "loss": 2.2688, + "mean_token_accuracy": 0.5023009181022644, + "num_tokens": 971354266.0, + "step": 1900 + }, + { + "epoch": 0.5140616549486209, + "grad_norm": 0.9299874305725098, + "learning_rate": 1.97513752844933e-05, + "loss": 1.3375, + "mean_token_accuracy": 0.6484772562980652, + "num_tokens": 971878486.0, + "step": 1901 + }, + { + "epoch": 0.5143320713899405, + "grad_norm": 2.6339571475982666, + "learning_rate": 1.975100715970491e-05, + "loss": 2.3236, + "mean_token_accuracy": 0.5009265542030334, + "num_tokens": 972402746.0, + "step": 1902 + }, + { + "epoch": 0.5146024878312602, + "grad_norm": 1.919602870941162, + "learning_rate": 1.9750638766409892e-05, + "loss": 2.3202, + "mean_token_accuracy": 0.5018705129623413, + "num_tokens": 972927008.0, + "step": 1903 + }, + { + "epoch": 0.5148729042725798, + "grad_norm": 1.7662029266357422, + "learning_rate": 1.975027010461954e-05, + "loss": 2.3921, + "mean_token_accuracy": 0.4916609227657318, + "num_tokens": 973451259.0, + "step": 1904 + }, + { + "epoch": 0.5151433207138995, + "grad_norm": 1.6921942234039307, + "learning_rate": 1.9749901174345173e-05, + "loss": 2.2209, + "mean_token_accuracy": 0.5149480104446411, + "num_tokens": 973975447.0, + "step": 1905 + }, + { + "epoch": 0.515413737155219, + "grad_norm": 1.9435232877731323, + "learning_rate": 1.9749531975598104e-05, + "loss": 2.2081, + "mean_token_accuracy": 0.5355610251426697, + "num_tokens": 974407491.0, + "step": 1906 + }, + { + "epoch": 0.5156841535965386, + "grad_norm": 1.3519535064697266, + "learning_rate": 1.974916250838966e-05, + "loss": 2.2508, + "mean_token_accuracy": 0.5146704912185669, + "num_tokens": 974931672.0, + "step": 1907 + }, + { + "epoch": 0.5159545700378583, + "grad_norm": 1.9341400861740112, + "learning_rate": 1.9748792772731184e-05, + "loss": 2.4439, + "mean_token_accuracy": 0.49382567405700684, + "num_tokens": 975455714.0, + "step": 1908 + }, + { + "epoch": 0.5162249864791779, + "grad_norm": 1.6027332544326782, + "learning_rate": 1.974842276863402e-05, + "loss": 2.2368, + "mean_token_accuracy": 0.5181111097335815, + "num_tokens": 975940936.0, + "step": 1909 + }, + { + "epoch": 0.5164954029204976, + "grad_norm": 1.920896291732788, + "learning_rate": 1.9748052496109515e-05, + "loss": 2.386, + "mean_token_accuracy": 0.5059363842010498, + "num_tokens": 976457376.0, + "step": 1910 + }, + { + "epoch": 0.5167658193618172, + "grad_norm": 1.7203319072723389, + "learning_rate": 1.9747681955169028e-05, + "loss": 2.2352, + "mean_token_accuracy": 0.5078334808349609, + "num_tokens": 976981493.0, + "step": 1911 + }, + { + "epoch": 0.5170362358031368, + "grad_norm": 1.3952891826629639, + "learning_rate": 1.974731114582394e-05, + "loss": 2.3298, + "mean_token_accuracy": 0.4968836307525635, + "num_tokens": 977412788.0, + "step": 1912 + }, + { + "epoch": 0.5173066522444565, + "grad_norm": 1.626107096672058, + "learning_rate": 1.974694006808562e-05, + "loss": 2.4435, + "mean_token_accuracy": 0.49639299511909485, + "num_tokens": 977877378.0, + "step": 1913 + }, + { + "epoch": 0.5175770686857761, + "grad_norm": 1.5671794414520264, + "learning_rate": 1.9746568721965454e-05, + "loss": 2.1486, + "mean_token_accuracy": 0.5231612324714661, + "num_tokens": 978401562.0, + "step": 1914 + }, + { + "epoch": 0.5178474851270958, + "grad_norm": 1.4214065074920654, + "learning_rate": 1.974619710747484e-05, + "loss": 2.1379, + "mean_token_accuracy": 0.5202237367630005, + "num_tokens": 978896858.0, + "step": 1915 + }, + { + "epoch": 0.5181179015684153, + "grad_norm": 2.23275089263916, + "learning_rate": 1.9745825224625174e-05, + "loss": 2.3329, + "mean_token_accuracy": 0.5000318288803101, + "num_tokens": 979421109.0, + "step": 1916 + }, + { + "epoch": 0.518388318009735, + "grad_norm": 1.6181811094284058, + "learning_rate": 1.9745453073427877e-05, + "loss": 2.2983, + "mean_token_accuracy": 0.5052741169929504, + "num_tokens": 979945336.0, + "step": 1917 + }, + { + "epoch": 0.5186587344510546, + "grad_norm": 1.4737135171890259, + "learning_rate": 1.974508065389436e-05, + "loss": 2.3591, + "mean_token_accuracy": 0.5122694969177246, + "num_tokens": 980467765.0, + "step": 1918 + }, + { + "epoch": 0.5189291508923742, + "grad_norm": 1.5456169843673706, + "learning_rate": 1.9744707966036045e-05, + "loss": 2.2798, + "mean_token_accuracy": 0.49086153507232666, + "num_tokens": 980992029.0, + "step": 1919 + }, + { + "epoch": 0.5191995673336939, + "grad_norm": 1.827057957649231, + "learning_rate": 1.974433500986438e-05, + "loss": 2.3547, + "mean_token_accuracy": 0.49171552062034607, + "num_tokens": 981516214.0, + "step": 1920 + }, + { + "epoch": 0.5194699837750135, + "grad_norm": 0.9809985160827637, + "learning_rate": 1.9743961785390795e-05, + "loss": 1.2052, + "mean_token_accuracy": 0.686648964881897, + "num_tokens": 982001758.0, + "step": 1921 + }, + { + "epoch": 0.5197404002163332, + "grad_norm": 2.2275474071502686, + "learning_rate": 1.9743588292626748e-05, + "loss": 2.4376, + "mean_token_accuracy": 0.4856700003147125, + "num_tokens": 982526035.0, + "step": 1922 + }, + { + "epoch": 0.5200108166576528, + "grad_norm": 1.8842222690582275, + "learning_rate": 1.9743214531583704e-05, + "loss": 2.3556, + "mean_token_accuracy": 0.4838998317718506, + "num_tokens": 983050258.0, + "step": 1923 + }, + { + "epoch": 0.5202812330989725, + "grad_norm": 1.7494258880615234, + "learning_rate": 1.9742840502273122e-05, + "loss": 2.4245, + "mean_token_accuracy": 0.5032804608345032, + "num_tokens": 983531882.0, + "step": 1924 + }, + { + "epoch": 0.5205516495402921, + "grad_norm": 1.6100713014602661, + "learning_rate": 1.9742466204706484e-05, + "loss": 2.3281, + "mean_token_accuracy": 0.5080649256706238, + "num_tokens": 984056162.0, + "step": 1925 + }, + { + "epoch": 0.5208220659816117, + "grad_norm": 1.4227479696273804, + "learning_rate": 1.9742091638895272e-05, + "loss": 2.311, + "mean_token_accuracy": 0.5071079730987549, + "num_tokens": 984580269.0, + "step": 1926 + }, + { + "epoch": 0.5210924824229313, + "grad_norm": 2.0267138481140137, + "learning_rate": 1.9741716804850982e-05, + "loss": 2.3555, + "mean_token_accuracy": 0.5102236866950989, + "num_tokens": 985099272.0, + "step": 1927 + }, + { + "epoch": 0.5213628988642509, + "grad_norm": 1.8023635149002075, + "learning_rate": 1.9741341702585114e-05, + "loss": 2.3757, + "mean_token_accuracy": 0.49668532609939575, + "num_tokens": 985623548.0, + "step": 1928 + }, + { + "epoch": 0.5216333153055706, + "grad_norm": 2.0218801498413086, + "learning_rate": 1.9740966332109174e-05, + "loss": 2.4167, + "mean_token_accuracy": 0.5075358748435974, + "num_tokens": 986104577.0, + "step": 1929 + }, + { + "epoch": 0.5219037317468902, + "grad_norm": 1.4884190559387207, + "learning_rate": 1.9740590693434677e-05, + "loss": 2.3933, + "mean_token_accuracy": 0.49810653924942017, + "num_tokens": 986628764.0, + "step": 1930 + }, + { + "epoch": 0.5221741481882098, + "grad_norm": 1.5371507406234741, + "learning_rate": 1.974021478657316e-05, + "loss": 2.3245, + "mean_token_accuracy": 0.5060381293296814, + "num_tokens": 987152946.0, + "step": 1931 + }, + { + "epoch": 0.5224445646295295, + "grad_norm": 1.7981019020080566, + "learning_rate": 1.9739838611536145e-05, + "loss": 2.2397, + "mean_token_accuracy": 0.516100287437439, + "num_tokens": 987677109.0, + "step": 1932 + }, + { + "epoch": 0.5227149810708491, + "grad_norm": 1.7203702926635742, + "learning_rate": 1.9739462168335183e-05, + "loss": 2.2512, + "mean_token_accuracy": 0.49731913208961487, + "num_tokens": 988192745.0, + "step": 1933 + }, + { + "epoch": 0.5229853975121688, + "grad_norm": 1.9294413328170776, + "learning_rate": 1.9739085456981818e-05, + "loss": 2.4644, + "mean_token_accuracy": 0.48939669132232666, + "num_tokens": 988717014.0, + "step": 1934 + }, + { + "epoch": 0.5232558139534884, + "grad_norm": 1.5595183372497559, + "learning_rate": 1.973870847748761e-05, + "loss": 2.3694, + "mean_token_accuracy": 0.5042881965637207, + "num_tokens": 989241275.0, + "step": 1935 + }, + { + "epoch": 0.5235262303948081, + "grad_norm": 1.8738101720809937, + "learning_rate": 1.973833122986413e-05, + "loss": 2.2365, + "mean_token_accuracy": 0.5232430100440979, + "num_tokens": 989765510.0, + "step": 1936 + }, + { + "epoch": 0.5237966468361276, + "grad_norm": 1.837235927581787, + "learning_rate": 1.9737953714122952e-05, + "loss": 2.2991, + "mean_token_accuracy": 0.5125941038131714, + "num_tokens": 990229945.0, + "step": 1937 + }, + { + "epoch": 0.5240670632774472, + "grad_norm": 1.6365007162094116, + "learning_rate": 1.9737575930275653e-05, + "loss": 2.212, + "mean_token_accuracy": 0.5145873427391052, + "num_tokens": 990679532.0, + "step": 1938 + }, + { + "epoch": 0.5243374797187669, + "grad_norm": 1.7889295816421509, + "learning_rate": 1.973719787833383e-05, + "loss": 2.1072, + "mean_token_accuracy": 0.5590732097625732, + "num_tokens": 991203741.0, + "step": 1939 + }, + { + "epoch": 0.5246078961600865, + "grad_norm": 1.4904990196228027, + "learning_rate": 1.9736819558309085e-05, + "loss": 2.249, + "mean_token_accuracy": 0.5113467574119568, + "num_tokens": 991728007.0, + "step": 1940 + }, + { + "epoch": 0.5248783126014062, + "grad_norm": 0.8384918570518494, + "learning_rate": 1.973644097021302e-05, + "loss": 1.2274, + "mean_token_accuracy": 0.6806516647338867, + "num_tokens": 992211303.0, + "step": 1941 + }, + { + "epoch": 0.5251487290427258, + "grad_norm": 2.862474203109741, + "learning_rate": 1.9736062114057254e-05, + "loss": 2.4129, + "mean_token_accuracy": 0.5020352602005005, + "num_tokens": 992698092.0, + "step": 1942 + }, + { + "epoch": 0.5254191454840454, + "grad_norm": 1.7029123306274414, + "learning_rate": 1.9735682989853408e-05, + "loss": 2.4121, + "mean_token_accuracy": 0.4732550084590912, + "num_tokens": 993222299.0, + "step": 1943 + }, + { + "epoch": 0.5256895619253651, + "grad_norm": 1.8272695541381836, + "learning_rate": 1.9735303597613124e-05, + "loss": 2.281, + "mean_token_accuracy": 0.5232187509536743, + "num_tokens": 993746475.0, + "step": 1944 + }, + { + "epoch": 0.5259599783666847, + "grad_norm": 2.056770086288452, + "learning_rate": 1.973492393734803e-05, + "loss": 2.2733, + "mean_token_accuracy": 0.5264080762863159, + "num_tokens": 994228756.0, + "step": 1945 + }, + { + "epoch": 0.5262303948080044, + "grad_norm": 1.7350016832351685, + "learning_rate": 1.9734544009069787e-05, + "loss": 2.2064, + "mean_token_accuracy": 0.5197269916534424, + "num_tokens": 994753037.0, + "step": 1946 + }, + { + "epoch": 0.5265008112493239, + "grad_norm": 1.7131288051605225, + "learning_rate": 1.9734163812790045e-05, + "loss": 2.3616, + "mean_token_accuracy": 0.5054427981376648, + "num_tokens": 995277298.0, + "step": 1947 + }, + { + "epoch": 0.5267712276906436, + "grad_norm": 2.7082860469818115, + "learning_rate": 1.9733783348520472e-05, + "loss": 2.1328, + "mean_token_accuracy": 0.5526278018951416, + "num_tokens": 995801555.0, + "step": 1948 + }, + { + "epoch": 0.5270416441319632, + "grad_norm": 1.8942526578903198, + "learning_rate": 1.9733402616272745e-05, + "loss": 2.3663, + "mean_token_accuracy": 0.5029809474945068, + "num_tokens": 996325802.0, + "step": 1949 + }, + { + "epoch": 0.5273120605732828, + "grad_norm": 2.5640485286712646, + "learning_rate": 1.9733021616058538e-05, + "loss": 1.969, + "mean_token_accuracy": 0.5677615404129028, + "num_tokens": 996814884.0, + "step": 1950 + }, + { + "epoch": 0.5275824770146025, + "grad_norm": 1.8488924503326416, + "learning_rate": 1.9732640347889548e-05, + "loss": 2.3029, + "mean_token_accuracy": 0.5118147134780884, + "num_tokens": 997338962.0, + "step": 1951 + }, + { + "epoch": 0.5278528934559221, + "grad_norm": 1.5151417255401611, + "learning_rate": 1.973225881177747e-05, + "loss": 2.2012, + "mean_token_accuracy": 0.5060885548591614, + "num_tokens": 997814649.0, + "step": 1952 + }, + { + "epoch": 0.5281233098972418, + "grad_norm": 1.5428192615509033, + "learning_rate": 1.973187700773401e-05, + "loss": 2.3791, + "mean_token_accuracy": 0.5098286867141724, + "num_tokens": 998338900.0, + "step": 1953 + }, + { + "epoch": 0.5283937263385614, + "grad_norm": 1.5167510509490967, + "learning_rate": 1.9731494935770886e-05, + "loss": 2.2844, + "mean_token_accuracy": 0.5029087066650391, + "num_tokens": 998863175.0, + "step": 1954 + }, + { + "epoch": 0.528664142779881, + "grad_norm": 1.8957817554473877, + "learning_rate": 1.9731112595899822e-05, + "loss": 2.3764, + "mean_token_accuracy": 0.5176033973693848, + "num_tokens": 999358590.0, + "step": 1955 + }, + { + "epoch": 0.5289345592212007, + "grad_norm": 1.6355366706848145, + "learning_rate": 1.9730729988132543e-05, + "loss": 2.2904, + "mean_token_accuracy": 0.4870922565460205, + "num_tokens": 999882856.0, + "step": 1956 + }, + { + "epoch": 0.5292049756625202, + "grad_norm": 1.9067821502685547, + "learning_rate": 1.97303471124808e-05, + "loss": 2.3539, + "mean_token_accuracy": 0.503097653388977, + "num_tokens": 1000394987.0, + "step": 1957 + }, + { + "epoch": 0.5294753921038399, + "grad_norm": 1.7122771739959717, + "learning_rate": 1.9729963968956324e-05, + "loss": 2.4136, + "mean_token_accuracy": 0.4908657670021057, + "num_tokens": 1000919219.0, + "step": 1958 + }, + { + "epoch": 0.5297458085451595, + "grad_norm": 1.6743245124816895, + "learning_rate": 1.9729580557570886e-05, + "loss": 2.2963, + "mean_token_accuracy": 0.5026180744171143, + "num_tokens": 1001377901.0, + "step": 1959 + }, + { + "epoch": 0.5300162249864792, + "grad_norm": 1.5784825086593628, + "learning_rate": 1.9729196878336242e-05, + "loss": 2.2354, + "mean_token_accuracy": 0.5040435791015625, + "num_tokens": 1001866488.0, + "step": 1960 + }, + { + "epoch": 0.5302866414277988, + "grad_norm": 1.2300268411636353, + "learning_rate": 1.9728812931264166e-05, + "loss": 1.2786, + "mean_token_accuracy": 0.6635280847549438, + "num_tokens": 1002376160.0, + "step": 1961 + }, + { + "epoch": 0.5305570578691184, + "grad_norm": 2.6707918643951416, + "learning_rate": 1.9728428716366442e-05, + "loss": 2.4126, + "mean_token_accuracy": 0.5127331018447876, + "num_tokens": 1002835610.0, + "step": 1962 + }, + { + "epoch": 0.5308274743104381, + "grad_norm": 1.7632564306259155, + "learning_rate": 1.9728044233654857e-05, + "loss": 2.0657, + "mean_token_accuracy": 0.5237548351287842, + "num_tokens": 1003340500.0, + "step": 1963 + }, + { + "epoch": 0.5310978907517577, + "grad_norm": 1.5229588747024536, + "learning_rate": 1.9727659483141205e-05, + "loss": 2.3085, + "mean_token_accuracy": 0.5048524737358093, + "num_tokens": 1003864631.0, + "step": 1964 + }, + { + "epoch": 0.5313683071930774, + "grad_norm": 2.468439817428589, + "learning_rate": 1.9727274464837298e-05, + "loss": 2.2567, + "mean_token_accuracy": 0.522781252861023, + "num_tokens": 1004388560.0, + "step": 1965 + }, + { + "epoch": 0.531638723634397, + "grad_norm": 1.9450652599334717, + "learning_rate": 1.972688917875494e-05, + "loss": 2.2253, + "mean_token_accuracy": 0.5149626731872559, + "num_tokens": 1004867680.0, + "step": 1966 + }, + { + "epoch": 0.5319091400757167, + "grad_norm": 2.524869203567505, + "learning_rate": 1.972650362490596e-05, + "loss": 2.0118, + "mean_token_accuracy": 0.558415412902832, + "num_tokens": 1005391858.0, + "step": 1967 + }, + { + "epoch": 0.5321795565170362, + "grad_norm": 1.9115461111068726, + "learning_rate": 1.972611780330219e-05, + "loss": 2.2889, + "mean_token_accuracy": 0.5113301277160645, + "num_tokens": 1005897359.0, + "step": 1968 + }, + { + "epoch": 0.5324499729583558, + "grad_norm": 2.452528476715088, + "learning_rate": 1.9725731713955453e-05, + "loss": 2.0116, + "mean_token_accuracy": 0.5548890829086304, + "num_tokens": 1006421542.0, + "step": 1969 + }, + { + "epoch": 0.5327203893996755, + "grad_norm": 2.169543981552124, + "learning_rate": 1.9725345356877616e-05, + "loss": 2.2261, + "mean_token_accuracy": 0.5307556390762329, + "num_tokens": 1006914167.0, + "step": 1970 + }, + { + "epoch": 0.5329908058409951, + "grad_norm": 1.478385090827942, + "learning_rate": 1.972495873208052e-05, + "loss": 2.249, + "mean_token_accuracy": 0.5177117586135864, + "num_tokens": 1007438382.0, + "step": 1971 + }, + { + "epoch": 0.5332612222823148, + "grad_norm": 1.950005292892456, + "learning_rate": 1.9724571839576033e-05, + "loss": 2.4442, + "mean_token_accuracy": 0.4973737597465515, + "num_tokens": 1007936088.0, + "step": 1972 + }, + { + "epoch": 0.5335316387236344, + "grad_norm": 1.7087925672531128, + "learning_rate": 1.9724184679376022e-05, + "loss": 2.3579, + "mean_token_accuracy": 0.4952956438064575, + "num_tokens": 1008411873.0, + "step": 1973 + }, + { + "epoch": 0.533802055164954, + "grad_norm": 1.6398299932479858, + "learning_rate": 1.972379725149237e-05, + "loss": 2.0996, + "mean_token_accuracy": 0.5225857496261597, + "num_tokens": 1008936078.0, + "step": 1974 + }, + { + "epoch": 0.5340724716062737, + "grad_norm": 1.360260009765625, + "learning_rate": 1.9723409555936966e-05, + "loss": 2.3657, + "mean_token_accuracy": 0.49230337142944336, + "num_tokens": 1009460153.0, + "step": 1975 + }, + { + "epoch": 0.5343428880475933, + "grad_norm": 1.3868721723556519, + "learning_rate": 1.97230215927217e-05, + "loss": 2.2701, + "mean_token_accuracy": 0.5175325274467468, + "num_tokens": 1009984403.0, + "step": 1976 + }, + { + "epoch": 0.534613304488913, + "grad_norm": 1.783341407775879, + "learning_rate": 1.9722633361858482e-05, + "loss": 2.3503, + "mean_token_accuracy": 0.4892035126686096, + "num_tokens": 1010461386.0, + "step": 1977 + }, + { + "epoch": 0.5348837209302325, + "grad_norm": 1.948641300201416, + "learning_rate": 1.9722244863359215e-05, + "loss": 2.2594, + "mean_token_accuracy": 0.5228583812713623, + "num_tokens": 1010954224.0, + "step": 1978 + }, + { + "epoch": 0.5351541373715522, + "grad_norm": 1.7488675117492676, + "learning_rate": 1.9721856097235833e-05, + "loss": 2.2252, + "mean_token_accuracy": 0.5084273219108582, + "num_tokens": 1011416103.0, + "step": 1979 + }, + { + "epoch": 0.5354245538128718, + "grad_norm": 1.3690714836120605, + "learning_rate": 1.972146706350025e-05, + "loss": 2.2627, + "mean_token_accuracy": 0.49872025847435, + "num_tokens": 1011910167.0, + "step": 1980 + }, + { + "epoch": 0.5356949702541914, + "grad_norm": 0.9939333200454712, + "learning_rate": 1.9721077762164414e-05, + "loss": 1.2456, + "mean_token_accuracy": 0.6818758845329285, + "num_tokens": 1012434277.0, + "step": 1981 + }, + { + "epoch": 0.5359653866955111, + "grad_norm": 3.788395881652832, + "learning_rate": 1.9720688193240267e-05, + "loss": 2.4815, + "mean_token_accuracy": 0.5084508657455444, + "num_tokens": 1012917857.0, + "step": 1982 + }, + { + "epoch": 0.5362358031368307, + "grad_norm": 3.606877565383911, + "learning_rate": 1.972029835673975e-05, + "loss": 2.2884, + "mean_token_accuracy": 0.4923136532306671, + "num_tokens": 1013442117.0, + "step": 1983 + }, + { + "epoch": 0.5365062195781504, + "grad_norm": 1.5824965238571167, + "learning_rate": 1.9719908252674844e-05, + "loss": 2.3839, + "mean_token_accuracy": 0.5064527988433838, + "num_tokens": 1013918253.0, + "step": 1984 + }, + { + "epoch": 0.53677663601947, + "grad_norm": 2.7381350994110107, + "learning_rate": 1.9719517881057505e-05, + "loss": 2.4334, + "mean_token_accuracy": 0.49875038862228394, + "num_tokens": 1014442535.0, + "step": 1985 + }, + { + "epoch": 0.5370470524607897, + "grad_norm": 2.618152379989624, + "learning_rate": 1.9719127241899717e-05, + "loss": 2.3564, + "mean_token_accuracy": 0.5055476427078247, + "num_tokens": 1014966760.0, + "step": 1986 + }, + { + "epoch": 0.5373174689021093, + "grad_norm": 1.7572071552276611, + "learning_rate": 1.9718736335213465e-05, + "loss": 2.3362, + "mean_token_accuracy": 0.4931119978427887, + "num_tokens": 1015491034.0, + "step": 1987 + }, + { + "epoch": 0.5375878853434288, + "grad_norm": 1.8465924263000488, + "learning_rate": 1.9718345161010738e-05, + "loss": 1.9907, + "mean_token_accuracy": 0.5329225063323975, + "num_tokens": 1016015228.0, + "step": 1988 + }, + { + "epoch": 0.5378583017847485, + "grad_norm": 1.7258713245391846, + "learning_rate": 1.9717953719303545e-05, + "loss": 2.2695, + "mean_token_accuracy": 0.5040136575698853, + "num_tokens": 1016539498.0, + "step": 1989 + }, + { + "epoch": 0.5381287182260681, + "grad_norm": 1.5240534543991089, + "learning_rate": 1.9717562010103895e-05, + "loss": 2.2654, + "mean_token_accuracy": 0.48753225803375244, + "num_tokens": 1017063693.0, + "step": 1990 + }, + { + "epoch": 0.5383991346673878, + "grad_norm": 1.7512967586517334, + "learning_rate": 1.9717170033423808e-05, + "loss": 2.3403, + "mean_token_accuracy": 0.4981710910797119, + "num_tokens": 1017587970.0, + "step": 1991 + }, + { + "epoch": 0.5386695511087074, + "grad_norm": 1.7141011953353882, + "learning_rate": 1.9716777789275303e-05, + "loss": 2.3075, + "mean_token_accuracy": 0.5084019899368286, + "num_tokens": 1018082926.0, + "step": 1992 + }, + { + "epoch": 0.538939967550027, + "grad_norm": 1.8027046918869019, + "learning_rate": 1.9716385277670423e-05, + "loss": 2.3177, + "mean_token_accuracy": 0.4933556914329529, + "num_tokens": 1018607196.0, + "step": 1993 + }, + { + "epoch": 0.5392103839913467, + "grad_norm": 1.5415611267089844, + "learning_rate": 1.9715992498621212e-05, + "loss": 2.3819, + "mean_token_accuracy": 0.504332423210144, + "num_tokens": 1019071311.0, + "step": 1994 + }, + { + "epoch": 0.5394808004326663, + "grad_norm": 2.0143253803253174, + "learning_rate": 1.9715599452139715e-05, + "loss": 2.3935, + "mean_token_accuracy": 0.5074319243431091, + "num_tokens": 1019532982.0, + "step": 1995 + }, + { + "epoch": 0.539751216873986, + "grad_norm": 1.4318293333053589, + "learning_rate": 1.9715206138238e-05, + "loss": 2.2966, + "mean_token_accuracy": 0.5036816000938416, + "num_tokens": 1020057235.0, + "step": 1996 + }, + { + "epoch": 0.5400216333153056, + "grad_norm": 2.1122000217437744, + "learning_rate": 1.971481255692813e-05, + "loss": 2.4244, + "mean_token_accuracy": 0.4912240207195282, + "num_tokens": 1020581348.0, + "step": 1997 + }, + { + "epoch": 0.5402920497566251, + "grad_norm": 1.6840527057647705, + "learning_rate": 1.9714418708222182e-05, + "loss": 2.4469, + "mean_token_accuracy": 0.47507938742637634, + "num_tokens": 1021105634.0, + "step": 1998 + }, + { + "epoch": 0.5405624661979448, + "grad_norm": 1.1847703456878662, + "learning_rate": 1.971402459213224e-05, + "loss": 2.3049, + "mean_token_accuracy": 0.5216946005821228, + "num_tokens": 1021622425.0, + "step": 1999 + }, + { + "epoch": 0.5408328826392644, + "grad_norm": 1.7557858228683472, + "learning_rate": 1.9713630208670398e-05, + "loss": 2.2879, + "mean_token_accuracy": 0.5233116149902344, + "num_tokens": 1022137491.0, + "step": 2000 + }, + { + "epoch": 0.5411032990805841, + "grad_norm": 0.8658999800682068, + "learning_rate": 1.9713235557848752e-05, + "loss": 1.0317, + "mean_token_accuracy": 0.7279574871063232, + "num_tokens": 1022661755.0, + "step": 2001 + }, + { + "epoch": 0.5413737155219037, + "grad_norm": 1.9577497243881226, + "learning_rate": 1.971284063967942e-05, + "loss": 2.3216, + "mean_token_accuracy": 0.5074508786201477, + "num_tokens": 1023172336.0, + "step": 2002 + }, + { + "epoch": 0.5416441319632234, + "grad_norm": 1.5313575267791748, + "learning_rate": 1.971244545417451e-05, + "loss": 2.3589, + "mean_token_accuracy": 0.4768039882183075, + "num_tokens": 1023696517.0, + "step": 2003 + }, + { + "epoch": 0.541914548404543, + "grad_norm": 1.308815598487854, + "learning_rate": 1.9712050001346156e-05, + "loss": 2.087, + "mean_token_accuracy": 0.5301229357719421, + "num_tokens": 1024220795.0, + "step": 2004 + }, + { + "epoch": 0.5421849648458626, + "grad_norm": 1.5986685752868652, + "learning_rate": 1.9711654281206486e-05, + "loss": 2.3138, + "mean_token_accuracy": 0.5098903179168701, + "num_tokens": 1024686002.0, + "step": 2005 + }, + { + "epoch": 0.5424553812871823, + "grad_norm": 1.5521219968795776, + "learning_rate": 1.9711258293767645e-05, + "loss": 2.3496, + "mean_token_accuracy": 0.4946816563606262, + "num_tokens": 1025210234.0, + "step": 2006 + }, + { + "epoch": 0.5427257977285019, + "grad_norm": 1.474871039390564, + "learning_rate": 1.971086203904178e-05, + "loss": 2.1093, + "mean_token_accuracy": 0.5144054889678955, + "num_tokens": 1025734425.0, + "step": 2007 + }, + { + "epoch": 0.5429962141698216, + "grad_norm": 1.6241910457611084, + "learning_rate": 1.971046551704105e-05, + "loss": 2.2224, + "mean_token_accuracy": 0.526797354221344, + "num_tokens": 1026258599.0, + "step": 2008 + }, + { + "epoch": 0.5432666306111411, + "grad_norm": 1.6834527254104614, + "learning_rate": 1.971006872777762e-05, + "loss": 2.3053, + "mean_token_accuracy": 0.4917161464691162, + "num_tokens": 1026782714.0, + "step": 2009 + }, + { + "epoch": 0.5435370470524608, + "grad_norm": 1.561569333076477, + "learning_rate": 1.970967167126367e-05, + "loss": 2.2272, + "mean_token_accuracy": 0.5138920545578003, + "num_tokens": 1027306941.0, + "step": 2010 + }, + { + "epoch": 0.5438074634937804, + "grad_norm": 2.020332098007202, + "learning_rate": 1.970927434751138e-05, + "loss": 2.1548, + "mean_token_accuracy": 0.5282331705093384, + "num_tokens": 1027831082.0, + "step": 2011 + }, + { + "epoch": 0.5440778799351, + "grad_norm": 1.5457375049591064, + "learning_rate": 1.970887675653294e-05, + "loss": 2.2195, + "mean_token_accuracy": 0.5036712288856506, + "num_tokens": 1028355368.0, + "step": 2012 + }, + { + "epoch": 0.5443482963764197, + "grad_norm": 1.4078248739242554, + "learning_rate": 1.970847889834055e-05, + "loss": 2.1369, + "mean_token_accuracy": 0.5201356410980225, + "num_tokens": 1028879415.0, + "step": 2013 + }, + { + "epoch": 0.5446187128177393, + "grad_norm": 1.2600655555725098, + "learning_rate": 1.9708080772946418e-05, + "loss": 2.1602, + "mean_token_accuracy": 0.516918420791626, + "num_tokens": 1029403646.0, + "step": 2014 + }, + { + "epoch": 0.544889129259059, + "grad_norm": 1.784969687461853, + "learning_rate": 1.970768238036276e-05, + "loss": 2.3161, + "mean_token_accuracy": 0.5045362114906311, + "num_tokens": 1029882812.0, + "step": 2015 + }, + { + "epoch": 0.5451595457003786, + "grad_norm": 1.4952908754348755, + "learning_rate": 1.9707283720601794e-05, + "loss": 2.1745, + "mean_token_accuracy": 0.5027347803115845, + "num_tokens": 1030406926.0, + "step": 2016 + }, + { + "epoch": 0.5454299621416983, + "grad_norm": 2.0165488719940186, + "learning_rate": 1.970688479367576e-05, + "loss": 2.4195, + "mean_token_accuracy": 0.47100168466567993, + "num_tokens": 1030931201.0, + "step": 2017 + }, + { + "epoch": 0.5457003785830179, + "grad_norm": 1.4002586603164673, + "learning_rate": 1.9706485599596897e-05, + "loss": 2.2417, + "mean_token_accuracy": 0.5134713649749756, + "num_tokens": 1031406749.0, + "step": 2018 + }, + { + "epoch": 0.5459707950243374, + "grad_norm": 1.4724891185760498, + "learning_rate": 1.970608613837745e-05, + "loss": 2.3093, + "mean_token_accuracy": 0.5019861459732056, + "num_tokens": 1031930986.0, + "step": 2019 + }, + { + "epoch": 0.5462412114656571, + "grad_norm": 1.6731503009796143, + "learning_rate": 1.9705686410029675e-05, + "loss": 2.288, + "mean_token_accuracy": 0.5071094036102295, + "num_tokens": 1032455225.0, + "step": 2020 + }, + { + "epoch": 0.5465116279069767, + "grad_norm": 0.8678670525550842, + "learning_rate": 1.9705286414565844e-05, + "loss": 1.1999, + "mean_token_accuracy": 0.6819097995758057, + "num_tokens": 1032979444.0, + "step": 2021 + }, + { + "epoch": 0.5467820443482964, + "grad_norm": 2.63677978515625, + "learning_rate": 1.9704886151998223e-05, + "loss": 2.3494, + "mean_token_accuracy": 0.49854177236557007, + "num_tokens": 1033503638.0, + "step": 2022 + }, + { + "epoch": 0.547052460789616, + "grad_norm": 1.9269686937332153, + "learning_rate": 1.9704485622339094e-05, + "loss": 2.3347, + "mean_token_accuracy": 0.49439579248428345, + "num_tokens": 1034027702.0, + "step": 2023 + }, + { + "epoch": 0.5473228772309356, + "grad_norm": 1.8008450269699097, + "learning_rate": 1.970408482560075e-05, + "loss": 2.2292, + "mean_token_accuracy": 0.5127885341644287, + "num_tokens": 1034551959.0, + "step": 2024 + }, + { + "epoch": 0.5475932936722553, + "grad_norm": 2.1606764793395996, + "learning_rate": 1.9703683761795486e-05, + "loss": 2.1718, + "mean_token_accuracy": 0.5188115239143372, + "num_tokens": 1035076085.0, + "step": 2025 + }, + { + "epoch": 0.5478637101135749, + "grad_norm": 1.585146427154541, + "learning_rate": 1.9703282430935607e-05, + "loss": 2.213, + "mean_token_accuracy": 0.5098675489425659, + "num_tokens": 1035555927.0, + "step": 2026 + }, + { + "epoch": 0.5481341265548946, + "grad_norm": 2.487144708633423, + "learning_rate": 1.970288083303343e-05, + "loss": 2.3001, + "mean_token_accuracy": 0.5231319665908813, + "num_tokens": 1036021227.0, + "step": 2027 + }, + { + "epoch": 0.5484045429962142, + "grad_norm": 1.8174313306808472, + "learning_rate": 1.9702478968101277e-05, + "loss": 2.2906, + "mean_token_accuracy": 0.5270401835441589, + "num_tokens": 1036525535.0, + "step": 2028 + }, + { + "epoch": 0.5486749594375337, + "grad_norm": 2.6366708278656006, + "learning_rate": 1.9702076836151474e-05, + "loss": 2.3647, + "mean_token_accuracy": 0.5055136680603027, + "num_tokens": 1037049812.0, + "step": 2029 + }, + { + "epoch": 0.5489453758788534, + "grad_norm": 2.1022257804870605, + "learning_rate": 1.9701674437196364e-05, + "loss": 2.362, + "mean_token_accuracy": 0.5001716613769531, + "num_tokens": 1037529737.0, + "step": 2030 + }, + { + "epoch": 0.549215792320173, + "grad_norm": 1.5117701292037964, + "learning_rate": 1.970127177124829e-05, + "loss": 2.2485, + "mean_token_accuracy": 0.50840824842453, + "num_tokens": 1038053860.0, + "step": 2031 + }, + { + "epoch": 0.5494862087614927, + "grad_norm": 1.8812642097473145, + "learning_rate": 1.9700868838319613e-05, + "loss": 2.334, + "mean_token_accuracy": 0.5103249549865723, + "num_tokens": 1038533895.0, + "step": 2032 + }, + { + "epoch": 0.5497566252028123, + "grad_norm": 1.4784289598464966, + "learning_rate": 1.9700465638422688e-05, + "loss": 2.3016, + "mean_token_accuracy": 0.49678927659988403, + "num_tokens": 1039058054.0, + "step": 2033 + }, + { + "epoch": 0.550027041644132, + "grad_norm": 1.7108922004699707, + "learning_rate": 1.9700062171569895e-05, + "loss": 2.4258, + "mean_token_accuracy": 0.5046001672744751, + "num_tokens": 1039540522.0, + "step": 2034 + }, + { + "epoch": 0.5502974580854516, + "grad_norm": 1.8099020719528198, + "learning_rate": 1.9699658437773606e-05, + "loss": 2.2747, + "mean_token_accuracy": 0.5228456854820251, + "num_tokens": 1040064707.0, + "step": 2035 + }, + { + "epoch": 0.5505678745267713, + "grad_norm": 1.5289957523345947, + "learning_rate": 1.9699254437046213e-05, + "loss": 2.3035, + "mean_token_accuracy": 0.5107333660125732, + "num_tokens": 1040548978.0, + "step": 2036 + }, + { + "epoch": 0.5508382909680909, + "grad_norm": 1.3758465051651, + "learning_rate": 1.9698850169400113e-05, + "loss": 2.1487, + "mean_token_accuracy": 0.4897778034210205, + "num_tokens": 1041073021.0, + "step": 2037 + }, + { + "epoch": 0.5511087074094105, + "grad_norm": 2.200507164001465, + "learning_rate": 1.969844563484771e-05, + "loss": 2.3683, + "mean_token_accuracy": 0.5025473237037659, + "num_tokens": 1041597280.0, + "step": 2038 + }, + { + "epoch": 0.5513791238507302, + "grad_norm": 1.665338397026062, + "learning_rate": 1.9698040833401412e-05, + "loss": 2.2912, + "mean_token_accuracy": 0.5159207582473755, + "num_tokens": 1042121463.0, + "step": 2039 + }, + { + "epoch": 0.5516495402920497, + "grad_norm": 1.42149019241333, + "learning_rate": 1.9697635765073638e-05, + "loss": 2.2332, + "mean_token_accuracy": 0.49893468618392944, + "num_tokens": 1042645696.0, + "step": 2040 + }, + { + "epoch": 0.5519199567333694, + "grad_norm": 1.0263020992279053, + "learning_rate": 1.9697230429876824e-05, + "loss": 1.1954, + "mean_token_accuracy": 0.6914616823196411, + "num_tokens": 1043111089.0, + "step": 2041 + }, + { + "epoch": 0.552190373174689, + "grad_norm": 3.120260238647461, + "learning_rate": 1.9696824827823402e-05, + "loss": 2.4647, + "mean_token_accuracy": 0.48671281337738037, + "num_tokens": 1043600533.0, + "step": 2042 + }, + { + "epoch": 0.5524607896160086, + "grad_norm": 2.369670867919922, + "learning_rate": 1.969641895892582e-05, + "loss": 2.3003, + "mean_token_accuracy": 0.5043464303016663, + "num_tokens": 1044124817.0, + "step": 2043 + }, + { + "epoch": 0.5527312060573283, + "grad_norm": 1.6023679971694946, + "learning_rate": 1.969601282319653e-05, + "loss": 2.3069, + "mean_token_accuracy": 0.501675009727478, + "num_tokens": 1044648945.0, + "step": 2044 + }, + { + "epoch": 0.5530016224986479, + "grad_norm": 1.909761905670166, + "learning_rate": 1.9695606420647995e-05, + "loss": 2.2445, + "mean_token_accuracy": 0.5118668079376221, + "num_tokens": 1045173188.0, + "step": 2045 + }, + { + "epoch": 0.5532720389399676, + "grad_norm": 2.0429999828338623, + "learning_rate": 1.969519975129268e-05, + "loss": 2.3134, + "mean_token_accuracy": 0.49242666363716125, + "num_tokens": 1045697278.0, + "step": 2046 + }, + { + "epoch": 0.5535424553812872, + "grad_norm": 1.6408300399780273, + "learning_rate": 1.9694792815143066e-05, + "loss": 2.3389, + "mean_token_accuracy": 0.5140761137008667, + "num_tokens": 1046174902.0, + "step": 2047 + }, + { + "epoch": 0.5538128718226069, + "grad_norm": 1.7425336837768555, + "learning_rate": 1.969438561221164e-05, + "loss": 2.2499, + "mean_token_accuracy": 0.5197476148605347, + "num_tokens": 1046641395.0, + "step": 2048 + }, + { + "epoch": 0.5540832882639265, + "grad_norm": 1.7731462717056274, + "learning_rate": 1.9693978142510896e-05, + "loss": 2.3674, + "mean_token_accuracy": 0.4930301010608673, + "num_tokens": 1047129089.0, + "step": 2049 + }, + { + "epoch": 0.554353704705246, + "grad_norm": 1.3906296491622925, + "learning_rate": 1.969357040605333e-05, + "loss": 2.1934, + "mean_token_accuracy": 0.525855541229248, + "num_tokens": 1047641419.0, + "step": 2050 + }, + { + "epoch": 0.5546241211465657, + "grad_norm": 1.9279206991195679, + "learning_rate": 1.9693162402851464e-05, + "loss": 2.3068, + "mean_token_accuracy": 0.47525402903556824, + "num_tokens": 1048165634.0, + "step": 2051 + }, + { + "epoch": 0.5548945375878853, + "grad_norm": 1.6003390550613403, + "learning_rate": 1.9692754132917806e-05, + "loss": 2.2937, + "mean_token_accuracy": 0.4930960536003113, + "num_tokens": 1048689895.0, + "step": 2052 + }, + { + "epoch": 0.555164954029205, + "grad_norm": 1.5820280313491821, + "learning_rate": 1.9692345596264894e-05, + "loss": 2.2889, + "mean_token_accuracy": 0.5104414224624634, + "num_tokens": 1049214178.0, + "step": 2053 + }, + { + "epoch": 0.5554353704705246, + "grad_norm": 1.9074257612228394, + "learning_rate": 1.969193679290525e-05, + "loss": 2.2238, + "mean_token_accuracy": 0.4971383512020111, + "num_tokens": 1049738297.0, + "step": 2054 + }, + { + "epoch": 0.5557057869118442, + "grad_norm": 1.7220146656036377, + "learning_rate": 1.9691527722851425e-05, + "loss": 2.3111, + "mean_token_accuracy": 0.5044753551483154, + "num_tokens": 1050262492.0, + "step": 2055 + }, + { + "epoch": 0.5559762033531639, + "grad_norm": 1.8459467887878418, + "learning_rate": 1.9691118386115972e-05, + "loss": 2.2741, + "mean_token_accuracy": 0.5178616046905518, + "num_tokens": 1050786776.0, + "step": 2056 + }, + { + "epoch": 0.5562466197944835, + "grad_norm": 1.694343090057373, + "learning_rate": 1.9690708782711447e-05, + "loss": 2.2288, + "mean_token_accuracy": 0.521848738193512, + "num_tokens": 1051310959.0, + "step": 2057 + }, + { + "epoch": 0.5565170362358032, + "grad_norm": 1.7817654609680176, + "learning_rate": 1.9690298912650416e-05, + "loss": 2.2763, + "mean_token_accuracy": 0.5387350916862488, + "num_tokens": 1051781050.0, + "step": 2058 + }, + { + "epoch": 0.5567874526771228, + "grad_norm": 2.106281042098999, + "learning_rate": 1.968988877594546e-05, + "loss": 2.2156, + "mean_token_accuracy": 0.5205810070037842, + "num_tokens": 1052305261.0, + "step": 2059 + }, + { + "epoch": 0.5570578691184424, + "grad_norm": 1.8348737955093384, + "learning_rate": 1.968947837260916e-05, + "loss": 2.2949, + "mean_token_accuracy": 0.5088858604431152, + "num_tokens": 1052829456.0, + "step": 2060 + }, + { + "epoch": 0.557328285559762, + "grad_norm": 0.9696272015571594, + "learning_rate": 1.9689067702654115e-05, + "loss": 1.234, + "mean_token_accuracy": 0.6678909659385681, + "num_tokens": 1053353707.0, + "step": 2061 + }, + { + "epoch": 0.5575987020010816, + "grad_norm": 3.38748836517334, + "learning_rate": 1.9688656766092915e-05, + "loss": 2.244, + "mean_token_accuracy": 0.5085951089859009, + "num_tokens": 1053877992.0, + "step": 2062 + }, + { + "epoch": 0.5578691184424013, + "grad_norm": 3.281298875808716, + "learning_rate": 1.968824556293817e-05, + "loss": 2.4829, + "mean_token_accuracy": 0.4743748903274536, + "num_tokens": 1054385114.0, + "step": 2063 + }, + { + "epoch": 0.5581395348837209, + "grad_norm": 1.826261281967163, + "learning_rate": 1.9687834093202506e-05, + "loss": 2.2393, + "mean_token_accuracy": 0.5278415083885193, + "num_tokens": 1054909272.0, + "step": 2064 + }, + { + "epoch": 0.5584099513250406, + "grad_norm": 2.6022794246673584, + "learning_rate": 1.9687422356898543e-05, + "loss": 2.3505, + "mean_token_accuracy": 0.4952983558177948, + "num_tokens": 1055405392.0, + "step": 2065 + }, + { + "epoch": 0.5586803677663602, + "grad_norm": 2.190427303314209, + "learning_rate": 1.968701035403891e-05, + "loss": 2.2667, + "mean_token_accuracy": 0.5151470899581909, + "num_tokens": 1055882768.0, + "step": 2066 + }, + { + "epoch": 0.5589507842076799, + "grad_norm": 2.363513946533203, + "learning_rate": 1.968659808463626e-05, + "loss": 2.0272, + "mean_token_accuracy": 0.5573693513870239, + "num_tokens": 1056406951.0, + "step": 2067 + }, + { + "epoch": 0.5592212006489995, + "grad_norm": 2.6525473594665527, + "learning_rate": 1.968618554870323e-05, + "loss": 2.3778, + "mean_token_accuracy": 0.48045721650123596, + "num_tokens": 1056930959.0, + "step": 2068 + }, + { + "epoch": 0.5594916170903191, + "grad_norm": 2.1060030460357666, + "learning_rate": 1.9685772746252483e-05, + "loss": 2.3246, + "mean_token_accuracy": 0.519382119178772, + "num_tokens": 1057455160.0, + "step": 2069 + }, + { + "epoch": 0.5597620335316387, + "grad_norm": 2.103400707244873, + "learning_rate": 1.9685359677296683e-05, + "loss": 2.2769, + "mean_token_accuracy": 0.5137689709663391, + "num_tokens": 1057936383.0, + "step": 2070 + }, + { + "epoch": 0.5600324499729583, + "grad_norm": 2.3962438106536865, + "learning_rate": 1.968494634184851e-05, + "loss": 2.3654, + "mean_token_accuracy": 0.5121307373046875, + "num_tokens": 1058415005.0, + "step": 2071 + }, + { + "epoch": 0.560302866414278, + "grad_norm": 1.8067667484283447, + "learning_rate": 1.968453273992064e-05, + "loss": 2.3297, + "mean_token_accuracy": 0.5052896738052368, + "num_tokens": 1058913740.0, + "step": 2072 + }, + { + "epoch": 0.5605732828555976, + "grad_norm": 2.479827880859375, + "learning_rate": 1.9684118871525765e-05, + "loss": 2.3472, + "mean_token_accuracy": 0.5147069096565247, + "num_tokens": 1059411238.0, + "step": 2073 + }, + { + "epoch": 0.5608436992969172, + "grad_norm": 1.9600703716278076, + "learning_rate": 1.968370473667659e-05, + "loss": 2.2399, + "mean_token_accuracy": 0.518043577671051, + "num_tokens": 1059875345.0, + "step": 2074 + }, + { + "epoch": 0.5611141157382369, + "grad_norm": 2.0782551765441895, + "learning_rate": 1.968329033538581e-05, + "loss": 2.2972, + "mean_token_accuracy": 0.5046862363815308, + "num_tokens": 1060399559.0, + "step": 2075 + }, + { + "epoch": 0.5613845321795565, + "grad_norm": 1.9010310173034668, + "learning_rate": 1.968287566766615e-05, + "loss": 2.2669, + "mean_token_accuracy": 0.5122942924499512, + "num_tokens": 1060923791.0, + "step": 2076 + }, + { + "epoch": 0.5616549486208762, + "grad_norm": 1.5560901165008545, + "learning_rate": 1.9682460733530333e-05, + "loss": 2.1787, + "mean_token_accuracy": 0.5364677906036377, + "num_tokens": 1061447956.0, + "step": 2077 + }, + { + "epoch": 0.5619253650621958, + "grad_norm": 2.2712252140045166, + "learning_rate": 1.9682045532991083e-05, + "loss": 2.2541, + "mean_token_accuracy": 0.4906080961227417, + "num_tokens": 1061972228.0, + "step": 2078 + }, + { + "epoch": 0.5621957815035155, + "grad_norm": 2.1706011295318604, + "learning_rate": 1.9681630066061143e-05, + "loss": 2.3916, + "mean_token_accuracy": 0.503969132900238, + "num_tokens": 1062496399.0, + "step": 2079 + }, + { + "epoch": 0.5624661979448351, + "grad_norm": 1.832349181175232, + "learning_rate": 1.9681214332753263e-05, + "loss": 2.2135, + "mean_token_accuracy": 0.5171631574630737, + "num_tokens": 1063020624.0, + "step": 2080 + }, + { + "epoch": 0.5627366143861546, + "grad_norm": 1.305415153503418, + "learning_rate": 1.9680798333080197e-05, + "loss": 1.2134, + "mean_token_accuracy": 0.6761956214904785, + "num_tokens": 1063544821.0, + "step": 2081 + }, + { + "epoch": 0.5630070308274743, + "grad_norm": 2.686492443084717, + "learning_rate": 1.9680382067054712e-05, + "loss": 2.2613, + "mean_token_accuracy": 0.49984011054039, + "num_tokens": 1064069081.0, + "step": 2082 + }, + { + "epoch": 0.5632774472687939, + "grad_norm": 2.0260465145111084, + "learning_rate": 1.9679965534689577e-05, + "loss": 2.3298, + "mean_token_accuracy": 0.483124703168869, + "num_tokens": 1064593182.0, + "step": 2083 + }, + { + "epoch": 0.5635478637101136, + "grad_norm": 1.631663203239441, + "learning_rate": 1.9679548735997568e-05, + "loss": 2.3289, + "mean_token_accuracy": 0.5075573921203613, + "num_tokens": 1065091812.0, + "step": 2084 + }, + { + "epoch": 0.5638182801514332, + "grad_norm": 2.12703013420105, + "learning_rate": 1.967913167099149e-05, + "loss": 2.2281, + "mean_token_accuracy": 0.5193358659744263, + "num_tokens": 1065581943.0, + "step": 2085 + }, + { + "epoch": 0.5640886965927528, + "grad_norm": 1.7443827390670776, + "learning_rate": 1.967871433968412e-05, + "loss": 2.3187, + "mean_token_accuracy": 0.5033982396125793, + "num_tokens": 1066106170.0, + "step": 2086 + }, + { + "epoch": 0.5643591130340725, + "grad_norm": 1.885342001914978, + "learning_rate": 1.9678296742088272e-05, + "loss": 2.2722, + "mean_token_accuracy": 0.5115677714347839, + "num_tokens": 1066630379.0, + "step": 2087 + }, + { + "epoch": 0.5646295294753921, + "grad_norm": 2.1617984771728516, + "learning_rate": 1.967787887821676e-05, + "loss": 2.1646, + "mean_token_accuracy": 0.5140810608863831, + "num_tokens": 1067154654.0, + "step": 2088 + }, + { + "epoch": 0.5648999459167118, + "grad_norm": 1.474157452583313, + "learning_rate": 1.9677460748082406e-05, + "loss": 2.2119, + "mean_token_accuracy": 0.5170090198516846, + "num_tokens": 1067678808.0, + "step": 2089 + }, + { + "epoch": 0.5651703623580314, + "grad_norm": 2.193380117416382, + "learning_rate": 1.9677042351698034e-05, + "loss": 2.2935, + "mean_token_accuracy": 0.5218793153762817, + "num_tokens": 1068148463.0, + "step": 2090 + }, + { + "epoch": 0.565440778799351, + "grad_norm": 1.8766447305679321, + "learning_rate": 1.967662368907649e-05, + "loss": 2.4185, + "mean_token_accuracy": 0.5019004940986633, + "num_tokens": 1068654536.0, + "step": 2091 + }, + { + "epoch": 0.5657111952406706, + "grad_norm": 1.5320138931274414, + "learning_rate": 1.9676204760230607e-05, + "loss": 2.4054, + "mean_token_accuracy": 0.4913554787635803, + "num_tokens": 1069154724.0, + "step": 2092 + }, + { + "epoch": 0.5659816116819902, + "grad_norm": 1.7552729845046997, + "learning_rate": 1.9675785565173252e-05, + "loss": 2.2512, + "mean_token_accuracy": 0.5094258785247803, + "num_tokens": 1069624897.0, + "step": 2093 + }, + { + "epoch": 0.5662520281233099, + "grad_norm": 1.789415717124939, + "learning_rate": 1.9675366103917283e-05, + "loss": 2.2735, + "mean_token_accuracy": 0.4955705404281616, + "num_tokens": 1070149143.0, + "step": 2094 + }, + { + "epoch": 0.5665224445646295, + "grad_norm": 1.7764321565628052, + "learning_rate": 1.967494637647557e-05, + "loss": 2.189, + "mean_token_accuracy": 0.5276869535446167, + "num_tokens": 1070673212.0, + "step": 2095 + }, + { + "epoch": 0.5667928610059492, + "grad_norm": 1.6569082736968994, + "learning_rate": 1.9674526382860987e-05, + "loss": 2.0285, + "mean_token_accuracy": 0.5435498952865601, + "num_tokens": 1071179900.0, + "step": 2096 + }, + { + "epoch": 0.5670632774472688, + "grad_norm": 1.6637197732925415, + "learning_rate": 1.967410612308643e-05, + "loss": 2.2566, + "mean_token_accuracy": 0.48862212896347046, + "num_tokens": 1071703980.0, + "step": 2097 + }, + { + "epoch": 0.5673336938885885, + "grad_norm": 1.7443300485610962, + "learning_rate": 1.9673685597164784e-05, + "loss": 2.2433, + "mean_token_accuracy": 0.5186089277267456, + "num_tokens": 1072166138.0, + "step": 2098 + }, + { + "epoch": 0.5676041103299081, + "grad_norm": 1.6139616966247559, + "learning_rate": 1.9673264805108955e-05, + "loss": 2.2012, + "mean_token_accuracy": 0.5195215940475464, + "num_tokens": 1072690412.0, + "step": 2099 + }, + { + "epoch": 0.5678745267712277, + "grad_norm": 1.5971523523330688, + "learning_rate": 1.9672843746931857e-05, + "loss": 2.0585, + "mean_token_accuracy": 0.5290032029151917, + "num_tokens": 1073166986.0, + "step": 2100 + }, + { + "epoch": 0.5681449432125473, + "grad_norm": 0.9451289176940918, + "learning_rate": 1.967242242264641e-05, + "loss": 1.2272, + "mean_token_accuracy": 0.6806760430335999, + "num_tokens": 1073653102.0, + "step": 2101 + }, + { + "epoch": 0.5684153596538669, + "grad_norm": 3.341834783554077, + "learning_rate": 1.9672000832265535e-05, + "loss": 2.2501, + "mean_token_accuracy": 0.5153402090072632, + "num_tokens": 1074152792.0, + "step": 2102 + }, + { + "epoch": 0.5686857760951866, + "grad_norm": 3.036970376968384, + "learning_rate": 1.9671578975802175e-05, + "loss": 2.363, + "mean_token_accuracy": 0.5017765760421753, + "num_tokens": 1074676970.0, + "step": 2103 + }, + { + "epoch": 0.5689561925365062, + "grad_norm": 1.9661649465560913, + "learning_rate": 1.967115685326927e-05, + "loss": 2.3801, + "mean_token_accuracy": 0.49037522077560425, + "num_tokens": 1075201189.0, + "step": 2104 + }, + { + "epoch": 0.5692266089778258, + "grad_norm": 3.1596813201904297, + "learning_rate": 1.9670734464679772e-05, + "loss": 2.1477, + "mean_token_accuracy": 0.5190043449401855, + "num_tokens": 1075725465.0, + "step": 2105 + }, + { + "epoch": 0.5694970254191455, + "grad_norm": 2.8280587196350098, + "learning_rate": 1.967031181004664e-05, + "loss": 2.2959, + "mean_token_accuracy": 0.5103446841239929, + "num_tokens": 1076249599.0, + "step": 2106 + }, + { + "epoch": 0.5697674418604651, + "grad_norm": 2.4092555046081543, + "learning_rate": 1.9669888889382846e-05, + "loss": 2.3468, + "mean_token_accuracy": 0.5054019689559937, + "num_tokens": 1076773737.0, + "step": 2107 + }, + { + "epoch": 0.5700378583017848, + "grad_norm": 2.2461557388305664, + "learning_rate": 1.9669465702701365e-05, + "loss": 2.2512, + "mean_token_accuracy": 0.5063674449920654, + "num_tokens": 1077286053.0, + "step": 2108 + }, + { + "epoch": 0.5703082747431044, + "grad_norm": 2.1220462322235107, + "learning_rate": 1.966904225001518e-05, + "loss": 2.2093, + "mean_token_accuracy": 0.5127145051956177, + "num_tokens": 1077810264.0, + "step": 2109 + }, + { + "epoch": 0.5705786911844241, + "grad_norm": 1.889175295829773, + "learning_rate": 1.9668618531337286e-05, + "loss": 2.3586, + "mean_token_accuracy": 0.5059614181518555, + "num_tokens": 1078334438.0, + "step": 2110 + }, + { + "epoch": 0.5708491076257436, + "grad_norm": 2.3676962852478027, + "learning_rate": 1.9668194546680678e-05, + "loss": 2.2961, + "mean_token_accuracy": 0.5181446671485901, + "num_tokens": 1078812288.0, + "step": 2111 + }, + { + "epoch": 0.5711195240670632, + "grad_norm": 1.8458811044692993, + "learning_rate": 1.9667770296058374e-05, + "loss": 2.3787, + "mean_token_accuracy": 0.5003724098205566, + "num_tokens": 1079336415.0, + "step": 2112 + }, + { + "epoch": 0.5713899405083829, + "grad_norm": 2.0076937675476074, + "learning_rate": 1.9667345779483388e-05, + "loss": 2.355, + "mean_token_accuracy": 0.5011672973632812, + "num_tokens": 1079838808.0, + "step": 2113 + }, + { + "epoch": 0.5716603569497025, + "grad_norm": 1.782142996788025, + "learning_rate": 1.966692099696874e-05, + "loss": 2.2715, + "mean_token_accuracy": 0.5269554853439331, + "num_tokens": 1080362899.0, + "step": 2114 + }, + { + "epoch": 0.5719307733910222, + "grad_norm": 2.540471076965332, + "learning_rate": 1.9666495948527473e-05, + "loss": 2.4397, + "mean_token_accuracy": 0.4996059238910675, + "num_tokens": 1080867077.0, + "step": 2115 + }, + { + "epoch": 0.5722011898323418, + "grad_norm": 1.8502825498580933, + "learning_rate": 1.966607063417262e-05, + "loss": 2.3436, + "mean_token_accuracy": 0.497137188911438, + "num_tokens": 1081376230.0, + "step": 2116 + }, + { + "epoch": 0.5724716062736614, + "grad_norm": 1.5454061031341553, + "learning_rate": 1.966564505391724e-05, + "loss": 2.1951, + "mean_token_accuracy": 0.525905966758728, + "num_tokens": 1081886836.0, + "step": 2117 + }, + { + "epoch": 0.5727420227149811, + "grad_norm": 1.7658472061157227, + "learning_rate": 1.966521920777438e-05, + "loss": 2.351, + "mean_token_accuracy": 0.5016244649887085, + "num_tokens": 1082411019.0, + "step": 2118 + }, + { + "epoch": 0.5730124391563007, + "grad_norm": 1.9180048704147339, + "learning_rate": 1.9664793095757115e-05, + "loss": 2.2395, + "mean_token_accuracy": 0.526303231716156, + "num_tokens": 1082898311.0, + "step": 2119 + }, + { + "epoch": 0.5732828555976204, + "grad_norm": 1.3560786247253418, + "learning_rate": 1.9664366717878513e-05, + "loss": 2.2739, + "mean_token_accuracy": 0.5104018449783325, + "num_tokens": 1083422582.0, + "step": 2120 + }, + { + "epoch": 0.57355327203894, + "grad_norm": 0.9142431020736694, + "learning_rate": 1.9663940074151662e-05, + "loss": 1.2258, + "mean_token_accuracy": 0.6814178228378296, + "num_tokens": 1083934843.0, + "step": 2121 + }, + { + "epoch": 0.5738236884802596, + "grad_norm": 2.8515944480895996, + "learning_rate": 1.9663513164589653e-05, + "loss": 2.2995, + "mean_token_accuracy": 0.4962426722049713, + "num_tokens": 1084401341.0, + "step": 2122 + }, + { + "epoch": 0.5740941049215792, + "grad_norm": 1.8071153163909912, + "learning_rate": 1.9663085989205578e-05, + "loss": 2.1622, + "mean_token_accuracy": 0.5039653182029724, + "num_tokens": 1084925553.0, + "step": 2123 + }, + { + "epoch": 0.5743645213628988, + "grad_norm": 1.7629542350769043, + "learning_rate": 1.9662658548012555e-05, + "loss": 2.3346, + "mean_token_accuracy": 0.5095629096031189, + "num_tokens": 1085442438.0, + "step": 2124 + }, + { + "epoch": 0.5746349378042185, + "grad_norm": 2.279010534286499, + "learning_rate": 1.9662230841023687e-05, + "loss": 2.4126, + "mean_token_accuracy": 0.4934064745903015, + "num_tokens": 1085966719.0, + "step": 2125 + }, + { + "epoch": 0.5749053542455381, + "grad_norm": 1.7025487422943115, + "learning_rate": 1.9661802868252106e-05, + "loss": 2.2514, + "mean_token_accuracy": 0.49538880586624146, + "num_tokens": 1086490313.0, + "step": 2126 + }, + { + "epoch": 0.5751757706868578, + "grad_norm": 1.982714295387268, + "learning_rate": 1.9661374629710937e-05, + "loss": 2.2368, + "mean_token_accuracy": 0.5066688060760498, + "num_tokens": 1087014597.0, + "step": 2127 + }, + { + "epoch": 0.5754461871281774, + "grad_norm": 1.964403748512268, + "learning_rate": 1.9660946125413324e-05, + "loss": 2.1513, + "mean_token_accuracy": 0.5166791677474976, + "num_tokens": 1087538812.0, + "step": 2128 + }, + { + "epoch": 0.575716603569497, + "grad_norm": 1.619145393371582, + "learning_rate": 1.966051735537241e-05, + "loss": 2.2348, + "mean_token_accuracy": 0.5290101170539856, + "num_tokens": 1088062936.0, + "step": 2129 + }, + { + "epoch": 0.5759870200108167, + "grad_norm": 2.015692710876465, + "learning_rate": 1.966008831960136e-05, + "loss": 2.2256, + "mean_token_accuracy": 0.5352363586425781, + "num_tokens": 1088488611.0, + "step": 2130 + }, + { + "epoch": 0.5762574364521363, + "grad_norm": 2.389392852783203, + "learning_rate": 1.9659659018113334e-05, + "loss": 2.3398, + "mean_token_accuracy": 0.49951452016830444, + "num_tokens": 1089012802.0, + "step": 2131 + }, + { + "epoch": 0.5765278528934559, + "grad_norm": 1.576268196105957, + "learning_rate": 1.96592294509215e-05, + "loss": 2.3831, + "mean_token_accuracy": 0.4904821813106537, + "num_tokens": 1089536926.0, + "step": 2132 + }, + { + "epoch": 0.5767982693347755, + "grad_norm": 1.803552508354187, + "learning_rate": 1.965879961803904e-05, + "loss": 2.2479, + "mean_token_accuracy": 0.5245980620384216, + "num_tokens": 1090061203.0, + "step": 2133 + }, + { + "epoch": 0.5770686857760952, + "grad_norm": 1.6870317459106445, + "learning_rate": 1.9658369519479143e-05, + "loss": 2.2106, + "mean_token_accuracy": 0.5078306794166565, + "num_tokens": 1090585423.0, + "step": 2134 + }, + { + "epoch": 0.5773391022174148, + "grad_norm": 1.388512372970581, + "learning_rate": 1.9657939155255007e-05, + "loss": 2.1561, + "mean_token_accuracy": 0.5258511304855347, + "num_tokens": 1091060720.0, + "step": 2135 + }, + { + "epoch": 0.5776095186587344, + "grad_norm": 1.7452021837234497, + "learning_rate": 1.9657508525379844e-05, + "loss": 2.2127, + "mean_token_accuracy": 0.5100942254066467, + "num_tokens": 1091584870.0, + "step": 2136 + }, + { + "epoch": 0.5778799351000541, + "grad_norm": 1.5754432678222656, + "learning_rate": 1.965707762986685e-05, + "loss": 2.4087, + "mean_token_accuracy": 0.4906471371650696, + "num_tokens": 1092084543.0, + "step": 2137 + }, + { + "epoch": 0.5781503515413737, + "grad_norm": 1.5063982009887695, + "learning_rate": 1.9656646468729263e-05, + "loss": 2.3105, + "mean_token_accuracy": 0.5095707178115845, + "num_tokens": 1092608542.0, + "step": 2138 + }, + { + "epoch": 0.5784207679826934, + "grad_norm": 1.668470859527588, + "learning_rate": 1.96562150419803e-05, + "loss": 2.3927, + "mean_token_accuracy": 0.5013554096221924, + "num_tokens": 1093132817.0, + "step": 2139 + }, + { + "epoch": 0.578691184424013, + "grad_norm": 1.441792368888855, + "learning_rate": 1.96557833496332e-05, + "loss": 2.2247, + "mean_token_accuracy": 0.5052025318145752, + "num_tokens": 1093637909.0, + "step": 2140 + }, + { + "epoch": 0.5789616008653327, + "grad_norm": 0.9933393597602844, + "learning_rate": 1.965535139170122e-05, + "loss": 1.1903, + "mean_token_accuracy": 0.687154233455658, + "num_tokens": 1094156554.0, + "step": 2141 + }, + { + "epoch": 0.5792320173066522, + "grad_norm": 2.7217488288879395, + "learning_rate": 1.9654919168197598e-05, + "loss": 2.2811, + "mean_token_accuracy": 0.5106374621391296, + "num_tokens": 1094680833.0, + "step": 2142 + }, + { + "epoch": 0.5795024337479718, + "grad_norm": 2.2254137992858887, + "learning_rate": 1.965448667913561e-05, + "loss": 2.3262, + "mean_token_accuracy": 0.5039616823196411, + "num_tokens": 1095205027.0, + "step": 2143 + }, + { + "epoch": 0.5797728501892915, + "grad_norm": 1.7819668054580688, + "learning_rate": 1.9654053924528517e-05, + "loss": 2.4096, + "mean_token_accuracy": 0.511760950088501, + "num_tokens": 1095636000.0, + "step": 2144 + }, + { + "epoch": 0.5800432666306111, + "grad_norm": 2.1217379570007324, + "learning_rate": 1.96536209043896e-05, + "loss": 2.3808, + "mean_token_accuracy": 0.5065045356750488, + "num_tokens": 1096156148.0, + "step": 2145 + }, + { + "epoch": 0.5803136830719308, + "grad_norm": 1.9884696006774902, + "learning_rate": 1.965318761873214e-05, + "loss": 2.3423, + "mean_token_accuracy": 0.5082817077636719, + "num_tokens": 1096616907.0, + "step": 2146 + }, + { + "epoch": 0.5805840995132504, + "grad_norm": 2.0105934143066406, + "learning_rate": 1.965275406756944e-05, + "loss": 2.2343, + "mean_token_accuracy": 0.5214532613754272, + "num_tokens": 1097141176.0, + "step": 2147 + }, + { + "epoch": 0.58085451595457, + "grad_norm": 1.7171534299850464, + "learning_rate": 1.96523202509148e-05, + "loss": 2.2801, + "mean_token_accuracy": 0.5222220420837402, + "num_tokens": 1097665433.0, + "step": 2148 + }, + { + "epoch": 0.5811249323958897, + "grad_norm": 1.4616352319717407, + "learning_rate": 1.9651886168781532e-05, + "loss": 2.167, + "mean_token_accuracy": 0.5174895524978638, + "num_tokens": 1098189701.0, + "step": 2149 + }, + { + "epoch": 0.5813953488372093, + "grad_norm": 1.8131319284439087, + "learning_rate": 1.965145182118295e-05, + "loss": 2.338, + "mean_token_accuracy": 0.48085135221481323, + "num_tokens": 1098713814.0, + "step": 2150 + }, + { + "epoch": 0.581665765278529, + "grad_norm": 1.6241564750671387, + "learning_rate": 1.9651017208132382e-05, + "loss": 2.2159, + "mean_token_accuracy": 0.5125553011894226, + "num_tokens": 1099205246.0, + "step": 2151 + }, + { + "epoch": 0.5819361817198486, + "grad_norm": 1.739920973777771, + "learning_rate": 1.965058232964317e-05, + "loss": 2.2613, + "mean_token_accuracy": 0.5002936720848083, + "num_tokens": 1099729459.0, + "step": 2152 + }, + { + "epoch": 0.5822065981611682, + "grad_norm": 1.449051022529602, + "learning_rate": 1.965014718572865e-05, + "loss": 2.3257, + "mean_token_accuracy": 0.5015112161636353, + "num_tokens": 1100253734.0, + "step": 2153 + }, + { + "epoch": 0.5824770146024878, + "grad_norm": 1.4322881698608398, + "learning_rate": 1.9649711776402173e-05, + "loss": 2.3107, + "mean_token_accuracy": 0.4928753972053528, + "num_tokens": 1100777886.0, + "step": 2154 + }, + { + "epoch": 0.5827474310438074, + "grad_norm": 1.8101394176483154, + "learning_rate": 1.9649276101677106e-05, + "loss": 2.3257, + "mean_token_accuracy": 0.5062799453735352, + "num_tokens": 1101302113.0, + "step": 2155 + }, + { + "epoch": 0.5830178474851271, + "grad_norm": 1.4381002187728882, + "learning_rate": 1.9648840161566812e-05, + "loss": 2.4185, + "mean_token_accuracy": 0.5020055174827576, + "num_tokens": 1101826395.0, + "step": 2156 + }, + { + "epoch": 0.5832882639264467, + "grad_norm": 1.558353066444397, + "learning_rate": 1.9648403956084666e-05, + "loss": 2.2652, + "mean_token_accuracy": 0.5184463858604431, + "num_tokens": 1102313466.0, + "step": 2157 + }, + { + "epoch": 0.5835586803677664, + "grad_norm": 1.4288604259490967, + "learning_rate": 1.964796748524405e-05, + "loss": 2.329, + "mean_token_accuracy": 0.5222123861312866, + "num_tokens": 1102784915.0, + "step": 2158 + }, + { + "epoch": 0.583829096809086, + "grad_norm": 1.2861219644546509, + "learning_rate": 1.9647530749058364e-05, + "loss": 2.2134, + "mean_token_accuracy": 0.5024749040603638, + "num_tokens": 1103309188.0, + "step": 2159 + }, + { + "epoch": 0.5840995132504057, + "grad_norm": 1.6493096351623535, + "learning_rate": 1.9647093747541e-05, + "loss": 2.2712, + "mean_token_accuracy": 0.5120601654052734, + "num_tokens": 1103833451.0, + "step": 2160 + }, + { + "epoch": 0.5843699296917253, + "grad_norm": 0.9554152488708496, + "learning_rate": 1.9646656480705376e-05, + "loss": 1.2468, + "mean_token_accuracy": 0.6817188262939453, + "num_tokens": 1104307192.0, + "step": 2161 + }, + { + "epoch": 0.5846403461330449, + "grad_norm": 2.3465962409973145, + "learning_rate": 1.96462189485649e-05, + "loss": 2.3482, + "mean_token_accuracy": 0.5088643431663513, + "num_tokens": 1104831367.0, + "step": 2162 + }, + { + "epoch": 0.5849107625743645, + "grad_norm": 1.951401710510254, + "learning_rate": 1.9645781151133e-05, + "loss": 2.3162, + "mean_token_accuracy": 0.4959971308708191, + "num_tokens": 1105355527.0, + "step": 2163 + }, + { + "epoch": 0.5851811790156841, + "grad_norm": 1.5667531490325928, + "learning_rate": 1.964534308842311e-05, + "loss": 2.1848, + "mean_token_accuracy": 0.5157544612884521, + "num_tokens": 1105879627.0, + "step": 2164 + }, + { + "epoch": 0.5854515954570038, + "grad_norm": 1.6848148107528687, + "learning_rate": 1.9644904760448663e-05, + "loss": 2.3295, + "mean_token_accuracy": 0.4893074631690979, + "num_tokens": 1106403873.0, + "step": 2165 + }, + { + "epoch": 0.5857220118983234, + "grad_norm": 1.5397710800170898, + "learning_rate": 1.9644466167223123e-05, + "loss": 2.3559, + "mean_token_accuracy": 0.4834628701210022, + "num_tokens": 1106928088.0, + "step": 2166 + }, + { + "epoch": 0.585992428339643, + "grad_norm": 1.5513062477111816, + "learning_rate": 1.9644027308759936e-05, + "loss": 2.0554, + "mean_token_accuracy": 0.5285289883613586, + "num_tokens": 1107452213.0, + "step": 2167 + }, + { + "epoch": 0.5862628447809627, + "grad_norm": 1.6784610748291016, + "learning_rate": 1.9643588185072572e-05, + "loss": 2.2666, + "mean_token_accuracy": 0.5144408941268921, + "num_tokens": 1107976483.0, + "step": 2168 + }, + { + "epoch": 0.5865332612222823, + "grad_norm": 1.8209625482559204, + "learning_rate": 1.9643148796174504e-05, + "loss": 2.1978, + "mean_token_accuracy": 0.5256873369216919, + "num_tokens": 1108500759.0, + "step": 2169 + }, + { + "epoch": 0.586803677663602, + "grad_norm": 1.7519361972808838, + "learning_rate": 1.9642709142079208e-05, + "loss": 2.2499, + "mean_token_accuracy": 0.5357969403266907, + "num_tokens": 1108961211.0, + "step": 2170 + }, + { + "epoch": 0.5870740941049216, + "grad_norm": 3.2090041637420654, + "learning_rate": 1.9642269222800184e-05, + "loss": 1.821, + "mean_token_accuracy": 0.5887125730514526, + "num_tokens": 1109421021.0, + "step": 2171 + }, + { + "epoch": 0.5873445105462413, + "grad_norm": 1.7685856819152832, + "learning_rate": 1.9641829038350926e-05, + "loss": 2.1902, + "mean_token_accuracy": 0.5272490978240967, + "num_tokens": 1109945263.0, + "step": 2172 + }, + { + "epoch": 0.5876149269875608, + "grad_norm": 1.4053010940551758, + "learning_rate": 1.9641388588744933e-05, + "loss": 2.2164, + "mean_token_accuracy": 0.5097631812095642, + "num_tokens": 1110409352.0, + "step": 2173 + }, + { + "epoch": 0.5878853434288804, + "grad_norm": 1.4732171297073364, + "learning_rate": 1.964094787399573e-05, + "loss": 2.1375, + "mean_token_accuracy": 0.5098499655723572, + "num_tokens": 1110933583.0, + "step": 2174 + }, + { + "epoch": 0.5881557598702001, + "grad_norm": 1.5387378931045532, + "learning_rate": 1.9640506894116835e-05, + "loss": 2.2548, + "mean_token_accuracy": 0.5417658090591431, + "num_tokens": 1111388814.0, + "step": 2175 + }, + { + "epoch": 0.5884261763115197, + "grad_norm": 1.7353333234786987, + "learning_rate": 1.9640065649121774e-05, + "loss": 2.2978, + "mean_token_accuracy": 0.5143300890922546, + "num_tokens": 1111889516.0, + "step": 2176 + }, + { + "epoch": 0.5886965927528394, + "grad_norm": 1.8431622982025146, + "learning_rate": 1.9639624139024097e-05, + "loss": 2.1813, + "mean_token_accuracy": 0.5337220430374146, + "num_tokens": 1112380151.0, + "step": 2177 + }, + { + "epoch": 0.588967009194159, + "grad_norm": 1.3373700380325317, + "learning_rate": 1.9639182363837338e-05, + "loss": 2.3309, + "mean_token_accuracy": 0.5018113851547241, + "num_tokens": 1112904282.0, + "step": 2178 + }, + { + "epoch": 0.5892374256354787, + "grad_norm": 1.7387890815734863, + "learning_rate": 1.963874032357506e-05, + "loss": 2.2566, + "mean_token_accuracy": 0.529412567615509, + "num_tokens": 1113367310.0, + "step": 2179 + }, + { + "epoch": 0.5895078420767983, + "grad_norm": 1.5802947282791138, + "learning_rate": 1.9638298018250825e-05, + "loss": 2.1683, + "mean_token_accuracy": 0.5074392557144165, + "num_tokens": 1113881509.0, + "step": 2180 + }, + { + "epoch": 0.5897782585181179, + "grad_norm": 0.9546454548835754, + "learning_rate": 1.9637855447878206e-05, + "loss": 1.2059, + "mean_token_accuracy": 0.682429313659668, + "num_tokens": 1114405725.0, + "step": 2181 + }, + { + "epoch": 0.5900486749594376, + "grad_norm": 4.488793849945068, + "learning_rate": 1.9637412612470773e-05, + "loss": 2.2903, + "mean_token_accuracy": 0.5155333280563354, + "num_tokens": 1114930007.0, + "step": 2182 + }, + { + "epoch": 0.5903190914007571, + "grad_norm": 2.99322772026062, + "learning_rate": 1.9636969512042124e-05, + "loss": 2.1618, + "mean_token_accuracy": 0.5297868847846985, + "num_tokens": 1115418365.0, + "step": 2183 + }, + { + "epoch": 0.5905895078420768, + "grad_norm": 1.673242211341858, + "learning_rate": 1.963652614660585e-05, + "loss": 2.2299, + "mean_token_accuracy": 0.5271136164665222, + "num_tokens": 1115933691.0, + "step": 2184 + }, + { + "epoch": 0.5908599242833964, + "grad_norm": 2.8587396144866943, + "learning_rate": 1.9636082516175556e-05, + "loss": 2.3944, + "mean_token_accuracy": 0.49909427762031555, + "num_tokens": 1116451988.0, + "step": 2185 + }, + { + "epoch": 0.591130340724716, + "grad_norm": 2.9581637382507324, + "learning_rate": 1.9635638620764853e-05, + "loss": 2.284, + "mean_token_accuracy": 0.5308417081832886, + "num_tokens": 1116976217.0, + "step": 2186 + }, + { + "epoch": 0.5914007571660357, + "grad_norm": 2.0312435626983643, + "learning_rate": 1.9635194460387362e-05, + "loss": 2.1619, + "mean_token_accuracy": 0.5177211761474609, + "num_tokens": 1117500194.0, + "step": 2187 + }, + { + "epoch": 0.5916711736073553, + "grad_norm": 2.0869767665863037, + "learning_rate": 1.963475003505671e-05, + "loss": 2.2396, + "mean_token_accuracy": 0.4921882450580597, + "num_tokens": 1118024304.0, + "step": 2188 + }, + { + "epoch": 0.591941590048675, + "grad_norm": 2.27876615524292, + "learning_rate": 1.963430534478653e-05, + "loss": 2.2415, + "mean_token_accuracy": 0.5197724103927612, + "num_tokens": 1118548497.0, + "step": 2189 + }, + { + "epoch": 0.5922120064899946, + "grad_norm": 1.7358360290527344, + "learning_rate": 1.9633860389590476e-05, + "loss": 2.1417, + "mean_token_accuracy": 0.5116521120071411, + "num_tokens": 1119042561.0, + "step": 2190 + }, + { + "epoch": 0.5924824229313143, + "grad_norm": 1.877323031425476, + "learning_rate": 1.9633415169482194e-05, + "loss": 2.1548, + "mean_token_accuracy": 0.5170331597328186, + "num_tokens": 1119566743.0, + "step": 2191 + }, + { + "epoch": 0.5927528393726339, + "grad_norm": 1.8318042755126953, + "learning_rate": 1.963296968447534e-05, + "loss": 2.3353, + "mean_token_accuracy": 0.5126356482505798, + "num_tokens": 1120054759.0, + "step": 2192 + }, + { + "epoch": 0.5930232558139535, + "grad_norm": 1.652151346206665, + "learning_rate": 1.9632523934583592e-05, + "loss": 2.3359, + "mean_token_accuracy": 0.49826568365097046, + "num_tokens": 1120579018.0, + "step": 2193 + }, + { + "epoch": 0.5932936722552731, + "grad_norm": 2.139207363128662, + "learning_rate": 1.9632077919820625e-05, + "loss": 2.3305, + "mean_token_accuracy": 0.5082290768623352, + "num_tokens": 1121102621.0, + "step": 2194 + }, + { + "epoch": 0.5935640886965927, + "grad_norm": 1.605640172958374, + "learning_rate": 1.9631631640200117e-05, + "loss": 2.2727, + "mean_token_accuracy": 0.5073832869529724, + "num_tokens": 1121626872.0, + "step": 2195 + }, + { + "epoch": 0.5938345051379124, + "grad_norm": 1.7381324768066406, + "learning_rate": 1.963118509573577e-05, + "loss": 2.111, + "mean_token_accuracy": 0.520506739616394, + "num_tokens": 1122151142.0, + "step": 2196 + }, + { + "epoch": 0.594104921579232, + "grad_norm": 1.8873134851455688, + "learning_rate": 1.9630738286441283e-05, + "loss": 2.065, + "mean_token_accuracy": 0.5386955142021179, + "num_tokens": 1122675241.0, + "step": 2197 + }, + { + "epoch": 0.5943753380205516, + "grad_norm": 1.5979502201080322, + "learning_rate": 1.9630291212330363e-05, + "loss": 2.2181, + "mean_token_accuracy": 0.5229296684265137, + "num_tokens": 1123199498.0, + "step": 2198 + }, + { + "epoch": 0.5946457544618713, + "grad_norm": 1.8590339422225952, + "learning_rate": 1.9629843873416727e-05, + "loss": 2.258, + "mean_token_accuracy": 0.5041812658309937, + "num_tokens": 1123689134.0, + "step": 2199 + }, + { + "epoch": 0.5949161709031909, + "grad_norm": 1.8275610208511353, + "learning_rate": 1.96293962697141e-05, + "loss": 2.3201, + "mean_token_accuracy": 0.48822498321533203, + "num_tokens": 1124176044.0, + "step": 2200 + }, + { + "epoch": 0.5951865873445106, + "grad_norm": 0.7938820719718933, + "learning_rate": 1.962894840123622e-05, + "loss": 1.1953, + "mean_token_accuracy": 0.690011739730835, + "num_tokens": 1124700319.0, + "step": 2201 + }, + { + "epoch": 0.5954570037858302, + "grad_norm": 2.5357203483581543, + "learning_rate": 1.962850026799683e-05, + "loss": 2.2, + "mean_token_accuracy": 0.5018659830093384, + "num_tokens": 1125224422.0, + "step": 2202 + }, + { + "epoch": 0.5957274202271499, + "grad_norm": 2.443579912185669, + "learning_rate": 1.9628051870009673e-05, + "loss": 2.2127, + "mean_token_accuracy": 0.5163438320159912, + "num_tokens": 1125748701.0, + "step": 2203 + }, + { + "epoch": 0.5959978366684694, + "grad_norm": 1.5956599712371826, + "learning_rate": 1.9627603207288515e-05, + "loss": 2.2202, + "mean_token_accuracy": 0.518927812576294, + "num_tokens": 1126272841.0, + "step": 2204 + }, + { + "epoch": 0.596268253109789, + "grad_norm": 2.1984095573425293, + "learning_rate": 1.9627154279847117e-05, + "loss": 2.2275, + "mean_token_accuracy": 0.5015804767608643, + "num_tokens": 1126797025.0, + "step": 2205 + }, + { + "epoch": 0.5965386695511087, + "grad_norm": 2.6442270278930664, + "learning_rate": 1.962670508769925e-05, + "loss": 2.4004, + "mean_token_accuracy": 0.4972009062767029, + "num_tokens": 1127321286.0, + "step": 2206 + }, + { + "epoch": 0.5968090859924283, + "grad_norm": 1.8118622303009033, + "learning_rate": 1.962625563085871e-05, + "loss": 2.2779, + "mean_token_accuracy": 0.510111391544342, + "num_tokens": 1127816421.0, + "step": 2207 + }, + { + "epoch": 0.597079502433748, + "grad_norm": 1.8301371335983276, + "learning_rate": 1.9625805909339276e-05, + "loss": 2.2306, + "mean_token_accuracy": 0.5080976486206055, + "num_tokens": 1128340601.0, + "step": 2208 + }, + { + "epoch": 0.5973499188750676, + "grad_norm": 2.079066514968872, + "learning_rate": 1.9625355923154745e-05, + "loss": 2.2448, + "mean_token_accuracy": 0.5047357082366943, + "num_tokens": 1128864758.0, + "step": 2209 + }, + { + "epoch": 0.5976203353163873, + "grad_norm": 1.5731106996536255, + "learning_rate": 1.9624905672318933e-05, + "loss": 2.1934, + "mean_token_accuracy": 0.5113447904586792, + "num_tokens": 1129389038.0, + "step": 2210 + }, + { + "epoch": 0.5978907517577069, + "grad_norm": 1.6644481420516968, + "learning_rate": 1.962445515684565e-05, + "loss": 2.1571, + "mean_token_accuracy": 0.52021723985672, + "num_tokens": 1129913239.0, + "step": 2211 + }, + { + "epoch": 0.5981611681990265, + "grad_norm": 1.9888888597488403, + "learning_rate": 1.962400437674872e-05, + "loss": 2.1374, + "mean_token_accuracy": 0.5046589374542236, + "num_tokens": 1130393778.0, + "step": 2212 + }, + { + "epoch": 0.5984315846403462, + "grad_norm": 1.3051775693893433, + "learning_rate": 1.9623553332041975e-05, + "loss": 2.265, + "mean_token_accuracy": 0.5031786561012268, + "num_tokens": 1130918026.0, + "step": 2213 + }, + { + "epoch": 0.5987020010816657, + "grad_norm": 1.999097228050232, + "learning_rate": 1.9623102022739254e-05, + "loss": 2.2141, + "mean_token_accuracy": 0.5414313077926636, + "num_tokens": 1131363575.0, + "step": 2214 + }, + { + "epoch": 0.5989724175229854, + "grad_norm": 1.8889700174331665, + "learning_rate": 1.96226504488544e-05, + "loss": 2.1547, + "mean_token_accuracy": 0.5312731862068176, + "num_tokens": 1131887748.0, + "step": 2215 + }, + { + "epoch": 0.599242833964305, + "grad_norm": 1.4914815425872803, + "learning_rate": 1.9622198610401277e-05, + "loss": 2.2894, + "mean_token_accuracy": 0.5029016733169556, + "num_tokens": 1132411957.0, + "step": 2216 + }, + { + "epoch": 0.5995132504056246, + "grad_norm": 2.0349984169006348, + "learning_rate": 1.9621746507393743e-05, + "loss": 2.3481, + "mean_token_accuracy": 0.5071622133255005, + "num_tokens": 1132902341.0, + "step": 2217 + }, + { + "epoch": 0.5997836668469443, + "grad_norm": 1.9746521711349487, + "learning_rate": 1.9621294139845673e-05, + "loss": 2.3185, + "mean_token_accuracy": 0.494345098733902, + "num_tokens": 1133426567.0, + "step": 2218 + }, + { + "epoch": 0.6000540832882639, + "grad_norm": 1.3759437799453735, + "learning_rate": 1.9620841507770945e-05, + "loss": 2.2445, + "mean_token_accuracy": 0.5305469632148743, + "num_tokens": 1133889846.0, + "step": 2219 + }, + { + "epoch": 0.6003244997295836, + "grad_norm": 1.9158855676651, + "learning_rate": 1.9620388611183443e-05, + "loss": 2.3108, + "mean_token_accuracy": 0.5003746747970581, + "num_tokens": 1134373479.0, + "step": 2220 + }, + { + "epoch": 0.6005949161709032, + "grad_norm": 0.9038177728652954, + "learning_rate": 1.9619935450097073e-05, + "loss": 1.1881, + "mean_token_accuracy": 0.6949093341827393, + "num_tokens": 1134897713.0, + "step": 2221 + }, + { + "epoch": 0.6008653326122229, + "grad_norm": 2.304807186126709, + "learning_rate": 1.961948202452573e-05, + "loss": 2.1192, + "mean_token_accuracy": 0.5244648456573486, + "num_tokens": 1135421914.0, + "step": 2222 + }, + { + "epoch": 0.6011357490535425, + "grad_norm": 1.7086782455444336, + "learning_rate": 1.9619028334483332e-05, + "loss": 2.1154, + "mean_token_accuracy": 0.5006935000419617, + "num_tokens": 1135946125.0, + "step": 2223 + }, + { + "epoch": 0.601406165494862, + "grad_norm": 1.5382812023162842, + "learning_rate": 1.9618574379983796e-05, + "loss": 2.1837, + "mean_token_accuracy": 0.5110519528388977, + "num_tokens": 1136470374.0, + "step": 2224 + }, + { + "epoch": 0.6016765819361817, + "grad_norm": 1.9268311262130737, + "learning_rate": 1.961812016104105e-05, + "loss": 2.1487, + "mean_token_accuracy": 0.5280000567436218, + "num_tokens": 1136994654.0, + "step": 2225 + }, + { + "epoch": 0.6019469983775013, + "grad_norm": 1.6794569492340088, + "learning_rate": 1.961766567766904e-05, + "loss": 2.1103, + "mean_token_accuracy": 0.5261956453323364, + "num_tokens": 1137518829.0, + "step": 2226 + }, + { + "epoch": 0.602217414818821, + "grad_norm": 1.6106420755386353, + "learning_rate": 1.96172109298817e-05, + "loss": 2.0927, + "mean_token_accuracy": 0.5250627398490906, + "num_tokens": 1138043014.0, + "step": 2227 + }, + { + "epoch": 0.6024878312601406, + "grad_norm": 1.9805426597595215, + "learning_rate": 1.961675591769299e-05, + "loss": 2.2466, + "mean_token_accuracy": 0.5176465511322021, + "num_tokens": 1138567222.0, + "step": 2228 + }, + { + "epoch": 0.6027582477014602, + "grad_norm": 1.8410948514938354, + "learning_rate": 1.9616300641116867e-05, + "loss": 2.2166, + "mean_token_accuracy": 0.5093424916267395, + "num_tokens": 1139091463.0, + "step": 2229 + }, + { + "epoch": 0.6030286641427799, + "grad_norm": 1.710126280784607, + "learning_rate": 1.96158451001673e-05, + "loss": 2.3105, + "mean_token_accuracy": 0.5015121102333069, + "num_tokens": 1139615721.0, + "step": 2230 + }, + { + "epoch": 0.6032990805840995, + "grad_norm": 1.5503002405166626, + "learning_rate": 1.9615389294858266e-05, + "loss": 2.236, + "mean_token_accuracy": 0.5121361017227173, + "num_tokens": 1140084108.0, + "step": 2231 + }, + { + "epoch": 0.6035694970254192, + "grad_norm": 1.799426794052124, + "learning_rate": 1.9614933225203754e-05, + "loss": 2.3352, + "mean_token_accuracy": 0.4954860210418701, + "num_tokens": 1140584771.0, + "step": 2232 + }, + { + "epoch": 0.6038399134667388, + "grad_norm": 1.7010531425476074, + "learning_rate": 1.9614476891217758e-05, + "loss": 2.314, + "mean_token_accuracy": 0.5190154314041138, + "num_tokens": 1141108981.0, + "step": 2233 + }, + { + "epoch": 0.6041103299080585, + "grad_norm": 1.5769709348678589, + "learning_rate": 1.9614020292914276e-05, + "loss": 2.2831, + "mean_token_accuracy": 0.5048248171806335, + "num_tokens": 1141633138.0, + "step": 2234 + }, + { + "epoch": 0.604380746349378, + "grad_norm": 2.0003652572631836, + "learning_rate": 1.961356343030732e-05, + "loss": 1.8983, + "mean_token_accuracy": 0.5528992414474487, + "num_tokens": 1142121298.0, + "step": 2235 + }, + { + "epoch": 0.6046511627906976, + "grad_norm": 1.559779167175293, + "learning_rate": 1.9613106303410905e-05, + "loss": 2.3096, + "mean_token_accuracy": 0.5022232532501221, + "num_tokens": 1142645368.0, + "step": 2236 + }, + { + "epoch": 0.6049215792320173, + "grad_norm": 1.8683733940124512, + "learning_rate": 1.9612648912239062e-05, + "loss": 2.1917, + "mean_token_accuracy": 0.5323070287704468, + "num_tokens": 1143169648.0, + "step": 2237 + }, + { + "epoch": 0.6051919956733369, + "grad_norm": 1.5676827430725098, + "learning_rate": 1.9612191256805818e-05, + "loss": 2.2339, + "mean_token_accuracy": 0.504543125629425, + "num_tokens": 1143682898.0, + "step": 2238 + }, + { + "epoch": 0.6054624121146566, + "grad_norm": 1.8831013441085815, + "learning_rate": 1.9611733337125223e-05, + "loss": 2.2631, + "mean_token_accuracy": 0.5239826440811157, + "num_tokens": 1144207143.0, + "step": 2239 + }, + { + "epoch": 0.6057328285559762, + "grad_norm": 1.5852210521697998, + "learning_rate": 1.961127515321132e-05, + "loss": 2.3492, + "mean_token_accuracy": 0.5102561116218567, + "num_tokens": 1144701227.0, + "step": 2240 + }, + { + "epoch": 0.6060032449972959, + "grad_norm": 0.754731297492981, + "learning_rate": 1.9610816705078176e-05, + "loss": 1.1945, + "mean_token_accuracy": 0.6831611394882202, + "num_tokens": 1145225476.0, + "step": 2241 + }, + { + "epoch": 0.6062736614386155, + "grad_norm": 2.6381995677948, + "learning_rate": 1.961035799273985e-05, + "loss": 2.3573, + "mean_token_accuracy": 0.5005422234535217, + "num_tokens": 1145749695.0, + "step": 2242 + }, + { + "epoch": 0.6065440778799351, + "grad_norm": 2.437753915786743, + "learning_rate": 1.9609899016210422e-05, + "loss": 2.339, + "mean_token_accuracy": 0.5147512555122375, + "num_tokens": 1146273868.0, + "step": 2243 + }, + { + "epoch": 0.6068144943212548, + "grad_norm": 1.8724193572998047, + "learning_rate": 1.9609439775503968e-05, + "loss": 2.4472, + "mean_token_accuracy": 0.47982490062713623, + "num_tokens": 1146798034.0, + "step": 2244 + }, + { + "epoch": 0.6070849107625743, + "grad_norm": 1.935068964958191, + "learning_rate": 1.9608980270634586e-05, + "loss": 2.3352, + "mean_token_accuracy": 0.5025498867034912, + "num_tokens": 1147322303.0, + "step": 2245 + }, + { + "epoch": 0.607355327203894, + "grad_norm": 1.3549222946166992, + "learning_rate": 1.960852050161637e-05, + "loss": 2.1712, + "mean_token_accuracy": 0.5246312618255615, + "num_tokens": 1147820000.0, + "step": 2246 + }, + { + "epoch": 0.6076257436452136, + "grad_norm": 2.592766523361206, + "learning_rate": 1.9608060468463433e-05, + "loss": 2.3323, + "mean_token_accuracy": 0.5033100247383118, + "num_tokens": 1148304229.0, + "step": 2247 + }, + { + "epoch": 0.6078961600865332, + "grad_norm": 1.8797879219055176, + "learning_rate": 1.960760017118988e-05, + "loss": 2.2727, + "mean_token_accuracy": 0.5210460424423218, + "num_tokens": 1148820904.0, + "step": 2248 + }, + { + "epoch": 0.6081665765278529, + "grad_norm": 1.716614007949829, + "learning_rate": 1.9607139609809848e-05, + "loss": 2.1141, + "mean_token_accuracy": 0.5277361869812012, + "num_tokens": 1149339586.0, + "step": 2249 + }, + { + "epoch": 0.6084369929691725, + "grad_norm": 1.852409839630127, + "learning_rate": 1.9606678784337452e-05, + "loss": 2.3219, + "mean_token_accuracy": 0.47373467683792114, + "num_tokens": 1149863865.0, + "step": 2250 + }, + { + "epoch": 0.6087074094104922, + "grad_norm": 1.9282946586608887, + "learning_rate": 1.9606217694786847e-05, + "loss": 2.0257, + "mean_token_accuracy": 0.5456990599632263, + "num_tokens": 1150388080.0, + "step": 2251 + }, + { + "epoch": 0.6089778258518118, + "grad_norm": 3.139216423034668, + "learning_rate": 1.9605756341172173e-05, + "loss": 2.3608, + "mean_token_accuracy": 0.5253826379776001, + "num_tokens": 1150912318.0, + "step": 2252 + }, + { + "epoch": 0.6092482422931315, + "grad_norm": 2.3536770343780518, + "learning_rate": 1.9605294723507582e-05, + "loss": 2.281, + "mean_token_accuracy": 0.5028672218322754, + "num_tokens": 1151436495.0, + "step": 2253 + }, + { + "epoch": 0.6095186587344511, + "grad_norm": 12.019201278686523, + "learning_rate": 1.960483284180725e-05, + "loss": 2.1731, + "mean_token_accuracy": 0.5482041835784912, + "num_tokens": 1151960695.0, + "step": 2254 + }, + { + "epoch": 0.6097890751757706, + "grad_norm": 2.730437755584717, + "learning_rate": 1.9604370696085333e-05, + "loss": 2.3193, + "mean_token_accuracy": 0.5072603225708008, + "num_tokens": 1152484974.0, + "step": 2255 + }, + { + "epoch": 0.6100594916170903, + "grad_norm": 1.978909969329834, + "learning_rate": 1.9603908286356024e-05, + "loss": 2.3575, + "mean_token_accuracy": 0.4827401340007782, + "num_tokens": 1153009233.0, + "step": 2256 + }, + { + "epoch": 0.6103299080584099, + "grad_norm": 1.9947785139083862, + "learning_rate": 1.9603445612633505e-05, + "loss": 2.3181, + "mean_token_accuracy": 0.495801717042923, + "num_tokens": 1153533505.0, + "step": 2257 + }, + { + "epoch": 0.6106003244997296, + "grad_norm": 2.4128265380859375, + "learning_rate": 1.9602982674931972e-05, + "loss": 2.2278, + "mean_token_accuracy": 0.5176162719726562, + "num_tokens": 1154022658.0, + "step": 2258 + }, + { + "epoch": 0.6108707409410492, + "grad_norm": 1.7025468349456787, + "learning_rate": 1.9602519473265635e-05, + "loss": 2.1418, + "mean_token_accuracy": 0.5196125507354736, + "num_tokens": 1154546734.0, + "step": 2259 + }, + { + "epoch": 0.6111411573823688, + "grad_norm": 2.1613447666168213, + "learning_rate": 1.96020560076487e-05, + "loss": 2.298, + "mean_token_accuracy": 0.5221959352493286, + "num_tokens": 1155070927.0, + "step": 2260 + }, + { + "epoch": 0.6114115738236885, + "grad_norm": 1.052717685699463, + "learning_rate": 1.960159227809539e-05, + "loss": 1.1441, + "mean_token_accuracy": 0.6931196451187134, + "num_tokens": 1155573976.0, + "step": 2261 + }, + { + "epoch": 0.6116819902650081, + "grad_norm": 2.4327468872070312, + "learning_rate": 1.9601128284619934e-05, + "loss": 2.2898, + "mean_token_accuracy": 0.5274252891540527, + "num_tokens": 1156098113.0, + "step": 2262 + }, + { + "epoch": 0.6119524067063278, + "grad_norm": 1.881588101387024, + "learning_rate": 1.9600664027236563e-05, + "loss": 2.2721, + "mean_token_accuracy": 0.5152384042739868, + "num_tokens": 1156595435.0, + "step": 2263 + }, + { + "epoch": 0.6122228231476474, + "grad_norm": 1.897541880607605, + "learning_rate": 1.9600199505959532e-05, + "loss": 2.2728, + "mean_token_accuracy": 0.508259654045105, + "num_tokens": 1157119505.0, + "step": 2264 + }, + { + "epoch": 0.6124932395889671, + "grad_norm": 2.214533567428589, + "learning_rate": 1.9599734720803087e-05, + "loss": 2.3302, + "mean_token_accuracy": 0.49730390310287476, + "num_tokens": 1157643750.0, + "step": 2265 + }, + { + "epoch": 0.6127636560302866, + "grad_norm": 1.7332911491394043, + "learning_rate": 1.959926967178149e-05, + "loss": 2.2823, + "mean_token_accuracy": 0.5089775323867798, + "num_tokens": 1158167956.0, + "step": 2266 + }, + { + "epoch": 0.6130340724716062, + "grad_norm": 1.7068064212799072, + "learning_rate": 1.9598804358909012e-05, + "loss": 2.2072, + "mean_token_accuracy": 0.5213558077812195, + "num_tokens": 1158681414.0, + "step": 2267 + }, + { + "epoch": 0.6133044889129259, + "grad_norm": 1.7210355997085571, + "learning_rate": 1.959833878219993e-05, + "loss": 2.2438, + "mean_token_accuracy": 0.4860972464084625, + "num_tokens": 1159205611.0, + "step": 2268 + }, + { + "epoch": 0.6135749053542455, + "grad_norm": 1.5527604818344116, + "learning_rate": 1.9597872941668524e-05, + "loss": 2.2192, + "mean_token_accuracy": 0.5137848854064941, + "num_tokens": 1159729693.0, + "step": 2269 + }, + { + "epoch": 0.6138453217955652, + "grad_norm": 1.5776411294937134, + "learning_rate": 1.9597406837329097e-05, + "loss": 2.3107, + "mean_token_accuracy": 0.5090092420578003, + "num_tokens": 1160253920.0, + "step": 2270 + }, + { + "epoch": 0.6141157382368848, + "grad_norm": 1.659403681755066, + "learning_rate": 1.9596940469195942e-05, + "loss": 2.2598, + "mean_token_accuracy": 0.517869234085083, + "num_tokens": 1160755144.0, + "step": 2271 + }, + { + "epoch": 0.6143861546782045, + "grad_norm": 1.6548042297363281, + "learning_rate": 1.959647383728337e-05, + "loss": 2.0799, + "mean_token_accuracy": 0.534974217414856, + "num_tokens": 1161279312.0, + "step": 2272 + }, + { + "epoch": 0.6146565711195241, + "grad_norm": 1.7286821603775024, + "learning_rate": 1.95960069416057e-05, + "loss": 2.2216, + "mean_token_accuracy": 0.5252866148948669, + "num_tokens": 1161778623.0, + "step": 2273 + }, + { + "epoch": 0.6149269875608437, + "grad_norm": 1.3757423162460327, + "learning_rate": 1.9595539782177263e-05, + "loss": 2.3254, + "mean_token_accuracy": 0.4999566078186035, + "num_tokens": 1162302900.0, + "step": 2274 + }, + { + "epoch": 0.6151974040021634, + "grad_norm": 1.6651244163513184, + "learning_rate": 1.959507235901238e-05, + "loss": 2.3234, + "mean_token_accuracy": 0.5020381212234497, + "num_tokens": 1162827153.0, + "step": 2275 + }, + { + "epoch": 0.6154678204434829, + "grad_norm": 1.660930871963501, + "learning_rate": 1.9594604672125405e-05, + "loss": 2.2596, + "mean_token_accuracy": 0.5520120859146118, + "num_tokens": 1163286058.0, + "step": 2276 + }, + { + "epoch": 0.6157382368848026, + "grad_norm": 1.5917731523513794, + "learning_rate": 1.9594136721530685e-05, + "loss": 2.264, + "mean_token_accuracy": 0.5144789814949036, + "num_tokens": 1163704896.0, + "step": 2277 + }, + { + "epoch": 0.6160086533261222, + "grad_norm": 1.4607775211334229, + "learning_rate": 1.959366850724257e-05, + "loss": 2.1939, + "mean_token_accuracy": 0.5205271244049072, + "num_tokens": 1164229104.0, + "step": 2278 + }, + { + "epoch": 0.6162790697674418, + "grad_norm": 1.5540659427642822, + "learning_rate": 1.959320002927544e-05, + "loss": 2.1197, + "mean_token_accuracy": 0.5172778367996216, + "num_tokens": 1164734465.0, + "step": 2279 + }, + { + "epoch": 0.6165494862087615, + "grad_norm": 1.4605554342269897, + "learning_rate": 1.959273128764366e-05, + "loss": 2.1805, + "mean_token_accuracy": 0.5207945108413696, + "num_tokens": 1165258647.0, + "step": 2280 + }, + { + "epoch": 0.6168199026500811, + "grad_norm": 0.9108085036277771, + "learning_rate": 1.9592262282361613e-05, + "loss": 1.1932, + "mean_token_accuracy": 0.6874415874481201, + "num_tokens": 1165732816.0, + "step": 2281 + }, + { + "epoch": 0.6170903190914008, + "grad_norm": 2.6087381839752197, + "learning_rate": 1.9591793013443695e-05, + "loss": 2.2905, + "mean_token_accuracy": 0.506901741027832, + "num_tokens": 1166257088.0, + "step": 2282 + }, + { + "epoch": 0.6173607355327204, + "grad_norm": 2.35654354095459, + "learning_rate": 1.9591323480904294e-05, + "loss": 2.4745, + "mean_token_accuracy": 0.47732990980148315, + "num_tokens": 1166781285.0, + "step": 2283 + }, + { + "epoch": 0.6176311519740401, + "grad_norm": 1.43394935131073, + "learning_rate": 1.959085368475783e-05, + "loss": 2.2435, + "mean_token_accuracy": 0.5765019655227661, + "num_tokens": 1167240622.0, + "step": 2284 + }, + { + "epoch": 0.6179015684153597, + "grad_norm": 1.8867835998535156, + "learning_rate": 1.9590383625018708e-05, + "loss": 2.3015, + "mean_token_accuracy": 0.5047060251235962, + "num_tokens": 1167764867.0, + "step": 2285 + }, + { + "epoch": 0.6181719848566792, + "grad_norm": 1.5546678304672241, + "learning_rate": 1.9589913301701354e-05, + "loss": 2.2073, + "mean_token_accuracy": 0.5375704765319824, + "num_tokens": 1168234512.0, + "step": 2286 + }, + { + "epoch": 0.6184424012979989, + "grad_norm": 1.4475388526916504, + "learning_rate": 1.95894427148202e-05, + "loss": 2.2567, + "mean_token_accuracy": 0.497481107711792, + "num_tokens": 1168758701.0, + "step": 2287 + }, + { + "epoch": 0.6187128177393185, + "grad_norm": 1.4527335166931152, + "learning_rate": 1.9588971864389678e-05, + "loss": 2.2098, + "mean_token_accuracy": 0.5237371921539307, + "num_tokens": 1169241990.0, + "step": 2288 + }, + { + "epoch": 0.6189832341806382, + "grad_norm": 1.734701156616211, + "learning_rate": 1.9588500750424243e-05, + "loss": 2.2534, + "mean_token_accuracy": 0.5146498680114746, + "num_tokens": 1169711522.0, + "step": 2289 + }, + { + "epoch": 0.6192536506219578, + "grad_norm": 1.7428010702133179, + "learning_rate": 1.958802937293835e-05, + "loss": 2.1172, + "mean_token_accuracy": 0.529864490032196, + "num_tokens": 1170235741.0, + "step": 2290 + }, + { + "epoch": 0.6195240670632775, + "grad_norm": 1.7308231592178345, + "learning_rate": 1.958755773194646e-05, + "loss": 2.1896, + "mean_token_accuracy": 0.5220353007316589, + "num_tokens": 1170759849.0, + "step": 2291 + }, + { + "epoch": 0.6197944835045971, + "grad_norm": 1.5479631423950195, + "learning_rate": 1.958708582746304e-05, + "loss": 2.1668, + "mean_token_accuracy": 0.509124755859375, + "num_tokens": 1171284073.0, + "step": 2292 + }, + { + "epoch": 0.6200648999459167, + "grad_norm": 1.7488696575164795, + "learning_rate": 1.958661365950258e-05, + "loss": 2.3575, + "mean_token_accuracy": 0.49877482652664185, + "num_tokens": 1171781474.0, + "step": 2293 + }, + { + "epoch": 0.6203353163872364, + "grad_norm": 1.37052321434021, + "learning_rate": 1.958614122807956e-05, + "loss": 2.2316, + "mean_token_accuracy": 0.5047565698623657, + "num_tokens": 1172305583.0, + "step": 2294 + }, + { + "epoch": 0.620605732828556, + "grad_norm": 1.374914288520813, + "learning_rate": 1.9585668533208477e-05, + "loss": 2.2015, + "mean_token_accuracy": 0.5211240649223328, + "num_tokens": 1172829853.0, + "step": 2295 + }, + { + "epoch": 0.6208761492698756, + "grad_norm": 1.746201515197754, + "learning_rate": 1.9585195574903833e-05, + "loss": 2.1275, + "mean_token_accuracy": 0.5294758677482605, + "num_tokens": 1173334488.0, + "step": 2296 + }, + { + "epoch": 0.6211465657111952, + "grad_norm": 1.495637059211731, + "learning_rate": 1.958472235318014e-05, + "loss": 2.1659, + "mean_token_accuracy": 0.5208809971809387, + "num_tokens": 1173858716.0, + "step": 2297 + }, + { + "epoch": 0.6214169821525148, + "grad_norm": 1.3510535955429077, + "learning_rate": 1.958424886805192e-05, + "loss": 2.1335, + "mean_token_accuracy": 0.5281020402908325, + "num_tokens": 1174382905.0, + "step": 2298 + }, + { + "epoch": 0.6216873985938345, + "grad_norm": 1.3608641624450684, + "learning_rate": 1.95837751195337e-05, + "loss": 2.241, + "mean_token_accuracy": 0.5203844308853149, + "num_tokens": 1174876497.0, + "step": 2299 + }, + { + "epoch": 0.6219578150351541, + "grad_norm": 1.7951207160949707, + "learning_rate": 1.9583301107640017e-05, + "loss": 2.4294, + "mean_token_accuracy": 0.48997724056243896, + "num_tokens": 1175400670.0, + "step": 2300 + }, + { + "epoch": 0.6222282314764738, + "grad_norm": 0.7984327673912048, + "learning_rate": 1.9582826832385416e-05, + "loss": 1.2, + "mean_token_accuracy": 0.6840595006942749, + "num_tokens": 1175924950.0, + "step": 2301 + }, + { + "epoch": 0.6224986479177934, + "grad_norm": 2.122175455093384, + "learning_rate": 1.9582352293784446e-05, + "loss": 2.2292, + "mean_token_accuracy": 0.5163912773132324, + "num_tokens": 1176414080.0, + "step": 2302 + }, + { + "epoch": 0.6227690643591131, + "grad_norm": 2.055659055709839, + "learning_rate": 1.9581877491851666e-05, + "loss": 1.8536, + "mean_token_accuracy": 0.601794958114624, + "num_tokens": 1176938353.0, + "step": 2303 + }, + { + "epoch": 0.6230394808004327, + "grad_norm": 1.7001456022262573, + "learning_rate": 1.9581402426601653e-05, + "loss": 2.1179, + "mean_token_accuracy": 0.5123218894004822, + "num_tokens": 1177462537.0, + "step": 2304 + }, + { + "epoch": 0.6233098972417523, + "grad_norm": 1.4628773927688599, + "learning_rate": 1.958092709804897e-05, + "loss": 2.3228, + "mean_token_accuracy": 0.4993622303009033, + "num_tokens": 1177986769.0, + "step": 2305 + }, + { + "epoch": 0.623580313683072, + "grad_norm": 1.6728383302688599, + "learning_rate": 1.9580451506208213e-05, + "loss": 2.3116, + "mean_token_accuracy": 0.5096919536590576, + "num_tokens": 1178504384.0, + "step": 2306 + }, + { + "epoch": 0.6238507301243915, + "grad_norm": 1.6116288900375366, + "learning_rate": 1.957997565109397e-05, + "loss": 2.2574, + "mean_token_accuracy": 0.4969281554222107, + "num_tokens": 1178978373.0, + "step": 2307 + }, + { + "epoch": 0.6241211465657112, + "grad_norm": 1.5616093873977661, + "learning_rate": 1.9579499532720842e-05, + "loss": 2.1575, + "mean_token_accuracy": 0.5075238943099976, + "num_tokens": 1179502548.0, + "step": 2308 + }, + { + "epoch": 0.6243915630070308, + "grad_norm": 1.8894202709197998, + "learning_rate": 1.9579023151103437e-05, + "loss": 2.3018, + "mean_token_accuracy": 0.519980788230896, + "num_tokens": 1180026626.0, + "step": 2309 + }, + { + "epoch": 0.6246619794483504, + "grad_norm": 1.8250892162322998, + "learning_rate": 1.957854650625637e-05, + "loss": 2.1962, + "mean_token_accuracy": 0.5541219115257263, + "num_tokens": 1180518225.0, + "step": 2310 + }, + { + "epoch": 0.6249323958896701, + "grad_norm": 1.8455939292907715, + "learning_rate": 1.9578069598194274e-05, + "loss": 2.3844, + "mean_token_accuracy": 0.48516640067100525, + "num_tokens": 1181042487.0, + "step": 2311 + }, + { + "epoch": 0.6252028123309897, + "grad_norm": 1.6073983907699585, + "learning_rate": 1.957759242693177e-05, + "loss": 2.3576, + "mean_token_accuracy": 0.49247920513153076, + "num_tokens": 1181566685.0, + "step": 2312 + }, + { + "epoch": 0.6254732287723094, + "grad_norm": 1.6660051345825195, + "learning_rate": 1.957711499248351e-05, + "loss": 2.4058, + "mean_token_accuracy": 0.4955936670303345, + "num_tokens": 1182090961.0, + "step": 2313 + }, + { + "epoch": 0.625743645213629, + "grad_norm": 1.679959774017334, + "learning_rate": 1.957663729486414e-05, + "loss": 2.1414, + "mean_token_accuracy": 0.5308755040168762, + "num_tokens": 1182615121.0, + "step": 2314 + }, + { + "epoch": 0.6260140616549487, + "grad_norm": 1.385839581489563, + "learning_rate": 1.957615933408831e-05, + "loss": 2.3145, + "mean_token_accuracy": 0.5008769035339355, + "num_tokens": 1183139310.0, + "step": 2315 + }, + { + "epoch": 0.6262844780962683, + "grad_norm": 1.6250423192977905, + "learning_rate": 1.9575681110170693e-05, + "loss": 2.1426, + "mean_token_accuracy": 0.5262635350227356, + "num_tokens": 1183663545.0, + "step": 2316 + }, + { + "epoch": 0.6265548945375878, + "grad_norm": 1.3711636066436768, + "learning_rate": 1.957520262312596e-05, + "loss": 2.2102, + "mean_token_accuracy": 0.5201876163482666, + "num_tokens": 1184172832.0, + "step": 2317 + }, + { + "epoch": 0.6268253109789075, + "grad_norm": 1.372853398323059, + "learning_rate": 1.957472387296879e-05, + "loss": 2.222, + "mean_token_accuracy": 0.5124978423118591, + "num_tokens": 1184697094.0, + "step": 2318 + }, + { + "epoch": 0.6270957274202271, + "grad_norm": 1.373687505722046, + "learning_rate": 1.957424485971388e-05, + "loss": 2.258, + "mean_token_accuracy": 0.49525490403175354, + "num_tokens": 1185221247.0, + "step": 2319 + }, + { + "epoch": 0.6273661438615468, + "grad_norm": 1.4268909692764282, + "learning_rate": 1.957376558337592e-05, + "loss": 2.2856, + "mean_token_accuracy": 0.5379438996315002, + "num_tokens": 1185683886.0, + "step": 2320 + }, + { + "epoch": 0.6276365603028664, + "grad_norm": 1.1276419162750244, + "learning_rate": 1.9573286043969622e-05, + "loss": 1.1988, + "mean_token_accuracy": 0.6837528944015503, + "num_tokens": 1186208005.0, + "step": 2321 + }, + { + "epoch": 0.627906976744186, + "grad_norm": 2.54872465133667, + "learning_rate": 1.957280624150969e-05, + "loss": 2.1734, + "mean_token_accuracy": 0.5045062899589539, + "num_tokens": 1186732198.0, + "step": 2322 + }, + { + "epoch": 0.6281773931855057, + "grad_norm": 1.8420668840408325, + "learning_rate": 1.957232617601085e-05, + "loss": 2.1727, + "mean_token_accuracy": 0.516119658946991, + "num_tokens": 1187176726.0, + "step": 2323 + }, + { + "epoch": 0.6284478096268253, + "grad_norm": 1.7007256746292114, + "learning_rate": 1.957184584748784e-05, + "loss": 2.3036, + "mean_token_accuracy": 0.49961376190185547, + "num_tokens": 1187683401.0, + "step": 2324 + }, + { + "epoch": 0.628718226068145, + "grad_norm": 1.9724334478378296, + "learning_rate": 1.9571365255955387e-05, + "loss": 2.2597, + "mean_token_accuracy": 0.5082135200500488, + "num_tokens": 1188186123.0, + "step": 2325 + }, + { + "epoch": 0.6289886425094646, + "grad_norm": 2.1272411346435547, + "learning_rate": 1.9570884401428242e-05, + "loss": 2.3967, + "mean_token_accuracy": 0.49826788902282715, + "num_tokens": 1188710394.0, + "step": 2326 + }, + { + "epoch": 0.6292590589507842, + "grad_norm": 1.5158222913742065, + "learning_rate": 1.957040328392116e-05, + "loss": 2.1691, + "mean_token_accuracy": 0.5240480899810791, + "num_tokens": 1189169890.0, + "step": 2327 + }, + { + "epoch": 0.6295294753921038, + "grad_norm": 2.185887098312378, + "learning_rate": 1.95699219034489e-05, + "loss": 2.1509, + "mean_token_accuracy": 0.5229135751724243, + "num_tokens": 1189694155.0, + "step": 2328 + }, + { + "epoch": 0.6297998918334234, + "grad_norm": 1.8336373567581177, + "learning_rate": 1.9569440260026235e-05, + "loss": 2.1483, + "mean_token_accuracy": 0.532240092754364, + "num_tokens": 1190189500.0, + "step": 2329 + }, + { + "epoch": 0.6300703082747431, + "grad_norm": 1.7607754468917847, + "learning_rate": 1.9568958353667938e-05, + "loss": 2.4048, + "mean_token_accuracy": 0.46637678146362305, + "num_tokens": 1190663040.0, + "step": 2330 + }, + { + "epoch": 0.6303407247160627, + "grad_norm": 2.364777088165283, + "learning_rate": 1.95684761843888e-05, + "loss": 2.2402, + "mean_token_accuracy": 0.5275207757949829, + "num_tokens": 1191143677.0, + "step": 2331 + }, + { + "epoch": 0.6306111411573824, + "grad_norm": 1.9587124586105347, + "learning_rate": 1.9567993752203618e-05, + "loss": 2.2239, + "mean_token_accuracy": 0.5251876711845398, + "num_tokens": 1191667962.0, + "step": 2332 + }, + { + "epoch": 0.630881557598702, + "grad_norm": 1.9197723865509033, + "learning_rate": 1.9567511057127187e-05, + "loss": 2.3081, + "mean_token_accuracy": 0.5078310966491699, + "num_tokens": 1192192193.0, + "step": 2333 + }, + { + "epoch": 0.6311519740400217, + "grad_norm": 1.8735945224761963, + "learning_rate": 1.9567028099174325e-05, + "loss": 2.3006, + "mean_token_accuracy": 0.515984833240509, + "num_tokens": 1192716411.0, + "step": 2334 + }, + { + "epoch": 0.6314223904813413, + "grad_norm": 1.8870770931243896, + "learning_rate": 1.9566544878359844e-05, + "loss": 2.3643, + "mean_token_accuracy": 0.4907890260219574, + "num_tokens": 1193240686.0, + "step": 2335 + }, + { + "epoch": 0.6316928069226609, + "grad_norm": 1.668662190437317, + "learning_rate": 1.9566061394698572e-05, + "loss": 2.2008, + "mean_token_accuracy": 0.5217400789260864, + "num_tokens": 1193764799.0, + "step": 2336 + }, + { + "epoch": 0.6319632233639805, + "grad_norm": 1.4672185182571411, + "learning_rate": 1.956557764820535e-05, + "loss": 2.2836, + "mean_token_accuracy": 0.5039830207824707, + "num_tokens": 1194265132.0, + "step": 2337 + }, + { + "epoch": 0.6322336398053001, + "grad_norm": 1.7468312978744507, + "learning_rate": 1.9565093638895015e-05, + "loss": 2.2937, + "mean_token_accuracy": 0.5092531442642212, + "num_tokens": 1194789210.0, + "step": 2338 + }, + { + "epoch": 0.6325040562466198, + "grad_norm": 1.3979036808013916, + "learning_rate": 1.9564609366782417e-05, + "loss": 2.1942, + "mean_token_accuracy": 0.5262122750282288, + "num_tokens": 1195313415.0, + "step": 2339 + }, + { + "epoch": 0.6327744726879394, + "grad_norm": 1.7447583675384521, + "learning_rate": 1.9564124831882418e-05, + "loss": 2.1769, + "mean_token_accuracy": 0.5093231201171875, + "num_tokens": 1195837561.0, + "step": 2340 + }, + { + "epoch": 0.633044889129259, + "grad_norm": 0.672813892364502, + "learning_rate": 1.956364003420988e-05, + "loss": 1.1313, + "mean_token_accuracy": 0.6988394856452942, + "num_tokens": 1196361829.0, + "step": 2341 + }, + { + "epoch": 0.6333153055705787, + "grad_norm": 2.142737627029419, + "learning_rate": 1.9563154973779685e-05, + "loss": 2.0699, + "mean_token_accuracy": 0.5321338176727295, + "num_tokens": 1196819009.0, + "step": 2342 + }, + { + "epoch": 0.6335857220118983, + "grad_norm": 1.7178665399551392, + "learning_rate": 1.9562669650606715e-05, + "loss": 2.205, + "mean_token_accuracy": 0.5107149481773376, + "num_tokens": 1197343249.0, + "step": 2343 + }, + { + "epoch": 0.633856138453218, + "grad_norm": 1.8962396383285522, + "learning_rate": 1.9562184064705857e-05, + "loss": 2.3898, + "mean_token_accuracy": 0.48150119185447693, + "num_tokens": 1197838806.0, + "step": 2344 + }, + { + "epoch": 0.6341265548945376, + "grad_norm": 1.6054141521453857, + "learning_rate": 1.956169821609201e-05, + "loss": 2.2272, + "mean_token_accuracy": 0.5184717178344727, + "num_tokens": 1198362975.0, + "step": 2345 + }, + { + "epoch": 0.6343969713358573, + "grad_norm": 1.5695880651474, + "learning_rate": 1.9561212104780084e-05, + "loss": 2.3063, + "mean_token_accuracy": 0.5181819200515747, + "num_tokens": 1198887259.0, + "step": 2346 + }, + { + "epoch": 0.6346673877771769, + "grad_norm": 1.5303210020065308, + "learning_rate": 1.9560725730784996e-05, + "loss": 2.128, + "mean_token_accuracy": 0.5338824987411499, + "num_tokens": 1199285445.0, + "step": 2347 + }, + { + "epoch": 0.6349378042184964, + "grad_norm": 1.373775601387024, + "learning_rate": 1.9560239094121667e-05, + "loss": 2.1373, + "mean_token_accuracy": 0.5128400325775146, + "num_tokens": 1199809713.0, + "step": 2348 + }, + { + "epoch": 0.6352082206598161, + "grad_norm": 1.726205587387085, + "learning_rate": 1.9559752194805026e-05, + "loss": 2.2083, + "mean_token_accuracy": 0.5113276839256287, + "num_tokens": 1200333827.0, + "step": 2349 + }, + { + "epoch": 0.6354786371011357, + "grad_norm": 1.6851204633712769, + "learning_rate": 1.9559265032850016e-05, + "loss": 2.24, + "mean_token_accuracy": 0.500458836555481, + "num_tokens": 1200857999.0, + "step": 2350 + }, + { + "epoch": 0.6357490535424554, + "grad_norm": 1.7378119230270386, + "learning_rate": 1.9558777608271585e-05, + "loss": 2.0903, + "mean_token_accuracy": 0.5420852899551392, + "num_tokens": 1201382208.0, + "step": 2351 + }, + { + "epoch": 0.636019469983775, + "grad_norm": 1.3549363613128662, + "learning_rate": 1.9558289921084686e-05, + "loss": 2.2769, + "mean_token_accuracy": 0.5035175085067749, + "num_tokens": 1201906430.0, + "step": 2352 + }, + { + "epoch": 0.6362898864250947, + "grad_norm": 1.4802194833755493, + "learning_rate": 1.9557801971304285e-05, + "loss": 2.2766, + "mean_token_accuracy": 0.4958682656288147, + "num_tokens": 1202430581.0, + "step": 2353 + }, + { + "epoch": 0.6365603028664143, + "grad_norm": 1.440037488937378, + "learning_rate": 1.955731375894535e-05, + "loss": 2.3025, + "mean_token_accuracy": 0.5165510773658752, + "num_tokens": 1202923333.0, + "step": 2354 + }, + { + "epoch": 0.6368307193077339, + "grad_norm": 1.877758264541626, + "learning_rate": 1.9556825284022867e-05, + "loss": 2.2459, + "mean_token_accuracy": 0.5074676275253296, + "num_tokens": 1203399907.0, + "step": 2355 + }, + { + "epoch": 0.6371011357490536, + "grad_norm": 1.7620556354522705, + "learning_rate": 1.955633654655182e-05, + "loss": 2.2942, + "mean_token_accuracy": 0.5066260099411011, + "num_tokens": 1203924012.0, + "step": 2356 + }, + { + "epoch": 0.6373715521903732, + "grad_norm": 1.7868595123291016, + "learning_rate": 1.9555847546547205e-05, + "loss": 2.2123, + "mean_token_accuracy": 0.5173531770706177, + "num_tokens": 1204410517.0, + "step": 2357 + }, + { + "epoch": 0.6376419686316928, + "grad_norm": 1.7937077283859253, + "learning_rate": 1.9555358284024023e-05, + "loss": 2.2065, + "mean_token_accuracy": 0.49952226877212524, + "num_tokens": 1204933983.0, + "step": 2358 + }, + { + "epoch": 0.6379123850730124, + "grad_norm": 1.6351351737976074, + "learning_rate": 1.955486875899729e-05, + "loss": 2.3007, + "mean_token_accuracy": 0.5006482005119324, + "num_tokens": 1205458202.0, + "step": 2359 + }, + { + "epoch": 0.638182801514332, + "grad_norm": 1.2974252700805664, + "learning_rate": 1.955437897148203e-05, + "loss": 2.2116, + "mean_token_accuracy": 0.5194945335388184, + "num_tokens": 1205972446.0, + "step": 2360 + }, + { + "epoch": 0.6384532179556517, + "grad_norm": 0.8901383876800537, + "learning_rate": 1.955388892149326e-05, + "loss": 1.202, + "mean_token_accuracy": 0.6900192499160767, + "num_tokens": 1206496602.0, + "step": 2361 + }, + { + "epoch": 0.6387236343969713, + "grad_norm": 2.3375799655914307, + "learning_rate": 1.9553398609046026e-05, + "loss": 2.1735, + "mean_token_accuracy": 0.5241838693618774, + "num_tokens": 1207020692.0, + "step": 2362 + }, + { + "epoch": 0.638994050838291, + "grad_norm": 1.4545273780822754, + "learning_rate": 1.955290803415537e-05, + "loss": 2.0199, + "mean_token_accuracy": 0.5401423573493958, + "num_tokens": 1207544948.0, + "step": 2363 + }, + { + "epoch": 0.6392644672796106, + "grad_norm": 2.034656286239624, + "learning_rate": 1.955241719683634e-05, + "loss": 2.296, + "mean_token_accuracy": 0.5130549669265747, + "num_tokens": 1208069166.0, + "step": 2364 + }, + { + "epoch": 0.6395348837209303, + "grad_norm": 1.9673175811767578, + "learning_rate": 1.9551926097104002e-05, + "loss": 2.1779, + "mean_token_accuracy": 0.5019335746765137, + "num_tokens": 1208593348.0, + "step": 2365 + }, + { + "epoch": 0.6398053001622499, + "grad_norm": 1.6694934368133545, + "learning_rate": 1.955143473497342e-05, + "loss": 2.2016, + "mean_token_accuracy": 0.5406278371810913, + "num_tokens": 1209117367.0, + "step": 2366 + }, + { + "epoch": 0.6400757166035695, + "grad_norm": 2.442293405532837, + "learning_rate": 1.955094311045967e-05, + "loss": 2.2789, + "mean_token_accuracy": 0.5263438820838928, + "num_tokens": 1209527993.0, + "step": 2367 + }, + { + "epoch": 0.6403461330448891, + "grad_norm": 2.345855951309204, + "learning_rate": 1.955045122357784e-05, + "loss": 2.2692, + "mean_token_accuracy": 0.4946502149105072, + "num_tokens": 1210017958.0, + "step": 2368 + }, + { + "epoch": 0.6406165494862087, + "grad_norm": 1.639491081237793, + "learning_rate": 1.9549959074343027e-05, + "loss": 2.1882, + "mean_token_accuracy": 0.5153670310974121, + "num_tokens": 1210542123.0, + "step": 2369 + }, + { + "epoch": 0.6408869659275284, + "grad_norm": 1.769963026046753, + "learning_rate": 1.954946666277032e-05, + "loss": 2.2133, + "mean_token_accuracy": 0.5320063233375549, + "num_tokens": 1211066327.0, + "step": 2370 + }, + { + "epoch": 0.641157382368848, + "grad_norm": 1.7622196674346924, + "learning_rate": 1.9548973988874836e-05, + "loss": 2.1733, + "mean_token_accuracy": 0.513433039188385, + "num_tokens": 1211590581.0, + "step": 2371 + }, + { + "epoch": 0.6414277988101676, + "grad_norm": 1.838773250579834, + "learning_rate": 1.9548481052671686e-05, + "loss": 2.1366, + "mean_token_accuracy": 0.48909899592399597, + "num_tokens": 1212114668.0, + "step": 2372 + }, + { + "epoch": 0.6416982152514873, + "grad_norm": 2.3721280097961426, + "learning_rate": 1.9547987854176e-05, + "loss": 2.353, + "mean_token_accuracy": 0.5074297785758972, + "num_tokens": 1212638758.0, + "step": 2373 + }, + { + "epoch": 0.6419686316928069, + "grad_norm": 1.9219216108322144, + "learning_rate": 1.954749439340291e-05, + "loss": 2.2611, + "mean_token_accuracy": 0.5082383751869202, + "num_tokens": 1213163040.0, + "step": 2374 + }, + { + "epoch": 0.6422390481341266, + "grad_norm": 2.141319990158081, + "learning_rate": 1.9547000670367554e-05, + "loss": 2.1831, + "mean_token_accuracy": 0.5198296308517456, + "num_tokens": 1213687251.0, + "step": 2375 + }, + { + "epoch": 0.6425094645754462, + "grad_norm": 1.4744013547897339, + "learning_rate": 1.9546506685085082e-05, + "loss": 2.2179, + "mean_token_accuracy": 0.4974426031112671, + "num_tokens": 1214211538.0, + "step": 2376 + }, + { + "epoch": 0.6427798810167659, + "grad_norm": 2.0638387203216553, + "learning_rate": 1.954601243757065e-05, + "loss": 2.4612, + "mean_token_accuracy": 0.48419955372810364, + "num_tokens": 1214702973.0, + "step": 2377 + }, + { + "epoch": 0.6430502974580855, + "grad_norm": 2.0958075523376465, + "learning_rate": 1.9545517927839427e-05, + "loss": 2.1825, + "mean_token_accuracy": 0.5187256336212158, + "num_tokens": 1215227092.0, + "step": 2378 + }, + { + "epoch": 0.643320713899405, + "grad_norm": 1.2561415433883667, + "learning_rate": 1.954502315590658e-05, + "loss": 2.1943, + "mean_token_accuracy": 0.512001097202301, + "num_tokens": 1215751378.0, + "step": 2379 + }, + { + "epoch": 0.6435911303407247, + "grad_norm": 2.6819167137145996, + "learning_rate": 1.9544528121787297e-05, + "loss": 2.2876, + "mean_token_accuracy": 0.5084062218666077, + "num_tokens": 1216238362.0, + "step": 2380 + }, + { + "epoch": 0.6438615467820443, + "grad_norm": 0.8877201080322266, + "learning_rate": 1.954403282549676e-05, + "loss": 1.2014, + "mean_token_accuracy": 0.6878361701965332, + "num_tokens": 1216712013.0, + "step": 2381 + }, + { + "epoch": 0.644131963223364, + "grad_norm": 2.1454062461853027, + "learning_rate": 1.954353726705017e-05, + "loss": 2.3029, + "mean_token_accuracy": 0.49773287773132324, + "num_tokens": 1217236244.0, + "step": 2382 + }, + { + "epoch": 0.6444023796646836, + "grad_norm": 1.8549097776412964, + "learning_rate": 1.9543041446462736e-05, + "loss": 2.3467, + "mean_token_accuracy": 0.49492600560188293, + "num_tokens": 1217760452.0, + "step": 2383 + }, + { + "epoch": 0.6446727961060033, + "grad_norm": 1.89705228805542, + "learning_rate": 1.9542545363749664e-05, + "loss": 2.2348, + "mean_token_accuracy": 0.5050568580627441, + "num_tokens": 1218284656.0, + "step": 2384 + }, + { + "epoch": 0.6449432125473229, + "grad_norm": 1.8530943393707275, + "learning_rate": 1.9542049018926176e-05, + "loss": 2.3218, + "mean_token_accuracy": 0.48991650342941284, + "num_tokens": 1218808920.0, + "step": 2385 + }, + { + "epoch": 0.6452136289886425, + "grad_norm": 1.9652118682861328, + "learning_rate": 1.9541552412007506e-05, + "loss": 2.3443, + "mean_token_accuracy": 0.5108270645141602, + "num_tokens": 1219333189.0, + "step": 2386 + }, + { + "epoch": 0.6454840454299622, + "grad_norm": 2.110712766647339, + "learning_rate": 1.954105554300889e-05, + "loss": 2.1638, + "mean_token_accuracy": 0.5200259685516357, + "num_tokens": 1219857422.0, + "step": 2387 + }, + { + "epoch": 0.6457544618712818, + "grad_norm": 1.9904441833496094, + "learning_rate": 1.9540558411945572e-05, + "loss": 2.1284, + "mean_token_accuracy": 0.5291985273361206, + "num_tokens": 1220322949.0, + "step": 2388 + }, + { + "epoch": 0.6460248783126014, + "grad_norm": 2.271941661834717, + "learning_rate": 1.9540061018832805e-05, + "loss": 2.226, + "mean_token_accuracy": 0.5192971229553223, + "num_tokens": 1220847205.0, + "step": 2389 + }, + { + "epoch": 0.646295294753921, + "grad_norm": 2.0666282176971436, + "learning_rate": 1.9539563363685848e-05, + "loss": 2.3001, + "mean_token_accuracy": 0.5259853005409241, + "num_tokens": 1221311597.0, + "step": 2390 + }, + { + "epoch": 0.6465657111952406, + "grad_norm": 1.622624158859253, + "learning_rate": 1.953906544651998e-05, + "loss": 2.1879, + "mean_token_accuracy": 0.5117869973182678, + "num_tokens": 1221835684.0, + "step": 2391 + }, + { + "epoch": 0.6468361276365603, + "grad_norm": 1.7822434902191162, + "learning_rate": 1.953856726735047e-05, + "loss": 2.3451, + "mean_token_accuracy": 0.5005882978439331, + "num_tokens": 1222359854.0, + "step": 2392 + }, + { + "epoch": 0.6471065440778799, + "grad_norm": 1.5669960975646973, + "learning_rate": 1.9538068826192604e-05, + "loss": 2.189, + "mean_token_accuracy": 0.5176808834075928, + "num_tokens": 1222884085.0, + "step": 2393 + }, + { + "epoch": 0.6473769605191996, + "grad_norm": 2.228469133377075, + "learning_rate": 1.953757012306168e-05, + "loss": 2.2919, + "mean_token_accuracy": 0.5314958095550537, + "num_tokens": 1223333149.0, + "step": 2394 + }, + { + "epoch": 0.6476473769605192, + "grad_norm": 1.8285918235778809, + "learning_rate": 1.9537071157972994e-05, + "loss": 2.0941, + "mean_token_accuracy": 0.5294219255447388, + "num_tokens": 1223800107.0, + "step": 2395 + }, + { + "epoch": 0.6479177934018389, + "grad_norm": 1.8498567342758179, + "learning_rate": 1.953657193094186e-05, + "loss": 2.002, + "mean_token_accuracy": 0.5620449781417847, + "num_tokens": 1224324215.0, + "step": 2396 + }, + { + "epoch": 0.6481882098431585, + "grad_norm": 1.9135141372680664, + "learning_rate": 1.95360724419836e-05, + "loss": 2.1386, + "mean_token_accuracy": 0.5443530678749084, + "num_tokens": 1224848382.0, + "step": 2397 + }, + { + "epoch": 0.6484586262844781, + "grad_norm": 1.5766323804855347, + "learning_rate": 1.9535572691113528e-05, + "loss": 2.1424, + "mean_token_accuracy": 0.5238580703735352, + "num_tokens": 1225372536.0, + "step": 2398 + }, + { + "epoch": 0.6487290427257977, + "grad_norm": 1.7830613851547241, + "learning_rate": 1.9535072678346985e-05, + "loss": 2.2947, + "mean_token_accuracy": 0.5043573379516602, + "num_tokens": 1225890515.0, + "step": 2399 + }, + { + "epoch": 0.6489994591671173, + "grad_norm": 1.727488398551941, + "learning_rate": 1.9534572403699316e-05, + "loss": 2.1942, + "mean_token_accuracy": 0.5183591842651367, + "num_tokens": 1226387081.0, + "step": 2400 + }, + { + "epoch": 0.649269875608437, + "grad_norm": 0.7435846328735352, + "learning_rate": 1.9534071867185864e-05, + "loss": 1.1522, + "mean_token_accuracy": 0.6963457465171814, + "num_tokens": 1226911261.0, + "step": 2401 + }, + { + "epoch": 0.6495402920497566, + "grad_norm": 2.530167818069458, + "learning_rate": 1.953357106882199e-05, + "loss": 2.2261, + "mean_token_accuracy": 0.5072118043899536, + "num_tokens": 1227435405.0, + "step": 2402 + }, + { + "epoch": 0.6498107084910762, + "grad_norm": 2.155280828475952, + "learning_rate": 1.9533070008623062e-05, + "loss": 2.2409, + "mean_token_accuracy": 0.5084052085876465, + "num_tokens": 1227959591.0, + "step": 2403 + }, + { + "epoch": 0.6500811249323959, + "grad_norm": 1.259997010231018, + "learning_rate": 1.953256868660445e-05, + "loss": 2.3067, + "mean_token_accuracy": 0.5091980695724487, + "num_tokens": 1228460121.0, + "step": 2404 + }, + { + "epoch": 0.6503515413737155, + "grad_norm": 1.854706048965454, + "learning_rate": 1.953206710278154e-05, + "loss": 2.2056, + "mean_token_accuracy": 0.5122607946395874, + "num_tokens": 1228984278.0, + "step": 2405 + }, + { + "epoch": 0.6506219578150352, + "grad_norm": 1.7191033363342285, + "learning_rate": 1.9531565257169718e-05, + "loss": 2.1716, + "mean_token_accuracy": 0.5404027700424194, + "num_tokens": 1229508474.0, + "step": 2406 + }, + { + "epoch": 0.6508923742563548, + "grad_norm": 1.731847882270813, + "learning_rate": 1.9531063149784386e-05, + "loss": 2.0183, + "mean_token_accuracy": 0.5270836353302002, + "num_tokens": 1230021649.0, + "step": 2407 + }, + { + "epoch": 0.6511627906976745, + "grad_norm": 1.9173678159713745, + "learning_rate": 1.953056078064095e-05, + "loss": 2.1869, + "mean_token_accuracy": 0.5186412930488586, + "num_tokens": 1230545783.0, + "step": 2408 + }, + { + "epoch": 0.651433207138994, + "grad_norm": 1.7211991548538208, + "learning_rate": 1.953005814975482e-05, + "loss": 2.3632, + "mean_token_accuracy": 0.5007345080375671, + "num_tokens": 1231069887.0, + "step": 2409 + }, + { + "epoch": 0.6517036235803136, + "grad_norm": 1.8472422361373901, + "learning_rate": 1.952955525714142e-05, + "loss": 2.2866, + "mean_token_accuracy": 0.5066890120506287, + "num_tokens": 1231594165.0, + "step": 2410 + }, + { + "epoch": 0.6519740400216333, + "grad_norm": 1.7580586671829224, + "learning_rate": 1.952905210281618e-05, + "loss": 2.3867, + "mean_token_accuracy": 0.4891403913497925, + "num_tokens": 1232118239.0, + "step": 2411 + }, + { + "epoch": 0.6522444564629529, + "grad_norm": 1.5053671598434448, + "learning_rate": 1.9528548686794543e-05, + "loss": 2.2891, + "mean_token_accuracy": 0.5206046104431152, + "num_tokens": 1232594396.0, + "step": 2412 + }, + { + "epoch": 0.6525148729042726, + "grad_norm": 1.3972984552383423, + "learning_rate": 1.952804500909195e-05, + "loss": 2.2757, + "mean_token_accuracy": 0.5072444081306458, + "num_tokens": 1233118524.0, + "step": 2413 + }, + { + "epoch": 0.6527852893455922, + "grad_norm": 2.059049606323242, + "learning_rate": 1.9527541069723854e-05, + "loss": 2.0397, + "mean_token_accuracy": 0.5684123039245605, + "num_tokens": 1233642800.0, + "step": 2414 + }, + { + "epoch": 0.6530557057869119, + "grad_norm": 1.84211266040802, + "learning_rate": 1.952703686870572e-05, + "loss": 2.3171, + "mean_token_accuracy": 0.5038061738014221, + "num_tokens": 1234166988.0, + "step": 2415 + }, + { + "epoch": 0.6533261222282315, + "grad_norm": 1.872973084449768, + "learning_rate": 1.9526532406053024e-05, + "loss": 2.1301, + "mean_token_accuracy": 0.5381298065185547, + "num_tokens": 1234691155.0, + "step": 2416 + }, + { + "epoch": 0.6535965386695511, + "grad_norm": 1.4843065738677979, + "learning_rate": 1.9526027681781234e-05, + "loss": 2.2455, + "mean_token_accuracy": 0.5128355622291565, + "num_tokens": 1235215429.0, + "step": 2417 + }, + { + "epoch": 0.6538669551108708, + "grad_norm": 1.5446836948394775, + "learning_rate": 1.9525522695905845e-05, + "loss": 2.2691, + "mean_token_accuracy": 0.504883885383606, + "num_tokens": 1235739562.0, + "step": 2418 + }, + { + "epoch": 0.6541373715521904, + "grad_norm": 1.4100159406661987, + "learning_rate": 1.9525017448442346e-05, + "loss": 2.2248, + "mean_token_accuracy": 0.5157898664474487, + "num_tokens": 1236263752.0, + "step": 2419 + }, + { + "epoch": 0.65440778799351, + "grad_norm": 1.6454050540924072, + "learning_rate": 1.952451193940624e-05, + "loss": 2.3132, + "mean_token_accuracy": 0.49841949343681335, + "num_tokens": 1236788030.0, + "step": 2420 + }, + { + "epoch": 0.6546782044348296, + "grad_norm": 0.7244307398796082, + "learning_rate": 1.952400616881304e-05, + "loss": 1.1272, + "mean_token_accuracy": 0.6905021667480469, + "num_tokens": 1237312164.0, + "step": 2421 + }, + { + "epoch": 0.6549486208761492, + "grad_norm": 2.727628231048584, + "learning_rate": 1.9523500136678263e-05, + "loss": 2.1131, + "mean_token_accuracy": 0.5293101072311401, + "num_tokens": 1237836409.0, + "step": 2422 + }, + { + "epoch": 0.6552190373174689, + "grad_norm": 2.244123935699463, + "learning_rate": 1.952299384301744e-05, + "loss": 2.3373, + "mean_token_accuracy": 0.5014215707778931, + "num_tokens": 1238348566.0, + "step": 2423 + }, + { + "epoch": 0.6554894537587885, + "grad_norm": 8.409016609191895, + "learning_rate": 1.952248728784609e-05, + "loss": 1.9939, + "mean_token_accuracy": 0.5268230438232422, + "num_tokens": 1238866143.0, + "step": 2424 + }, + { + "epoch": 0.6557598702001082, + "grad_norm": 2.3933422565460205, + "learning_rate": 1.952198047117978e-05, + "loss": 2.2803, + "mean_token_accuracy": 0.5316640138626099, + "num_tokens": 1239322027.0, + "step": 2425 + }, + { + "epoch": 0.6560302866414278, + "grad_norm": 2.149813413619995, + "learning_rate": 1.952147339303404e-05, + "loss": 2.3158, + "mean_token_accuracy": 0.5048920512199402, + "num_tokens": 1239819912.0, + "step": 2426 + }, + { + "epoch": 0.6563007030827475, + "grad_norm": 1.5692899227142334, + "learning_rate": 1.952096605342444e-05, + "loss": 2.2909, + "mean_token_accuracy": 0.4845121502876282, + "num_tokens": 1240344033.0, + "step": 2427 + }, + { + "epoch": 0.6565711195240671, + "grad_norm": 2.184782028198242, + "learning_rate": 1.952045845236654e-05, + "loss": 2.3468, + "mean_token_accuracy": 0.49612852931022644, + "num_tokens": 1240868165.0, + "step": 2428 + }, + { + "epoch": 0.6568415359653867, + "grad_norm": 1.855986475944519, + "learning_rate": 1.951995058987592e-05, + "loss": 2.1084, + "mean_token_accuracy": 0.5263649225234985, + "num_tokens": 1241392448.0, + "step": 2429 + }, + { + "epoch": 0.6571119524067063, + "grad_norm": 1.810267448425293, + "learning_rate": 1.9519442465968155e-05, + "loss": 2.3149, + "mean_token_accuracy": 0.5031633973121643, + "num_tokens": 1241916695.0, + "step": 2430 + }, + { + "epoch": 0.6573823688480259, + "grad_norm": 1.8561217784881592, + "learning_rate": 1.9518934080658848e-05, + "loss": 2.2222, + "mean_token_accuracy": 0.4846216142177582, + "num_tokens": 1242440948.0, + "step": 2431 + }, + { + "epoch": 0.6576527852893456, + "grad_norm": 2.7871971130371094, + "learning_rate": 1.951842543396359e-05, + "loss": 2.3216, + "mean_token_accuracy": 0.5139203071594238, + "num_tokens": 1242961017.0, + "step": 2432 + }, + { + "epoch": 0.6579232017306652, + "grad_norm": 1.7346580028533936, + "learning_rate": 1.951791652589798e-05, + "loss": 2.2145, + "mean_token_accuracy": 0.5430433750152588, + "num_tokens": 1243485183.0, + "step": 2433 + }, + { + "epoch": 0.6581936181719849, + "grad_norm": 2.3489744663238525, + "learning_rate": 1.9517407356477654e-05, + "loss": 2.2824, + "mean_token_accuracy": 0.515036940574646, + "num_tokens": 1244009402.0, + "step": 2434 + }, + { + "epoch": 0.6584640346133045, + "grad_norm": 2.097858428955078, + "learning_rate": 1.9516897925718216e-05, + "loss": 2.2504, + "mean_token_accuracy": 0.5167456269264221, + "num_tokens": 1244533588.0, + "step": 2435 + }, + { + "epoch": 0.6587344510546241, + "grad_norm": 1.814680576324463, + "learning_rate": 1.95163882336353e-05, + "loss": 2.2123, + "mean_token_accuracy": 0.5278557538986206, + "num_tokens": 1245057803.0, + "step": 2436 + }, + { + "epoch": 0.6590048674959438, + "grad_norm": 1.6711596250534058, + "learning_rate": 1.9515878280244554e-05, + "loss": 2.3065, + "mean_token_accuracy": 0.5060222744941711, + "num_tokens": 1245582012.0, + "step": 2437 + }, + { + "epoch": 0.6592752839372634, + "grad_norm": 1.969112515449524, + "learning_rate": 1.9515368065561616e-05, + "loss": 2.2239, + "mean_token_accuracy": 0.5205227136611938, + "num_tokens": 1246106173.0, + "step": 2438 + }, + { + "epoch": 0.6595457003785831, + "grad_norm": 2.234639883041382, + "learning_rate": 1.9514857589602143e-05, + "loss": 2.2152, + "mean_token_accuracy": 0.513677716255188, + "num_tokens": 1246630354.0, + "step": 2439 + }, + { + "epoch": 0.6598161168199026, + "grad_norm": 7.818943023681641, + "learning_rate": 1.95143468523818e-05, + "loss": 1.8505, + "mean_token_accuracy": 0.5849958658218384, + "num_tokens": 1247154437.0, + "step": 2440 + }, + { + "epoch": 0.6600865332612222, + "grad_norm": 0.8474338054656982, + "learning_rate": 1.9513835853916257e-05, + "loss": 1.1854, + "mean_token_accuracy": 0.6951673030853271, + "num_tokens": 1247678674.0, + "step": 2441 + }, + { + "epoch": 0.6603569497025419, + "grad_norm": 4.677751064300537, + "learning_rate": 1.951332459422119e-05, + "loss": 2.1975, + "mean_token_accuracy": 0.5512773990631104, + "num_tokens": 1248202790.0, + "step": 2442 + }, + { + "epoch": 0.6606273661438615, + "grad_norm": 3.6739611625671387, + "learning_rate": 1.951281307331229e-05, + "loss": 2.1175, + "mean_token_accuracy": 0.5328190922737122, + "num_tokens": 1248714704.0, + "step": 2443 + }, + { + "epoch": 0.6608977825851812, + "grad_norm": 2.288902521133423, + "learning_rate": 1.951230129120525e-05, + "loss": 2.2615, + "mean_token_accuracy": 0.5085810422897339, + "num_tokens": 1249238917.0, + "step": 2444 + }, + { + "epoch": 0.6611681990265008, + "grad_norm": 2.385303497314453, + "learning_rate": 1.9511789247915775e-05, + "loss": 2.331, + "mean_token_accuracy": 0.5041607022285461, + "num_tokens": 1249760817.0, + "step": 2445 + }, + { + "epoch": 0.6614386154678205, + "grad_norm": 2.381279945373535, + "learning_rate": 1.9511276943459573e-05, + "loss": 2.2816, + "mean_token_accuracy": 0.49606937170028687, + "num_tokens": 1250285083.0, + "step": 2446 + }, + { + "epoch": 0.6617090319091401, + "grad_norm": 2.2935519218444824, + "learning_rate": 1.9510764377852363e-05, + "loss": 2.3203, + "mean_token_accuracy": 0.4960280954837799, + "num_tokens": 1250809365.0, + "step": 2447 + }, + { + "epoch": 0.6619794483504597, + "grad_norm": 2.518610954284668, + "learning_rate": 1.9510251551109876e-05, + "loss": 2.2953, + "mean_token_accuracy": 0.5114401578903198, + "num_tokens": 1251331362.0, + "step": 2448 + }, + { + "epoch": 0.6622498647917794, + "grad_norm": 2.189088821411133, + "learning_rate": 1.9509738463247843e-05, + "loss": 2.2661, + "mean_token_accuracy": 0.5343109965324402, + "num_tokens": 1251797016.0, + "step": 2449 + }, + { + "epoch": 0.6625202812330989, + "grad_norm": 2.4385488033294678, + "learning_rate": 1.9509225114282005e-05, + "loss": 2.2519, + "mean_token_accuracy": 0.520357072353363, + "num_tokens": 1252299382.0, + "step": 2450 + }, + { + "epoch": 0.6627906976744186, + "grad_norm": 2.1884267330169678, + "learning_rate": 1.950871150422812e-05, + "loss": 2.1732, + "mean_token_accuracy": 0.5274531841278076, + "num_tokens": 1252823643.0, + "step": 2451 + }, + { + "epoch": 0.6630611141157382, + "grad_norm": 1.6216984987258911, + "learning_rate": 1.9508197633101945e-05, + "loss": 2.2278, + "mean_token_accuracy": 0.49184268712997437, + "num_tokens": 1253335054.0, + "step": 2452 + }, + { + "epoch": 0.6633315305570578, + "grad_norm": 2.0110602378845215, + "learning_rate": 1.9507683500919242e-05, + "loss": 2.1405, + "mean_token_accuracy": 0.5328896045684814, + "num_tokens": 1253859236.0, + "step": 2453 + }, + { + "epoch": 0.6636019469983775, + "grad_norm": 1.8840192556381226, + "learning_rate": 1.9507169107695794e-05, + "loss": 2.235, + "mean_token_accuracy": 0.5252346992492676, + "num_tokens": 1254383523.0, + "step": 2454 + }, + { + "epoch": 0.6638723634396971, + "grad_norm": 1.7326639890670776, + "learning_rate": 1.9506654453447377e-05, + "loss": 2.2093, + "mean_token_accuracy": 0.5328905582427979, + "num_tokens": 1254863341.0, + "step": 2455 + }, + { + "epoch": 0.6641427798810168, + "grad_norm": 1.7923202514648438, + "learning_rate": 1.9506139538189786e-05, + "loss": 2.2252, + "mean_token_accuracy": 0.5260806083679199, + "num_tokens": 1255387512.0, + "step": 2456 + }, + { + "epoch": 0.6644131963223364, + "grad_norm": 1.7738174200057983, + "learning_rate": 1.950562436193882e-05, + "loss": 2.1845, + "mean_token_accuracy": 0.5184954404830933, + "num_tokens": 1255911630.0, + "step": 2457 + }, + { + "epoch": 0.6646836127636561, + "grad_norm": 1.7711124420166016, + "learning_rate": 1.9505108924710282e-05, + "loss": 2.1505, + "mean_token_accuracy": 0.52597576379776, + "num_tokens": 1256435799.0, + "step": 2458 + }, + { + "epoch": 0.6649540292049757, + "grad_norm": 1.4396233558654785, + "learning_rate": 1.9504593226519992e-05, + "loss": 2.1675, + "mean_token_accuracy": 0.5230188369750977, + "num_tokens": 1256959978.0, + "step": 2459 + }, + { + "epoch": 0.6652244456462953, + "grad_norm": 2.0004031658172607, + "learning_rate": 1.950407726738377e-05, + "loss": 2.344, + "mean_token_accuracy": 0.4928220808506012, + "num_tokens": 1257484111.0, + "step": 2460 + }, + { + "epoch": 0.6654948620876149, + "grad_norm": 0.9731616973876953, + "learning_rate": 1.9503561047317453e-05, + "loss": 1.2126, + "mean_token_accuracy": 0.6753144264221191, + "num_tokens": 1258008261.0, + "step": 2461 + }, + { + "epoch": 0.6657652785289345, + "grad_norm": 2.8548784255981445, + "learning_rate": 1.9503044566336874e-05, + "loss": 2.299, + "mean_token_accuracy": 0.5081014037132263, + "num_tokens": 1258532429.0, + "step": 2462 + }, + { + "epoch": 0.6660356949702542, + "grad_norm": 2.307786464691162, + "learning_rate": 1.9502527824457878e-05, + "loss": 2.3989, + "mean_token_accuracy": 0.4814903736114502, + "num_tokens": 1259056684.0, + "step": 2463 + }, + { + "epoch": 0.6663061114115738, + "grad_norm": 1.47743821144104, + "learning_rate": 1.9502010821696326e-05, + "loss": 2.1052, + "mean_token_accuracy": 0.519756019115448, + "num_tokens": 1259532432.0, + "step": 2464 + }, + { + "epoch": 0.6665765278528935, + "grad_norm": 1.9291733503341675, + "learning_rate": 1.950149355806808e-05, + "loss": 2.0886, + "mean_token_accuracy": 0.5315576791763306, + "num_tokens": 1260040147.0, + "step": 2465 + }, + { + "epoch": 0.6668469442942131, + "grad_norm": 2.528188705444336, + "learning_rate": 1.950097603358901e-05, + "loss": 1.7378, + "mean_token_accuracy": 0.6217724084854126, + "num_tokens": 1260518333.0, + "step": 2466 + }, + { + "epoch": 0.6671173607355327, + "grad_norm": 1.7299622297286987, + "learning_rate": 1.9500458248275e-05, + "loss": 2.2923, + "mean_token_accuracy": 0.5052632689476013, + "num_tokens": 1261035133.0, + "step": 2467 + }, + { + "epoch": 0.6673877771768524, + "grad_norm": 1.721633791923523, + "learning_rate": 1.9499940202141928e-05, + "loss": 2.2908, + "mean_token_accuracy": 0.5041437149047852, + "num_tokens": 1261559235.0, + "step": 2468 + }, + { + "epoch": 0.667658193618172, + "grad_norm": 1.5400619506835938, + "learning_rate": 1.9499421895205692e-05, + "loss": 2.2167, + "mean_token_accuracy": 0.5188296437263489, + "num_tokens": 1262083452.0, + "step": 2469 + }, + { + "epoch": 0.6679286100594917, + "grad_norm": 1.5444211959838867, + "learning_rate": 1.9498903327482198e-05, + "loss": 2.116, + "mean_token_accuracy": 0.5393229126930237, + "num_tokens": 1262607709.0, + "step": 2470 + }, + { + "epoch": 0.6681990265008112, + "grad_norm": 1.4965490102767944, + "learning_rate": 1.949838449898736e-05, + "loss": 2.1581, + "mean_token_accuracy": 0.5331366062164307, + "num_tokens": 1263131967.0, + "step": 2471 + }, + { + "epoch": 0.6684694429421308, + "grad_norm": 1.6832724809646606, + "learning_rate": 1.949786540973709e-05, + "loss": 2.2851, + "mean_token_accuracy": 0.5271751880645752, + "num_tokens": 1263577361.0, + "step": 2472 + }, + { + "epoch": 0.6687398593834505, + "grad_norm": 1.497922420501709, + "learning_rate": 1.949734605974732e-05, + "loss": 2.1717, + "mean_token_accuracy": 0.5346230864524841, + "num_tokens": 1264101403.0, + "step": 2473 + }, + { + "epoch": 0.6690102758247701, + "grad_norm": 1.4481247663497925, + "learning_rate": 1.9496826449033985e-05, + "loss": 2.3282, + "mean_token_accuracy": 0.5091638565063477, + "num_tokens": 1264625503.0, + "step": 2474 + }, + { + "epoch": 0.6692806922660898, + "grad_norm": 1.6917388439178467, + "learning_rate": 1.9496306577613025e-05, + "loss": 2.1801, + "mean_token_accuracy": 0.5203160643577576, + "num_tokens": 1265149554.0, + "step": 2475 + }, + { + "epoch": 0.6695511087074094, + "grad_norm": 2.008437156677246, + "learning_rate": 1.9495786445500393e-05, + "loss": 2.2032, + "mean_token_accuracy": 0.5314311981201172, + "num_tokens": 1265607962.0, + "step": 2476 + }, + { + "epoch": 0.6698215251487291, + "grad_norm": 1.625312089920044, + "learning_rate": 1.949526605271205e-05, + "loss": 2.2001, + "mean_token_accuracy": 0.5214027762413025, + "num_tokens": 1266132241.0, + "step": 2477 + }, + { + "epoch": 0.6700919415900487, + "grad_norm": 2.091261863708496, + "learning_rate": 1.949474539926396e-05, + "loss": 2.2996, + "mean_token_accuracy": 0.5019589066505432, + "num_tokens": 1266656514.0, + "step": 2478 + }, + { + "epoch": 0.6703623580313683, + "grad_norm": 1.543449878692627, + "learning_rate": 1.9494224485172103e-05, + "loss": 2.372, + "mean_token_accuracy": 0.4975506663322449, + "num_tokens": 1267137348.0, + "step": 2479 + }, + { + "epoch": 0.670632774472688, + "grad_norm": 1.590502381324768, + "learning_rate": 1.9493703310452453e-05, + "loss": 2.2368, + "mean_token_accuracy": 0.5184467434883118, + "num_tokens": 1267616680.0, + "step": 2480 + }, + { + "epoch": 0.6709031909140075, + "grad_norm": 0.7119165658950806, + "learning_rate": 1.9493181875121013e-05, + "loss": 1.1246, + "mean_token_accuracy": 0.7031291723251343, + "num_tokens": 1268111456.0, + "step": 2481 + }, + { + "epoch": 0.6711736073553272, + "grad_norm": 2.628079891204834, + "learning_rate": 1.9492660179193772e-05, + "loss": 2.2313, + "mean_token_accuracy": 0.5340671539306641, + "num_tokens": 1268609177.0, + "step": 2482 + }, + { + "epoch": 0.6714440237966468, + "grad_norm": 1.7980713844299316, + "learning_rate": 1.9492138222686744e-05, + "loss": 2.0072, + "mean_token_accuracy": 0.5430371165275574, + "num_tokens": 1269133372.0, + "step": 2483 + }, + { + "epoch": 0.6717144402379664, + "grad_norm": 1.5244762897491455, + "learning_rate": 1.9491616005615942e-05, + "loss": 2.1985, + "mean_token_accuracy": 0.5105096697807312, + "num_tokens": 1269648578.0, + "step": 2484 + }, + { + "epoch": 0.6719848566792861, + "grad_norm": 1.9428737163543701, + "learning_rate": 1.9491093527997386e-05, + "loss": 2.1886, + "mean_token_accuracy": 0.512684166431427, + "num_tokens": 1270172752.0, + "step": 2485 + }, + { + "epoch": 0.6722552731206057, + "grad_norm": 1.6827239990234375, + "learning_rate": 1.9490570789847114e-05, + "loss": 2.183, + "mean_token_accuracy": 0.5334427952766418, + "num_tokens": 1270673113.0, + "step": 2486 + }, + { + "epoch": 0.6725256895619254, + "grad_norm": 2.0264062881469727, + "learning_rate": 1.9490047791181157e-05, + "loss": 2.2211, + "mean_token_accuracy": 0.5198070406913757, + "num_tokens": 1271197382.0, + "step": 2487 + }, + { + "epoch": 0.672796106003245, + "grad_norm": 1.9078952074050903, + "learning_rate": 1.9489524532015566e-05, + "loss": 2.2332, + "mean_token_accuracy": 0.5103070139884949, + "num_tokens": 1271721654.0, + "step": 2488 + }, + { + "epoch": 0.6730665224445647, + "grad_norm": 1.9324238300323486, + "learning_rate": 1.94890010123664e-05, + "loss": 2.1896, + "mean_token_accuracy": 0.5209511518478394, + "num_tokens": 1272193569.0, + "step": 2489 + }, + { + "epoch": 0.6733369388858843, + "grad_norm": 1.6376181840896606, + "learning_rate": 1.9488477232249714e-05, + "loss": 2.1204, + "mean_token_accuracy": 0.5308380126953125, + "num_tokens": 1272717804.0, + "step": 2490 + }, + { + "epoch": 0.673607355327204, + "grad_norm": 1.834349513053894, + "learning_rate": 1.948795319168159e-05, + "loss": 2.2331, + "mean_token_accuracy": 0.5201164484024048, + "num_tokens": 1273242050.0, + "step": 2491 + }, + { + "epoch": 0.6738777717685235, + "grad_norm": 1.6873431205749512, + "learning_rate": 1.9487428890678094e-05, + "loss": 2.2691, + "mean_token_accuracy": 0.5160081386566162, + "num_tokens": 1273766325.0, + "step": 2492 + }, + { + "epoch": 0.6741481882098431, + "grad_norm": 1.8120265007019043, + "learning_rate": 1.948690432925532e-05, + "loss": 2.2369, + "mean_token_accuracy": 0.511557936668396, + "num_tokens": 1274290439.0, + "step": 2493 + }, + { + "epoch": 0.6744186046511628, + "grad_norm": 1.8668967485427856, + "learning_rate": 1.9486379507429364e-05, + "loss": 2.3511, + "mean_token_accuracy": 0.5275661945343018, + "num_tokens": 1274738996.0, + "step": 2494 + }, + { + "epoch": 0.6746890210924824, + "grad_norm": 1.7631956338882446, + "learning_rate": 1.9485854425216333e-05, + "loss": 2.3277, + "mean_token_accuracy": 0.4872684180736542, + "num_tokens": 1275249010.0, + "step": 2495 + }, + { + "epoch": 0.674959437533802, + "grad_norm": 1.7291419506072998, + "learning_rate": 1.9485329082632327e-05, + "loss": 2.3119, + "mean_token_accuracy": 0.5241487622261047, + "num_tokens": 1275773281.0, + "step": 2496 + }, + { + "epoch": 0.6752298539751217, + "grad_norm": 1.620832920074463, + "learning_rate": 1.9484803479693472e-05, + "loss": 2.229, + "mean_token_accuracy": 0.5144116878509521, + "num_tokens": 1276297499.0, + "step": 2497 + }, + { + "epoch": 0.6755002704164413, + "grad_norm": 1.769417643547058, + "learning_rate": 1.9484277616415897e-05, + "loss": 2.1796, + "mean_token_accuracy": 0.5251675248146057, + "num_tokens": 1276821443.0, + "step": 2498 + }, + { + "epoch": 0.675770686857761, + "grad_norm": 1.4516657590866089, + "learning_rate": 1.9483751492815733e-05, + "loss": 2.1332, + "mean_token_accuracy": 0.5172607898712158, + "num_tokens": 1277310983.0, + "step": 2499 + }, + { + "epoch": 0.6760411032990806, + "grad_norm": 1.4637625217437744, + "learning_rate": 1.948322510890912e-05, + "loss": 2.2549, + "mean_token_accuracy": 0.5110905170440674, + "num_tokens": 1277835126.0, + "step": 2500 + }, + { + "epoch": 0.6763115197404003, + "grad_norm": 0.8096910715103149, + "learning_rate": 1.9482698464712216e-05, + "loss": 1.1995, + "mean_token_accuracy": 0.6796814799308777, + "num_tokens": 1278359216.0, + "step": 2501 + }, + { + "epoch": 0.6765819361817198, + "grad_norm": 2.5744123458862305, + "learning_rate": 1.948217156024118e-05, + "loss": 2.1499, + "mean_token_accuracy": 0.5245827436447144, + "num_tokens": 1278883386.0, + "step": 2502 + }, + { + "epoch": 0.6768523526230394, + "grad_norm": 2.071204423904419, + "learning_rate": 1.9481644395512172e-05, + "loss": 2.2362, + "mean_token_accuracy": 0.5035126805305481, + "num_tokens": 1279391742.0, + "step": 2503 + }, + { + "epoch": 0.6771227690643591, + "grad_norm": 2.0314223766326904, + "learning_rate": 1.9481116970541372e-05, + "loss": 2.2751, + "mean_token_accuracy": 0.5157243013381958, + "num_tokens": 1279915960.0, + "step": 2504 + }, + { + "epoch": 0.6773931855056787, + "grad_norm": 25.53084945678711, + "learning_rate": 1.9480589285344964e-05, + "loss": 2.1209, + "mean_token_accuracy": 0.5533679127693176, + "num_tokens": 1280440152.0, + "step": 2505 + }, + { + "epoch": 0.6776636019469984, + "grad_norm": 3.1578080654144287, + "learning_rate": 1.9480061339939136e-05, + "loss": 2.2296, + "mean_token_accuracy": 0.5120954513549805, + "num_tokens": 1280942570.0, + "step": 2506 + }, + { + "epoch": 0.677934018388318, + "grad_norm": 2.4766552448272705, + "learning_rate": 1.947953313434009e-05, + "loss": 2.205, + "mean_token_accuracy": 0.5257940292358398, + "num_tokens": 1281466751.0, + "step": 2507 + }, + { + "epoch": 0.6782044348296377, + "grad_norm": 1.6687089204788208, + "learning_rate": 1.9479004668564025e-05, + "loss": 2.2293, + "mean_token_accuracy": 0.5285584926605225, + "num_tokens": 1281986451.0, + "step": 2508 + }, + { + "epoch": 0.6784748512709573, + "grad_norm": 1.919291377067566, + "learning_rate": 1.9478475942627163e-05, + "loss": 2.2837, + "mean_token_accuracy": 0.5125898718833923, + "num_tokens": 1282510722.0, + "step": 2509 + }, + { + "epoch": 0.678745267712277, + "grad_norm": 1.7140355110168457, + "learning_rate": 1.9477946956545726e-05, + "loss": 2.2711, + "mean_token_accuracy": 0.5055350661277771, + "num_tokens": 1283016524.0, + "step": 2510 + }, + { + "epoch": 0.6790156841535966, + "grad_norm": 1.9823596477508545, + "learning_rate": 1.947741771033595e-05, + "loss": 2.1295, + "mean_token_accuracy": 0.5183477997779846, + "num_tokens": 1283495864.0, + "step": 2511 + }, + { + "epoch": 0.6792861005949161, + "grad_norm": 1.865671992301941, + "learning_rate": 1.9476888204014062e-05, + "loss": 2.1619, + "mean_token_accuracy": 0.5229554772377014, + "num_tokens": 1283985644.0, + "step": 2512 + }, + { + "epoch": 0.6795565170362358, + "grad_norm": 1.5463227033615112, + "learning_rate": 1.947635843759632e-05, + "loss": 2.245, + "mean_token_accuracy": 0.49080002307891846, + "num_tokens": 1284509910.0, + "step": 2513 + }, + { + "epoch": 0.6798269334775554, + "grad_norm": 1.4346606731414795, + "learning_rate": 1.9475828411098968e-05, + "loss": 2.0879, + "mean_token_accuracy": 0.5304383039474487, + "num_tokens": 1284999762.0, + "step": 2514 + }, + { + "epoch": 0.680097349918875, + "grad_norm": 1.4326817989349365, + "learning_rate": 1.9475298124538276e-05, + "loss": 2.2673, + "mean_token_accuracy": 0.5114426016807556, + "num_tokens": 1285507703.0, + "step": 2515 + }, + { + "epoch": 0.6803677663601947, + "grad_norm": 1.7719475030899048, + "learning_rate": 1.9474767577930517e-05, + "loss": 2.1921, + "mean_token_accuracy": 0.5009279847145081, + "num_tokens": 1286031870.0, + "step": 2516 + }, + { + "epoch": 0.6806381828015143, + "grad_norm": 1.530341386795044, + "learning_rate": 1.947423677129196e-05, + "loss": 2.2184, + "mean_token_accuracy": 0.5217571258544922, + "num_tokens": 1286510906.0, + "step": 2517 + }, + { + "epoch": 0.680908599242834, + "grad_norm": 1.414880633354187, + "learning_rate": 1.9473705704638902e-05, + "loss": 2.3012, + "mean_token_accuracy": 0.5110716223716736, + "num_tokens": 1287035188.0, + "step": 2518 + }, + { + "epoch": 0.6811790156841536, + "grad_norm": 1.3706754446029663, + "learning_rate": 1.947317437798763e-05, + "loss": 2.2877, + "mean_token_accuracy": 0.5053364634513855, + "num_tokens": 1287559472.0, + "step": 2519 + }, + { + "epoch": 0.6814494321254733, + "grad_norm": 1.3983978033065796, + "learning_rate": 1.9472642791354455e-05, + "loss": 2.1929, + "mean_token_accuracy": 0.5132580995559692, + "num_tokens": 1288083714.0, + "step": 2520 + }, + { + "epoch": 0.6817198485667929, + "grad_norm": 0.7625740766525269, + "learning_rate": 1.947211094475568e-05, + "loss": 1.1392, + "mean_token_accuracy": 0.6939889192581177, + "num_tokens": 1288591290.0, + "step": 2521 + }, + { + "epoch": 0.6819902650081124, + "grad_norm": 2.420501232147217, + "learning_rate": 1.9471578838207628e-05, + "loss": 2.3075, + "mean_token_accuracy": 0.5189443826675415, + "num_tokens": 1289115467.0, + "step": 2522 + }, + { + "epoch": 0.6822606814494321, + "grad_norm": 1.7876039743423462, + "learning_rate": 1.9471046471726624e-05, + "loss": 2.1786, + "mean_token_accuracy": 0.530924916267395, + "num_tokens": 1289639720.0, + "step": 2523 + }, + { + "epoch": 0.6825310978907517, + "grad_norm": 1.3333966732025146, + "learning_rate": 1.9470513845329003e-05, + "loss": 2.0543, + "mean_token_accuracy": 0.536644697189331, + "num_tokens": 1290132876.0, + "step": 2524 + }, + { + "epoch": 0.6828015143320714, + "grad_norm": 1.9128509759902954, + "learning_rate": 1.9469980959031103e-05, + "loss": 2.2091, + "mean_token_accuracy": 0.5299713611602783, + "num_tokens": 1290657010.0, + "step": 2525 + }, + { + "epoch": 0.683071930773391, + "grad_norm": 1.6975470781326294, + "learning_rate": 1.946944781284928e-05, + "loss": 2.3136, + "mean_token_accuracy": 0.5093383193016052, + "num_tokens": 1291171265.0, + "step": 2526 + }, + { + "epoch": 0.6833423472147107, + "grad_norm": 1.8177343606948853, + "learning_rate": 1.9468914406799893e-05, + "loss": 2.2365, + "mean_token_accuracy": 0.5077084898948669, + "num_tokens": 1291695520.0, + "step": 2527 + }, + { + "epoch": 0.6836127636560303, + "grad_norm": 1.7520591020584106, + "learning_rate": 1.9468380740899306e-05, + "loss": 2.2811, + "mean_token_accuracy": 0.5088881254196167, + "num_tokens": 1292219686.0, + "step": 2528 + }, + { + "epoch": 0.6838831800973499, + "grad_norm": 1.5487018823623657, + "learning_rate": 1.9467846815163894e-05, + "loss": 2.2308, + "mean_token_accuracy": 0.5216051340103149, + "num_tokens": 1292721876.0, + "step": 2529 + }, + { + "epoch": 0.6841535965386696, + "grad_norm": 2.0007095336914062, + "learning_rate": 1.9467312629610037e-05, + "loss": 2.2414, + "mean_token_accuracy": 0.5074161291122437, + "num_tokens": 1293246104.0, + "step": 2530 + }, + { + "epoch": 0.6844240129799892, + "grad_norm": 1.792381763458252, + "learning_rate": 1.9466778184254133e-05, + "loss": 2.238, + "mean_token_accuracy": 0.5482763051986694, + "num_tokens": 1293641314.0, + "step": 2531 + }, + { + "epoch": 0.6846944294213089, + "grad_norm": 1.896998405456543, + "learning_rate": 1.946624347911257e-05, + "loss": 2.3457, + "mean_token_accuracy": 0.4988521337509155, + "num_tokens": 1294165553.0, + "step": 2532 + }, + { + "epoch": 0.6849648458626284, + "grad_norm": 1.6567907333374023, + "learning_rate": 1.9465708514201763e-05, + "loss": 2.2149, + "mean_token_accuracy": 0.5271774530410767, + "num_tokens": 1294689770.0, + "step": 2533 + }, + { + "epoch": 0.685235262303948, + "grad_norm": 1.5596007108688354, + "learning_rate": 1.946517328953812e-05, + "loss": 2.2213, + "mean_token_accuracy": 0.5090916156768799, + "num_tokens": 1295213926.0, + "step": 2534 + }, + { + "epoch": 0.6855056787452677, + "grad_norm": 1.8334336280822754, + "learning_rate": 1.9464637805138065e-05, + "loss": 2.293, + "mean_token_accuracy": 0.5032004714012146, + "num_tokens": 1295738147.0, + "step": 2535 + }, + { + "epoch": 0.6857760951865873, + "grad_norm": 1.7981784343719482, + "learning_rate": 1.9464102061018032e-05, + "loss": 2.2031, + "mean_token_accuracy": 0.5177004337310791, + "num_tokens": 1296189892.0, + "step": 2536 + }, + { + "epoch": 0.686046511627907, + "grad_norm": 2.0130412578582764, + "learning_rate": 1.9463566057194458e-05, + "loss": 2.3036, + "mean_token_accuracy": 0.5164402723312378, + "num_tokens": 1296714107.0, + "step": 2537 + }, + { + "epoch": 0.6863169280692266, + "grad_norm": 1.8265239000320435, + "learning_rate": 1.9463029793683786e-05, + "loss": 2.2916, + "mean_token_accuracy": 0.4993496537208557, + "num_tokens": 1297238389.0, + "step": 2538 + }, + { + "epoch": 0.6865873445105463, + "grad_norm": 1.4269134998321533, + "learning_rate": 1.946249327050247e-05, + "loss": 2.1083, + "mean_token_accuracy": 0.5295783877372742, + "num_tokens": 1297762498.0, + "step": 2539 + }, + { + "epoch": 0.6868577609518659, + "grad_norm": 2.2754809856414795, + "learning_rate": 1.9461956487666972e-05, + "loss": 2.2877, + "mean_token_accuracy": 0.5322673320770264, + "num_tokens": 1298286674.0, + "step": 2540 + }, + { + "epoch": 0.6871281773931855, + "grad_norm": 0.9452037811279297, + "learning_rate": 1.946141944519377e-05, + "loss": 1.2187, + "mean_token_accuracy": 0.688110888004303, + "num_tokens": 1298810882.0, + "step": 2541 + }, + { + "epoch": 0.6873985938345052, + "grad_norm": 2.1609010696411133, + "learning_rate": 1.9460882143099335e-05, + "loss": 2.2898, + "mean_token_accuracy": 0.4971119463443756, + "num_tokens": 1299335144.0, + "step": 2542 + }, + { + "epoch": 0.6876690102758247, + "grad_norm": 1.8733506202697754, + "learning_rate": 1.946034458140015e-05, + "loss": 2.2337, + "mean_token_accuracy": 0.5058099031448364, + "num_tokens": 1299859364.0, + "step": 2543 + }, + { + "epoch": 0.6879394267171444, + "grad_norm": 1.822691559791565, + "learning_rate": 1.9459806760112716e-05, + "loss": 2.2316, + "mean_token_accuracy": 0.5026850700378418, + "num_tokens": 1300383400.0, + "step": 2544 + }, + { + "epoch": 0.688209843158464, + "grad_norm": 1.9049060344696045, + "learning_rate": 1.945926867925353e-05, + "loss": 2.2583, + "mean_token_accuracy": 0.5217800140380859, + "num_tokens": 1300907491.0, + "step": 2545 + }, + { + "epoch": 0.6884802595997837, + "grad_norm": 1.667790174484253, + "learning_rate": 1.9458730338839104e-05, + "loss": 2.0331, + "mean_token_accuracy": 0.529405951499939, + "num_tokens": 1301423531.0, + "step": 2546 + }, + { + "epoch": 0.6887506760411033, + "grad_norm": 1.9839255809783936, + "learning_rate": 1.9458191738885957e-05, + "loss": 2.1836, + "mean_token_accuracy": 0.5308715105056763, + "num_tokens": 1301910560.0, + "step": 2547 + }, + { + "epoch": 0.6890210924824229, + "grad_norm": 1.3184525966644287, + "learning_rate": 1.9457652879410613e-05, + "loss": 1.9789, + "mean_token_accuracy": 0.5490512847900391, + "num_tokens": 1302434815.0, + "step": 2548 + }, + { + "epoch": 0.6892915089237426, + "grad_norm": 1.9468058347702026, + "learning_rate": 1.9457113760429608e-05, + "loss": 2.4035, + "mean_token_accuracy": 0.49693259596824646, + "num_tokens": 1302958982.0, + "step": 2549 + }, + { + "epoch": 0.6895619253650622, + "grad_norm": 1.9691362380981445, + "learning_rate": 1.945657438195948e-05, + "loss": 2.3781, + "mean_token_accuracy": 0.5042430758476257, + "num_tokens": 1303483217.0, + "step": 2550 + }, + { + "epoch": 0.6898323418063819, + "grad_norm": 1.6749482154846191, + "learning_rate": 1.9456034744016784e-05, + "loss": 2.2283, + "mean_token_accuracy": 0.5005873441696167, + "num_tokens": 1304007477.0, + "step": 2551 + }, + { + "epoch": 0.6901027582477015, + "grad_norm": 1.556549310684204, + "learning_rate": 1.945549484661807e-05, + "loss": 2.2572, + "mean_token_accuracy": 0.5089119672775269, + "num_tokens": 1304504030.0, + "step": 2552 + }, + { + "epoch": 0.690373174689021, + "grad_norm": 1.6995514631271362, + "learning_rate": 1.945495468977991e-05, + "loss": 2.1242, + "mean_token_accuracy": 0.5399844646453857, + "num_tokens": 1305028303.0, + "step": 2553 + }, + { + "epoch": 0.6906435911303407, + "grad_norm": 1.3682011365890503, + "learning_rate": 1.9454414273518875e-05, + "loss": 2.1545, + "mean_token_accuracy": 0.5352267026901245, + "num_tokens": 1305552478.0, + "step": 2554 + }, + { + "epoch": 0.6909140075716603, + "grad_norm": 1.4953033924102783, + "learning_rate": 1.9453873597851548e-05, + "loss": 2.2057, + "mean_token_accuracy": 0.5240803360939026, + "num_tokens": 1306076641.0, + "step": 2555 + }, + { + "epoch": 0.69118442401298, + "grad_norm": 1.6380467414855957, + "learning_rate": 1.9453332662794516e-05, + "loss": 2.1653, + "mean_token_accuracy": 0.5309959650039673, + "num_tokens": 1306537461.0, + "step": 2556 + }, + { + "epoch": 0.6914548404542996, + "grad_norm": 1.580230951309204, + "learning_rate": 1.945279146836438e-05, + "loss": 2.2663, + "mean_token_accuracy": 0.5067145824432373, + "num_tokens": 1307061647.0, + "step": 2557 + }, + { + "epoch": 0.6917252568956193, + "grad_norm": 1.6193242073059082, + "learning_rate": 1.9452250014577745e-05, + "loss": 2.1709, + "mean_token_accuracy": 0.5143380761146545, + "num_tokens": 1307585758.0, + "step": 2558 + }, + { + "epoch": 0.6919956733369389, + "grad_norm": 1.3866064548492432, + "learning_rate": 1.945170830145122e-05, + "loss": 2.1148, + "mean_token_accuracy": 0.5275964736938477, + "num_tokens": 1308110010.0, + "step": 2559 + }, + { + "epoch": 0.6922660897782585, + "grad_norm": 1.3063420057296753, + "learning_rate": 1.9451166329001435e-05, + "loss": 2.1786, + "mean_token_accuracy": 0.5396149158477783, + "num_tokens": 1308634186.0, + "step": 2560 + }, + { + "epoch": 0.6925365062195782, + "grad_norm": 0.9142351150512695, + "learning_rate": 1.945062409724501e-05, + "loss": 1.2761, + "mean_token_accuracy": 0.6629462242126465, + "num_tokens": 1309158249.0, + "step": 2561 + }, + { + "epoch": 0.6928069226608978, + "grad_norm": 2.8930768966674805, + "learning_rate": 1.9450081606198583e-05, + "loss": 2.1633, + "mean_token_accuracy": 0.5052065253257751, + "num_tokens": 1309682498.0, + "step": 2562 + }, + { + "epoch": 0.6930773391022174, + "grad_norm": 2.6038756370544434, + "learning_rate": 1.944953885587881e-05, + "loss": 2.3617, + "mean_token_accuracy": 0.5273919701576233, + "num_tokens": 1310132147.0, + "step": 2563 + }, + { + "epoch": 0.693347755543537, + "grad_norm": 1.465810775756836, + "learning_rate": 1.9448995846302326e-05, + "loss": 2.2042, + "mean_token_accuracy": 0.5120910406112671, + "num_tokens": 1310656360.0, + "step": 2564 + }, + { + "epoch": 0.6936181719848566, + "grad_norm": 1.5960642099380493, + "learning_rate": 1.944845257748581e-05, + "loss": 2.1837, + "mean_token_accuracy": 0.5240485668182373, + "num_tokens": 1311137339.0, + "step": 2565 + }, + { + "epoch": 0.6938885884261763, + "grad_norm": 1.9618362188339233, + "learning_rate": 1.9447909049445923e-05, + "loss": 2.2564, + "mean_token_accuracy": 0.5265464782714844, + "num_tokens": 1311661426.0, + "step": 2566 + }, + { + "epoch": 0.6941590048674959, + "grad_norm": 1.9718518257141113, + "learning_rate": 1.9447365262199344e-05, + "loss": 2.2881, + "mean_token_accuracy": 0.4922097325325012, + "num_tokens": 1312143005.0, + "step": 2567 + }, + { + "epoch": 0.6944294213088156, + "grad_norm": 2.0541799068450928, + "learning_rate": 1.944682121576275e-05, + "loss": 2.3997, + "mean_token_accuracy": 0.4764019846916199, + "num_tokens": 1312667172.0, + "step": 2568 + }, + { + "epoch": 0.6946998377501352, + "grad_norm": 1.6157010793685913, + "learning_rate": 1.9446276910152844e-05, + "loss": 2.2886, + "mean_token_accuracy": 0.5158265829086304, + "num_tokens": 1313191340.0, + "step": 2569 + }, + { + "epoch": 0.6949702541914549, + "grad_norm": 1.6254159212112427, + "learning_rate": 1.9445732345386324e-05, + "loss": 2.2198, + "mean_token_accuracy": 0.5548476576805115, + "num_tokens": 1313715567.0, + "step": 2570 + }, + { + "epoch": 0.6952406706327745, + "grad_norm": 1.8818483352661133, + "learning_rate": 1.9445187521479902e-05, + "loss": 2.0059, + "mean_token_accuracy": 0.5386597514152527, + "num_tokens": 1314239834.0, + "step": 2571 + }, + { + "epoch": 0.6955110870740941, + "grad_norm": 1.688724398612976, + "learning_rate": 1.9444642438450286e-05, + "loss": 2.1946, + "mean_token_accuracy": 0.5253812074661255, + "num_tokens": 1314755365.0, + "step": 2572 + }, + { + "epoch": 0.6957815035154138, + "grad_norm": 1.8621989488601685, + "learning_rate": 1.944409709631421e-05, + "loss": 2.2271, + "mean_token_accuracy": 0.5266097784042358, + "num_tokens": 1315279533.0, + "step": 2573 + }, + { + "epoch": 0.6960519199567333, + "grad_norm": 2.106252670288086, + "learning_rate": 1.94435514950884e-05, + "loss": 2.1097, + "mean_token_accuracy": 0.5435153841972351, + "num_tokens": 1315800617.0, + "step": 2574 + }, + { + "epoch": 0.696322336398053, + "grad_norm": 1.8425694704055786, + "learning_rate": 1.94430056347896e-05, + "loss": 2.1358, + "mean_token_accuracy": 0.518057644367218, + "num_tokens": 1316324774.0, + "step": 2575 + }, + { + "epoch": 0.6965927528393726, + "grad_norm": 1.7408607006072998, + "learning_rate": 1.9442459515434558e-05, + "loss": 2.1763, + "mean_token_accuracy": 0.5260797739028931, + "num_tokens": 1316849046.0, + "step": 2576 + }, + { + "epoch": 0.6968631692806923, + "grad_norm": 1.5581759214401245, + "learning_rate": 1.944191313704003e-05, + "loss": 2.1118, + "mean_token_accuracy": 0.5246298909187317, + "num_tokens": 1317373324.0, + "step": 2577 + }, + { + "epoch": 0.6971335857220119, + "grad_norm": 1.9424360990524292, + "learning_rate": 1.944136649962278e-05, + "loss": 2.1201, + "mean_token_accuracy": 0.5383074283599854, + "num_tokens": 1317884806.0, + "step": 2578 + }, + { + "epoch": 0.6974040021633315, + "grad_norm": 2.132488965988159, + "learning_rate": 1.9440819603199582e-05, + "loss": 2.2603, + "mean_token_accuracy": 0.5245944857597351, + "num_tokens": 1318358130.0, + "step": 2579 + }, + { + "epoch": 0.6976744186046512, + "grad_norm": 1.5437371730804443, + "learning_rate": 1.9440272447787218e-05, + "loss": 2.3459, + "mean_token_accuracy": 0.5016194581985474, + "num_tokens": 1318882352.0, + "step": 2580 + }, + { + "epoch": 0.6979448350459708, + "grad_norm": 0.8859127163887024, + "learning_rate": 1.943972503340247e-05, + "loss": 1.1065, + "mean_token_accuracy": 0.6988362073898315, + "num_tokens": 1319406528.0, + "step": 2581 + }, + { + "epoch": 0.6982152514872905, + "grad_norm": 2.5524380207061768, + "learning_rate": 1.943917736006214e-05, + "loss": 2.1454, + "mean_token_accuracy": 0.5274649858474731, + "num_tokens": 1319930799.0, + "step": 2582 + }, + { + "epoch": 0.6984856679286101, + "grad_norm": 1.9903721809387207, + "learning_rate": 1.9438629427783033e-05, + "loss": 2.1166, + "mean_token_accuracy": 0.5270049571990967, + "num_tokens": 1320455070.0, + "step": 2583 + }, + { + "epoch": 0.6987560843699296, + "grad_norm": 1.369248390197754, + "learning_rate": 1.9438081236581954e-05, + "loss": 2.1175, + "mean_token_accuracy": 0.5337812900543213, + "num_tokens": 1320979166.0, + "step": 2584 + }, + { + "epoch": 0.6990265008112493, + "grad_norm": 5.58806037902832, + "learning_rate": 1.9437532786475732e-05, + "loss": 2.0785, + "mean_token_accuracy": 0.552418053150177, + "num_tokens": 1321503413.0, + "step": 2585 + }, + { + "epoch": 0.6992969172525689, + "grad_norm": 2.757416009902954, + "learning_rate": 1.943698407748119e-05, + "loss": 2.2704, + "mean_token_accuracy": 0.5045025944709778, + "num_tokens": 1322027633.0, + "step": 2586 + }, + { + "epoch": 0.6995673336938886, + "grad_norm": 2.7707359790802, + "learning_rate": 1.9436435109615165e-05, + "loss": 2.2615, + "mean_token_accuracy": 0.5246999859809875, + "num_tokens": 1322541807.0, + "step": 2587 + }, + { + "epoch": 0.6998377501352082, + "grad_norm": 2.0703325271606445, + "learning_rate": 1.94358858828945e-05, + "loss": 2.1633, + "mean_token_accuracy": 0.5461508631706238, + "num_tokens": 1323065970.0, + "step": 2588 + }, + { + "epoch": 0.7001081665765279, + "grad_norm": 2.251121759414673, + "learning_rate": 1.943533639733605e-05, + "loss": 2.1927, + "mean_token_accuracy": 0.5073543787002563, + "num_tokens": 1323590142.0, + "step": 2589 + }, + { + "epoch": 0.7003785830178475, + "grad_norm": 2.074922800064087, + "learning_rate": 1.943478665295667e-05, + "loss": 2.2384, + "mean_token_accuracy": 0.5238022804260254, + "num_tokens": 1324114339.0, + "step": 2590 + }, + { + "epoch": 0.7006489994591671, + "grad_norm": 2.0104880332946777, + "learning_rate": 1.9434236649773234e-05, + "loss": 2.2202, + "mean_token_accuracy": 0.5240087509155273, + "num_tokens": 1324638557.0, + "step": 2591 + }, + { + "epoch": 0.7009194159004868, + "grad_norm": 1.9718111753463745, + "learning_rate": 1.943368638780261e-05, + "loss": 2.1196, + "mean_token_accuracy": 0.5265191793441772, + "num_tokens": 1325162762.0, + "step": 2592 + }, + { + "epoch": 0.7011898323418064, + "grad_norm": 1.7379034757614136, + "learning_rate": 1.9433135867061686e-05, + "loss": 2.2282, + "mean_token_accuracy": 0.5278786420822144, + "num_tokens": 1325686991.0, + "step": 2593 + }, + { + "epoch": 0.701460248783126, + "grad_norm": 2.101607322692871, + "learning_rate": 1.9432585087567354e-05, + "loss": 2.2529, + "mean_token_accuracy": 0.5136802792549133, + "num_tokens": 1326211273.0, + "step": 2594 + }, + { + "epoch": 0.7017306652244456, + "grad_norm": 1.6944670677185059, + "learning_rate": 1.9432034049336514e-05, + "loss": 2.0932, + "mean_token_accuracy": 0.5336927175521851, + "num_tokens": 1326659831.0, + "step": 2595 + }, + { + "epoch": 0.7020010816657652, + "grad_norm": 1.5018397569656372, + "learning_rate": 1.943148275238607e-05, + "loss": 2.1202, + "mean_token_accuracy": 0.5314247608184814, + "num_tokens": 1327171172.0, + "step": 2596 + }, + { + "epoch": 0.7022714981070849, + "grad_norm": 1.7782295942306519, + "learning_rate": 1.943093119673294e-05, + "loss": 2.1684, + "mean_token_accuracy": 0.5053547024726868, + "num_tokens": 1327674142.0, + "step": 2597 + }, + { + "epoch": 0.7025419145484045, + "grad_norm": 2.0164737701416016, + "learning_rate": 1.9430379382394053e-05, + "loss": 2.3593, + "mean_token_accuracy": 0.5052817463874817, + "num_tokens": 1328180827.0, + "step": 2598 + }, + { + "epoch": 0.7028123309897242, + "grad_norm": 1.4860098361968994, + "learning_rate": 1.9429827309386328e-05, + "loss": 2.2472, + "mean_token_accuracy": 0.51776123046875, + "num_tokens": 1328637908.0, + "step": 2599 + }, + { + "epoch": 0.7030827474310438, + "grad_norm": 1.7924518585205078, + "learning_rate": 1.9429274977726715e-05, + "loss": 2.0786, + "mean_token_accuracy": 0.519647479057312, + "num_tokens": 1329162146.0, + "step": 2600 + }, + { + "epoch": 0.7033531638723635, + "grad_norm": 0.8719674944877625, + "learning_rate": 1.9428722387432155e-05, + "loss": 1.1318, + "mean_token_accuracy": 0.685107946395874, + "num_tokens": 1329686257.0, + "step": 2601 + }, + { + "epoch": 0.7036235803136831, + "grad_norm": 1.8711882829666138, + "learning_rate": 1.9428169538519604e-05, + "loss": 2.2196, + "mean_token_accuracy": 0.5169186592102051, + "num_tokens": 1330210473.0, + "step": 2602 + }, + { + "epoch": 0.7038939967550027, + "grad_norm": 1.6404743194580078, + "learning_rate": 1.9427616431006024e-05, + "loss": 2.2983, + "mean_token_accuracy": 0.49810126423835754, + "num_tokens": 1330734709.0, + "step": 2603 + }, + { + "epoch": 0.7041644131963224, + "grad_norm": 1.3645187616348267, + "learning_rate": 1.942706306490839e-05, + "loss": 2.0335, + "mean_token_accuracy": 0.5635991096496582, + "num_tokens": 1331258990.0, + "step": 2604 + }, + { + "epoch": 0.7044348296376419, + "grad_norm": 1.5561959743499756, + "learning_rate": 1.942650944024368e-05, + "loss": 2.2951, + "mean_token_accuracy": 0.49741172790527344, + "num_tokens": 1331783164.0, + "step": 2605 + }, + { + "epoch": 0.7047052460789616, + "grad_norm": 1.5149919986724854, + "learning_rate": 1.942595555702888e-05, + "loss": 2.1771, + "mean_token_accuracy": 0.5210468769073486, + "num_tokens": 1332270196.0, + "step": 2606 + }, + { + "epoch": 0.7049756625202812, + "grad_norm": 1.4716674089431763, + "learning_rate": 1.9425401415280984e-05, + "loss": 2.0192, + "mean_token_accuracy": 0.5312756299972534, + "num_tokens": 1332794371.0, + "step": 2607 + }, + { + "epoch": 0.7052460789616009, + "grad_norm": 1.6280783414840698, + "learning_rate": 1.9424847015016992e-05, + "loss": 2.1487, + "mean_token_accuracy": 0.5363767147064209, + "num_tokens": 1333318457.0, + "step": 2608 + }, + { + "epoch": 0.7055164954029205, + "grad_norm": 2.2489213943481445, + "learning_rate": 1.942429235625392e-05, + "loss": 2.3554, + "mean_token_accuracy": 0.49343931674957275, + "num_tokens": 1333824443.0, + "step": 2609 + }, + { + "epoch": 0.7057869118442401, + "grad_norm": 2.0166122913360596, + "learning_rate": 1.9423737439008784e-05, + "loss": 2.2091, + "mean_token_accuracy": 0.5002149343490601, + "num_tokens": 1334348595.0, + "step": 2610 + }, + { + "epoch": 0.7060573282855598, + "grad_norm": 1.510259985923767, + "learning_rate": 1.942318226329861e-05, + "loss": 2.1322, + "mean_token_accuracy": 0.520997166633606, + "num_tokens": 1334872792.0, + "step": 2611 + }, + { + "epoch": 0.7063277447268794, + "grad_norm": 2.069786310195923, + "learning_rate": 1.9422626829140434e-05, + "loss": 2.2345, + "mean_token_accuracy": 0.49709320068359375, + "num_tokens": 1335396875.0, + "step": 2612 + }, + { + "epoch": 0.7065981611681991, + "grad_norm": 2.1769208908081055, + "learning_rate": 1.94220711365513e-05, + "loss": 2.2929, + "mean_token_accuracy": 0.5005862712860107, + "num_tokens": 1335921153.0, + "step": 2613 + }, + { + "epoch": 0.7068685776095187, + "grad_norm": 1.9009501934051514, + "learning_rate": 1.942151518554825e-05, + "loss": 2.2731, + "mean_token_accuracy": 0.5021399855613708, + "num_tokens": 1336445418.0, + "step": 2614 + }, + { + "epoch": 0.7071389940508382, + "grad_norm": 1.8500983715057373, + "learning_rate": 1.9420958976148356e-05, + "loss": 2.3278, + "mean_token_accuracy": 0.4972182512283325, + "num_tokens": 1336969563.0, + "step": 2615 + }, + { + "epoch": 0.7074094104921579, + "grad_norm": 1.741005778312683, + "learning_rate": 1.942040250836867e-05, + "loss": 2.2444, + "mean_token_accuracy": 0.5142806768417358, + "num_tokens": 1337493779.0, + "step": 2616 + }, + { + "epoch": 0.7076798269334775, + "grad_norm": 1.320273756980896, + "learning_rate": 1.9419845782226273e-05, + "loss": 2.1775, + "mean_token_accuracy": 0.5319877862930298, + "num_tokens": 1338017942.0, + "step": 2617 + }, + { + "epoch": 0.7079502433747972, + "grad_norm": 1.3516371250152588, + "learning_rate": 1.9419288797738248e-05, + "loss": 2.2024, + "mean_token_accuracy": 0.5366109013557434, + "num_tokens": 1338477574.0, + "step": 2618 + }, + { + "epoch": 0.7082206598161168, + "grad_norm": 1.3952972888946533, + "learning_rate": 1.941873155492168e-05, + "loss": 2.2244, + "mean_token_accuracy": 0.5194706916809082, + "num_tokens": 1339001851.0, + "step": 2619 + }, + { + "epoch": 0.7084910762574365, + "grad_norm": 1.49873685836792, + "learning_rate": 1.9418174053793675e-05, + "loss": 2.4289, + "mean_token_accuracy": 0.4861382842063904, + "num_tokens": 1339525967.0, + "step": 2620 + }, + { + "epoch": 0.7087614926987561, + "grad_norm": 0.9290701746940613, + "learning_rate": 1.941761629437133e-05, + "loss": 1.2639, + "mean_token_accuracy": 0.6695972084999084, + "num_tokens": 1340049985.0, + "step": 2621 + }, + { + "epoch": 0.7090319091400757, + "grad_norm": 2.9253220558166504, + "learning_rate": 1.9417058276671765e-05, + "loss": 2.3098, + "mean_token_accuracy": 0.48926040530204773, + "num_tokens": 1340574263.0, + "step": 2622 + }, + { + "epoch": 0.7093023255813954, + "grad_norm": 2.1455180644989014, + "learning_rate": 1.9416500000712097e-05, + "loss": 2.1587, + "mean_token_accuracy": 0.5310894250869751, + "num_tokens": 1341098541.0, + "step": 2623 + }, + { + "epoch": 0.709572742022715, + "grad_norm": 1.4382537603378296, + "learning_rate": 1.9415941466509457e-05, + "loss": 2.1134, + "mean_token_accuracy": 0.5226966142654419, + "num_tokens": 1341622807.0, + "step": 2624 + }, + { + "epoch": 0.7098431584640346, + "grad_norm": 2.0059659481048584, + "learning_rate": 1.9415382674080987e-05, + "loss": 2.126, + "mean_token_accuracy": 0.5375882387161255, + "num_tokens": 1342147082.0, + "step": 2625 + }, + { + "epoch": 0.7101135749053542, + "grad_norm": 1.7305212020874023, + "learning_rate": 1.9414823623443827e-05, + "loss": 2.1903, + "mean_token_accuracy": 0.5211869478225708, + "num_tokens": 1342671289.0, + "step": 2626 + }, + { + "epoch": 0.7103839913466738, + "grad_norm": 1.9595458507537842, + "learning_rate": 1.941426431461513e-05, + "loss": 2.1962, + "mean_token_accuracy": 0.5139217376708984, + "num_tokens": 1343166161.0, + "step": 2627 + }, + { + "epoch": 0.7106544077879935, + "grad_norm": 1.7298222780227661, + "learning_rate": 1.941370474761206e-05, + "loss": 2.2862, + "mean_token_accuracy": 0.510723352432251, + "num_tokens": 1343690441.0, + "step": 2628 + }, + { + "epoch": 0.7109248242293131, + "grad_norm": 1.887345790863037, + "learning_rate": 1.9413144922451784e-05, + "loss": 2.1726, + "mean_token_accuracy": 0.5073103904724121, + "num_tokens": 1344214646.0, + "step": 2629 + }, + { + "epoch": 0.7111952406706328, + "grad_norm": 1.4302138090133667, + "learning_rate": 1.9412584839151483e-05, + "loss": 2.2748, + "mean_token_accuracy": 0.5118786096572876, + "num_tokens": 1344738928.0, + "step": 2630 + }, + { + "epoch": 0.7114656571119524, + "grad_norm": 1.4690250158309937, + "learning_rate": 1.941202449772834e-05, + "loss": 2.132, + "mean_token_accuracy": 0.5251673460006714, + "num_tokens": 1345243101.0, + "step": 2631 + }, + { + "epoch": 0.7117360735532721, + "grad_norm": 1.5437531471252441, + "learning_rate": 1.9411463898199542e-05, + "loss": 2.2168, + "mean_token_accuracy": 0.5151597261428833, + "num_tokens": 1345767212.0, + "step": 2632 + }, + { + "epoch": 0.7120064899945917, + "grad_norm": 1.4161245822906494, + "learning_rate": 1.94109030405823e-05, + "loss": 2.0368, + "mean_token_accuracy": 0.5364935994148254, + "num_tokens": 1346291488.0, + "step": 2633 + }, + { + "epoch": 0.7122769064359114, + "grad_norm": 1.6674083471298218, + "learning_rate": 1.9410341924893817e-05, + "loss": 2.3793, + "mean_token_accuracy": 0.49322810769081116, + "num_tokens": 1346815713.0, + "step": 2634 + }, + { + "epoch": 0.7125473228772309, + "grad_norm": 1.2832144498825073, + "learning_rate": 1.940978055115131e-05, + "loss": 2.2626, + "mean_token_accuracy": 0.5059245228767395, + "num_tokens": 1347339986.0, + "step": 2635 + }, + { + "epoch": 0.7128177393185505, + "grad_norm": 1.3715786933898926, + "learning_rate": 1.9409218919372003e-05, + "loss": 2.0528, + "mean_token_accuracy": 0.5327358245849609, + "num_tokens": 1347864236.0, + "step": 2636 + }, + { + "epoch": 0.7130881557598702, + "grad_norm": 1.5303281545639038, + "learning_rate": 1.9408657029573134e-05, + "loss": 2.2117, + "mean_token_accuracy": 0.5115183591842651, + "num_tokens": 1348388340.0, + "step": 2637 + }, + { + "epoch": 0.7133585722011898, + "grad_norm": 1.6043105125427246, + "learning_rate": 1.9408094881771932e-05, + "loss": 2.1468, + "mean_token_accuracy": 0.5260326862335205, + "num_tokens": 1348909212.0, + "step": 2638 + }, + { + "epoch": 0.7136289886425095, + "grad_norm": 1.6080045700073242, + "learning_rate": 1.9407532475985656e-05, + "loss": 2.3845, + "mean_token_accuracy": 0.4973367750644684, + "num_tokens": 1349433440.0, + "step": 2639 + }, + { + "epoch": 0.7138994050838291, + "grad_norm": 1.392184853553772, + "learning_rate": 1.940696981223156e-05, + "loss": 2.1446, + "mean_token_accuracy": 0.5376999378204346, + "num_tokens": 1349957695.0, + "step": 2640 + }, + { + "epoch": 0.7141698215251487, + "grad_norm": 0.8417640328407288, + "learning_rate": 1.9406406890526904e-05, + "loss": 1.2182, + "mean_token_accuracy": 0.6805802583694458, + "num_tokens": 1350481956.0, + "step": 2641 + }, + { + "epoch": 0.7144402379664684, + "grad_norm": 1.837081789970398, + "learning_rate": 1.9405843710888966e-05, + "loss": 2.0835, + "mean_token_accuracy": 0.5397635698318481, + "num_tokens": 1350949876.0, + "step": 2642 + }, + { + "epoch": 0.714710654407788, + "grad_norm": 1.8216676712036133, + "learning_rate": 1.9405280273335022e-05, + "loss": 2.2692, + "mean_token_accuracy": 0.509792685508728, + "num_tokens": 1351474037.0, + "step": 2643 + }, + { + "epoch": 0.7149810708491077, + "grad_norm": 1.3969835042953491, + "learning_rate": 1.9404716577882358e-05, + "loss": 2.3189, + "mean_token_accuracy": 0.47911733388900757, + "num_tokens": 1351998266.0, + "step": 2644 + }, + { + "epoch": 0.7152514872904273, + "grad_norm": 1.618642807006836, + "learning_rate": 1.9404152624548277e-05, + "loss": 2.1753, + "mean_token_accuracy": 0.5331079363822937, + "num_tokens": 1352522378.0, + "step": 2645 + }, + { + "epoch": 0.7155219037317468, + "grad_norm": 2.0084407329559326, + "learning_rate": 1.9403588413350075e-05, + "loss": 2.2219, + "mean_token_accuracy": 0.5145535469055176, + "num_tokens": 1353046557.0, + "step": 2646 + }, + { + "epoch": 0.7157923201730665, + "grad_norm": 1.4709587097167969, + "learning_rate": 1.940302394430507e-05, + "loss": 2.1336, + "mean_token_accuracy": 0.5312155485153198, + "num_tokens": 1353570714.0, + "step": 2647 + }, + { + "epoch": 0.7160627366143861, + "grad_norm": 1.909627079963684, + "learning_rate": 1.940245921743058e-05, + "loss": 2.2871, + "mean_token_accuracy": 0.49494364857673645, + "num_tokens": 1354094985.0, + "step": 2648 + }, + { + "epoch": 0.7163331530557058, + "grad_norm": 1.7327898740768433, + "learning_rate": 1.9401894232743927e-05, + "loss": 2.1567, + "mean_token_accuracy": 0.5297528505325317, + "num_tokens": 1354619210.0, + "step": 2649 + }, + { + "epoch": 0.7166035694970254, + "grad_norm": 1.5723862648010254, + "learning_rate": 1.940132899026245e-05, + "loss": 2.205, + "mean_token_accuracy": 0.5228700637817383, + "num_tokens": 1355137683.0, + "step": 2650 + }, + { + "epoch": 0.7168739859383451, + "grad_norm": 2.1259360313415527, + "learning_rate": 1.9400763490003496e-05, + "loss": 2.2645, + "mean_token_accuracy": 0.5040464401245117, + "num_tokens": 1355653872.0, + "step": 2651 + }, + { + "epoch": 0.7171444023796647, + "grad_norm": 1.99906587600708, + "learning_rate": 1.9400197731984412e-05, + "loss": 2.1114, + "mean_token_accuracy": 0.5035186409950256, + "num_tokens": 1356178138.0, + "step": 2652 + }, + { + "epoch": 0.7174148188209843, + "grad_norm": 2.182967185974121, + "learning_rate": 1.9399631716222564e-05, + "loss": 2.1783, + "mean_token_accuracy": 0.528643012046814, + "num_tokens": 1356645057.0, + "step": 2653 + }, + { + "epoch": 0.717685235262304, + "grad_norm": 1.6154192686080933, + "learning_rate": 1.9399065442735302e-05, + "loss": 2.2247, + "mean_token_accuracy": 0.5011450052261353, + "num_tokens": 1357169152.0, + "step": 2654 + }, + { + "epoch": 0.7179556517036236, + "grad_norm": 2.0220162868499756, + "learning_rate": 1.939849891154002e-05, + "loss": 2.2641, + "mean_token_accuracy": 0.5064830780029297, + "num_tokens": 1357639725.0, + "step": 2655 + }, + { + "epoch": 0.7182260681449432, + "grad_norm": 1.5272839069366455, + "learning_rate": 1.939793212265409e-05, + "loss": 2.1766, + "mean_token_accuracy": 0.5099086165428162, + "num_tokens": 1358163982.0, + "step": 2656 + }, + { + "epoch": 0.7184964845862628, + "grad_norm": 1.6769464015960693, + "learning_rate": 1.9397365076094908e-05, + "loss": 2.132, + "mean_token_accuracy": 0.5071864128112793, + "num_tokens": 1358688146.0, + "step": 2657 + }, + { + "epoch": 0.7187669010275824, + "grad_norm": 1.9190633296966553, + "learning_rate": 1.9396797771879866e-05, + "loss": 2.2427, + "mean_token_accuracy": 0.5207388401031494, + "num_tokens": 1359212424.0, + "step": 2658 + }, + { + "epoch": 0.7190373174689021, + "grad_norm": 1.9008361101150513, + "learning_rate": 1.9396230210026377e-05, + "loss": 2.2198, + "mean_token_accuracy": 0.5273951292037964, + "num_tokens": 1359637450.0, + "step": 2659 + }, + { + "epoch": 0.7193077339102217, + "grad_norm": 1.6138259172439575, + "learning_rate": 1.9395662390551856e-05, + "loss": 2.292, + "mean_token_accuracy": 0.5251738429069519, + "num_tokens": 1360114074.0, + "step": 2660 + }, + { + "epoch": 0.7195781503515414, + "grad_norm": 0.8682464957237244, + "learning_rate": 1.9395094313473724e-05, + "loss": 1.1313, + "mean_token_accuracy": 0.705444872379303, + "num_tokens": 1360638232.0, + "step": 2661 + }, + { + "epoch": 0.719848566792861, + "grad_norm": 1.8845055103302002, + "learning_rate": 1.9394525978809407e-05, + "loss": 2.0507, + "mean_token_accuracy": 0.5512222051620483, + "num_tokens": 1361162458.0, + "step": 2662 + }, + { + "epoch": 0.7201189832341807, + "grad_norm": 1.9803661108016968, + "learning_rate": 1.9393957386576347e-05, + "loss": 2.0691, + "mean_token_accuracy": 0.5496580600738525, + "num_tokens": 1361686713.0, + "step": 2663 + }, + { + "epoch": 0.7203893996755003, + "grad_norm": 1.5580079555511475, + "learning_rate": 1.939338853679199e-05, + "loss": 2.2681, + "mean_token_accuracy": 0.5236334800720215, + "num_tokens": 1362164106.0, + "step": 2664 + }, + { + "epoch": 0.72065981611682, + "grad_norm": 1.4682048559188843, + "learning_rate": 1.9392819429473785e-05, + "loss": 2.0367, + "mean_token_accuracy": 0.5650355815887451, + "num_tokens": 1362567770.0, + "step": 2665 + }, + { + "epoch": 0.7209302325581395, + "grad_norm": 2.086247682571411, + "learning_rate": 1.9392250064639202e-05, + "loss": 2.0729, + "mean_token_accuracy": 0.5374137163162231, + "num_tokens": 1363041975.0, + "step": 2666 + }, + { + "epoch": 0.7212006489994591, + "grad_norm": 1.7022522687911987, + "learning_rate": 1.939168044230571e-05, + "loss": 2.1641, + "mean_token_accuracy": 0.5175639986991882, + "num_tokens": 1363566179.0, + "step": 2667 + }, + { + "epoch": 0.7214710654407788, + "grad_norm": 1.3182774782180786, + "learning_rate": 1.939111056249078e-05, + "loss": 2.2528, + "mean_token_accuracy": 0.5060141682624817, + "num_tokens": 1364090342.0, + "step": 2668 + }, + { + "epoch": 0.7217414818820984, + "grad_norm": 1.6607245206832886, + "learning_rate": 1.9390540425211898e-05, + "loss": 2.1469, + "mean_token_accuracy": 0.5291639566421509, + "num_tokens": 1364609013.0, + "step": 2669 + }, + { + "epoch": 0.722011898323418, + "grad_norm": 1.8187631368637085, + "learning_rate": 1.9389970030486566e-05, + "loss": 2.1297, + "mean_token_accuracy": 0.534237265586853, + "num_tokens": 1365133276.0, + "step": 2670 + }, + { + "epoch": 0.7222823147647377, + "grad_norm": 1.960084080696106, + "learning_rate": 1.9389399378332282e-05, + "loss": 2.1942, + "mean_token_accuracy": 0.5174040794372559, + "num_tokens": 1365657400.0, + "step": 2671 + }, + { + "epoch": 0.7225527312060573, + "grad_norm": 1.7768858671188354, + "learning_rate": 1.938882846876655e-05, + "loss": 2.2177, + "mean_token_accuracy": 0.5276317596435547, + "num_tokens": 1366181596.0, + "step": 2672 + }, + { + "epoch": 0.722823147647377, + "grad_norm": 1.7167552709579468, + "learning_rate": 1.938825730180689e-05, + "loss": 2.3102, + "mean_token_accuracy": 0.5049904584884644, + "num_tokens": 1366689771.0, + "step": 2673 + }, + { + "epoch": 0.7230935640886966, + "grad_norm": 1.5682429075241089, + "learning_rate": 1.938768587747083e-05, + "loss": 2.1456, + "mean_token_accuracy": 0.5198429822921753, + "num_tokens": 1367214000.0, + "step": 2674 + }, + { + "epoch": 0.7233639805300163, + "grad_norm": 1.3728694915771484, + "learning_rate": 1.9387114195775895e-05, + "loss": 2.1928, + "mean_token_accuracy": 0.5162926316261292, + "num_tokens": 1367727927.0, + "step": 2675 + }, + { + "epoch": 0.7236343969713358, + "grad_norm": 1.697927713394165, + "learning_rate": 1.9386542256739637e-05, + "loss": 2.2511, + "mean_token_accuracy": 0.5246747732162476, + "num_tokens": 1368252115.0, + "step": 2676 + }, + { + "epoch": 0.7239048134126554, + "grad_norm": 1.523364543914795, + "learning_rate": 1.9385970060379597e-05, + "loss": 2.1959, + "mean_token_accuracy": 0.5320528745651245, + "num_tokens": 1368717351.0, + "step": 2677 + }, + { + "epoch": 0.7241752298539751, + "grad_norm": 1.4568779468536377, + "learning_rate": 1.9385397606713333e-05, + "loss": 2.1291, + "mean_token_accuracy": 0.5323441624641418, + "num_tokens": 1369217583.0, + "step": 2678 + }, + { + "epoch": 0.7244456462952947, + "grad_norm": 1.6023061275482178, + "learning_rate": 1.9384824895758413e-05, + "loss": 2.0302, + "mean_token_accuracy": 0.5444846153259277, + "num_tokens": 1369643326.0, + "step": 2679 + }, + { + "epoch": 0.7247160627366144, + "grad_norm": 1.7209864854812622, + "learning_rate": 1.938425192753241e-05, + "loss": 2.0741, + "mean_token_accuracy": 0.5458289384841919, + "num_tokens": 1370107534.0, + "step": 2680 + }, + { + "epoch": 0.724986479177934, + "grad_norm": 1.0512205362319946, + "learning_rate": 1.93836787020529e-05, + "loss": 1.1753, + "mean_token_accuracy": 0.6991049647331238, + "num_tokens": 1370631624.0, + "step": 2681 + }, + { + "epoch": 0.7252568956192537, + "grad_norm": 2.3428640365600586, + "learning_rate": 1.938310521933747e-05, + "loss": 2.086, + "mean_token_accuracy": 0.5448309183120728, + "num_tokens": 1371080504.0, + "step": 2682 + }, + { + "epoch": 0.7255273120605733, + "grad_norm": 1.739555835723877, + "learning_rate": 1.938253147940372e-05, + "loss": 2.2604, + "mean_token_accuracy": 0.5058194398880005, + "num_tokens": 1371604701.0, + "step": 2683 + }, + { + "epoch": 0.725797728501893, + "grad_norm": 1.6928240060806274, + "learning_rate": 1.9381957482269248e-05, + "loss": 2.0778, + "mean_token_accuracy": 0.5302640199661255, + "num_tokens": 1372110547.0, + "step": 2684 + }, + { + "epoch": 0.7260681449432126, + "grad_norm": 1.904649257659912, + "learning_rate": 1.9381383227951678e-05, + "loss": 2.1305, + "mean_token_accuracy": 0.5305490493774414, + "num_tokens": 1372582189.0, + "step": 2685 + }, + { + "epoch": 0.7263385613845322, + "grad_norm": 1.6165761947631836, + "learning_rate": 1.9380808716468617e-05, + "loss": 2.2357, + "mean_token_accuracy": 0.5093584060668945, + "num_tokens": 1373106440.0, + "step": 2686 + }, + { + "epoch": 0.7266089778258518, + "grad_norm": 2.067905902862549, + "learning_rate": 1.9380233947837703e-05, + "loss": 2.126, + "mean_token_accuracy": 0.5285032987594604, + "num_tokens": 1373630596.0, + "step": 2687 + }, + { + "epoch": 0.7268793942671714, + "grad_norm": 1.963935375213623, + "learning_rate": 1.937965892207656e-05, + "loss": 2.0702, + "mean_token_accuracy": 0.5362775325775146, + "num_tokens": 1374129178.0, + "step": 2688 + }, + { + "epoch": 0.727149810708491, + "grad_norm": 1.4387476444244385, + "learning_rate": 1.937908363920285e-05, + "loss": 2.2269, + "mean_token_accuracy": 0.5274767279624939, + "num_tokens": 1374632575.0, + "step": 2689 + }, + { + "epoch": 0.7274202271498107, + "grad_norm": 2.360348701477051, + "learning_rate": 1.9378508099234202e-05, + "loss": 2.3301, + "mean_token_accuracy": 0.4899676442146301, + "num_tokens": 1375156771.0, + "step": 2690 + }, + { + "epoch": 0.7276906435911303, + "grad_norm": 1.9188480377197266, + "learning_rate": 1.937793230218829e-05, + "loss": 1.8564, + "mean_token_accuracy": 0.58415687084198, + "num_tokens": 1375661515.0, + "step": 2691 + }, + { + "epoch": 0.72796106003245, + "grad_norm": 1.5592310428619385, + "learning_rate": 1.937735624808278e-05, + "loss": 2.2126, + "mean_token_accuracy": 0.5202716588973999, + "num_tokens": 1376185786.0, + "step": 2692 + }, + { + "epoch": 0.7282314764737696, + "grad_norm": 1.9627299308776855, + "learning_rate": 1.9376779936935337e-05, + "loss": 2.217, + "mean_token_accuracy": 0.5051414966583252, + "num_tokens": 1376709917.0, + "step": 2693 + }, + { + "epoch": 0.7285018929150893, + "grad_norm": 1.6075893640518188, + "learning_rate": 1.9376203368763655e-05, + "loss": 2.1912, + "mean_token_accuracy": 0.504400372505188, + "num_tokens": 1377234020.0, + "step": 2694 + }, + { + "epoch": 0.7287723093564089, + "grad_norm": 1.4940134286880493, + "learning_rate": 1.9375626543585422e-05, + "loss": 2.2292, + "mean_token_accuracy": 0.5139482617378235, + "num_tokens": 1377752082.0, + "step": 2695 + }, + { + "epoch": 0.7290427257977286, + "grad_norm": 1.6578025817871094, + "learning_rate": 1.9375049461418332e-05, + "loss": 2.2543, + "mean_token_accuracy": 0.5070140361785889, + "num_tokens": 1378276336.0, + "step": 2696 + }, + { + "epoch": 0.7293131422390481, + "grad_norm": 1.9131687879562378, + "learning_rate": 1.9374472122280098e-05, + "loss": 1.9669, + "mean_token_accuracy": 0.5665947198867798, + "num_tokens": 1378777220.0, + "step": 2697 + }, + { + "epoch": 0.7295835586803677, + "grad_norm": 2.352567195892334, + "learning_rate": 1.9373894526188432e-05, + "loss": 1.9528, + "mean_token_accuracy": 0.5659162998199463, + "num_tokens": 1379301495.0, + "step": 2698 + }, + { + "epoch": 0.7298539751216874, + "grad_norm": 1.7005527019500732, + "learning_rate": 1.9373316673161054e-05, + "loss": 2.2668, + "mean_token_accuracy": 0.5013248920440674, + "num_tokens": 1379825770.0, + "step": 2699 + }, + { + "epoch": 0.730124391563007, + "grad_norm": 1.5857117176055908, + "learning_rate": 1.93727385632157e-05, + "loss": 2.0546, + "mean_token_accuracy": 0.5055577754974365, + "num_tokens": 1380349941.0, + "step": 2700 + }, + { + "epoch": 0.7303948080043267, + "grad_norm": 0.7129926085472107, + "learning_rate": 1.93721601963701e-05, + "loss": 1.1556, + "mean_token_accuracy": 0.6916912794113159, + "num_tokens": 1380874163.0, + "step": 2701 + }, + { + "epoch": 0.7306652244456463, + "grad_norm": 2.5632593631744385, + "learning_rate": 1.9371581572642e-05, + "loss": 2.3468, + "mean_token_accuracy": 0.49210771918296814, + "num_tokens": 1381398340.0, + "step": 2702 + }, + { + "epoch": 0.7309356408869659, + "grad_norm": 4.316312789916992, + "learning_rate": 1.9371002692049164e-05, + "loss": 2.2065, + "mean_token_accuracy": 0.4929901957511902, + "num_tokens": 1381922491.0, + "step": 2703 + }, + { + "epoch": 0.7312060573282856, + "grad_norm": 1.7515062093734741, + "learning_rate": 1.9370423554609347e-05, + "loss": 2.1456, + "mean_token_accuracy": 0.5301902294158936, + "num_tokens": 1382397265.0, + "step": 2704 + }, + { + "epoch": 0.7314764737696052, + "grad_norm": 1.911817193031311, + "learning_rate": 1.9369844160340324e-05, + "loss": 2.2822, + "mean_token_accuracy": 0.5130500793457031, + "num_tokens": 1382921501.0, + "step": 2705 + }, + { + "epoch": 0.7317468902109249, + "grad_norm": 1.3917655944824219, + "learning_rate": 1.9369264509259863e-05, + "loss": 2.1233, + "mean_token_accuracy": 0.5148938894271851, + "num_tokens": 1383400271.0, + "step": 2706 + }, + { + "epoch": 0.7320173066522444, + "grad_norm": 1.4775210618972778, + "learning_rate": 1.9368684601385755e-05, + "loss": 2.1361, + "mean_token_accuracy": 0.5090298056602478, + "num_tokens": 1383911871.0, + "step": 2707 + }, + { + "epoch": 0.732287723093564, + "grad_norm": 2.0885682106018066, + "learning_rate": 1.9368104436735794e-05, + "loss": 2.1639, + "mean_token_accuracy": 0.5172065496444702, + "num_tokens": 1384436099.0, + "step": 2708 + }, + { + "epoch": 0.7325581395348837, + "grad_norm": 1.5413607358932495, + "learning_rate": 1.9367524015327776e-05, + "loss": 2.0843, + "mean_token_accuracy": 0.5442807674407959, + "num_tokens": 1384960362.0, + "step": 2709 + }, + { + "epoch": 0.7328285559762033, + "grad_norm": 1.833767294883728, + "learning_rate": 1.936694333717952e-05, + "loss": 2.1965, + "mean_token_accuracy": 0.4998851418495178, + "num_tokens": 1385484626.0, + "step": 2710 + }, + { + "epoch": 0.733098972417523, + "grad_norm": 2.4001498222351074, + "learning_rate": 1.9366362402308833e-05, + "loss": 2.1391, + "mean_token_accuracy": 0.5325707197189331, + "num_tokens": 1386008872.0, + "step": 2711 + }, + { + "epoch": 0.7333693888588426, + "grad_norm": 1.8970781564712524, + "learning_rate": 1.9365781210733545e-05, + "loss": 2.1255, + "mean_token_accuracy": 0.530319094657898, + "num_tokens": 1386533092.0, + "step": 2712 + }, + { + "epoch": 0.7336398053001623, + "grad_norm": 1.7091823816299438, + "learning_rate": 1.9365199762471486e-05, + "loss": 2.1436, + "mean_token_accuracy": 0.5142998695373535, + "num_tokens": 1387055512.0, + "step": 2713 + }, + { + "epoch": 0.7339102217414819, + "grad_norm": 1.992758870124817, + "learning_rate": 1.93646180575405e-05, + "loss": 2.2996, + "mean_token_accuracy": 0.4990259110927582, + "num_tokens": 1387579700.0, + "step": 2714 + }, + { + "epoch": 0.7341806381828015, + "grad_norm": 2.019658088684082, + "learning_rate": 1.936403609595843e-05, + "loss": 2.0557, + "mean_token_accuracy": 0.5411267280578613, + "num_tokens": 1388060487.0, + "step": 2715 + }, + { + "epoch": 0.7344510546241212, + "grad_norm": 1.8141485452651978, + "learning_rate": 1.936345387774314e-05, + "loss": 2.1947, + "mean_token_accuracy": 0.5194936394691467, + "num_tokens": 1388584731.0, + "step": 2716 + }, + { + "epoch": 0.7347214710654408, + "grad_norm": 1.9695887565612793, + "learning_rate": 1.9362871402912483e-05, + "loss": 2.2585, + "mean_token_accuracy": 0.5052302479743958, + "num_tokens": 1389109008.0, + "step": 2717 + }, + { + "epoch": 0.7349918875067604, + "grad_norm": 1.484803557395935, + "learning_rate": 1.9362288671484345e-05, + "loss": 2.2239, + "mean_token_accuracy": 0.5205672979354858, + "num_tokens": 1389633111.0, + "step": 2718 + }, + { + "epoch": 0.73526230394808, + "grad_norm": 2.0832059383392334, + "learning_rate": 1.9361705683476596e-05, + "loss": 2.2761, + "mean_token_accuracy": 0.518429160118103, + "num_tokens": 1390157332.0, + "step": 2719 + }, + { + "epoch": 0.7355327203893997, + "grad_norm": 2.04496431350708, + "learning_rate": 1.9361122438907123e-05, + "loss": 2.2992, + "mean_token_accuracy": 0.5155529379844666, + "num_tokens": 1390610249.0, + "step": 2720 + }, + { + "epoch": 0.7358031368307193, + "grad_norm": 0.9139624834060669, + "learning_rate": 1.9360538937793828e-05, + "loss": 1.2694, + "mean_token_accuracy": 0.6792582273483276, + "num_tokens": 1391134520.0, + "step": 2721 + }, + { + "epoch": 0.7360735532720389, + "grad_norm": 2.760053873062134, + "learning_rate": 1.9359955180154612e-05, + "loss": 2.2759, + "mean_token_accuracy": 0.5116801857948303, + "num_tokens": 1391658789.0, + "step": 2722 + }, + { + "epoch": 0.7363439697133586, + "grad_norm": 2.265894889831543, + "learning_rate": 1.9359371166007386e-05, + "loss": 2.2084, + "mean_token_accuracy": 0.5054619908332825, + "num_tokens": 1392182973.0, + "step": 2723 + }, + { + "epoch": 0.7366143861546782, + "grad_norm": 1.3648386001586914, + "learning_rate": 1.935878689537007e-05, + "loss": 2.1546, + "mean_token_accuracy": 0.5045639276504517, + "num_tokens": 1392707239.0, + "step": 2724 + }, + { + "epoch": 0.7368848025959979, + "grad_norm": 2.1893742084503174, + "learning_rate": 1.935820236826059e-05, + "loss": 2.1368, + "mean_token_accuracy": 0.5442241430282593, + "num_tokens": 1393222160.0, + "step": 2725 + }, + { + "epoch": 0.7371552190373175, + "grad_norm": 2.1344211101531982, + "learning_rate": 1.9357617584696878e-05, + "loss": 2.1861, + "mean_token_accuracy": 0.5232314467430115, + "num_tokens": 1393746362.0, + "step": 2726 + }, + { + "epoch": 0.7374256354786372, + "grad_norm": 1.8965260982513428, + "learning_rate": 1.9357032544696885e-05, + "loss": 2.1725, + "mean_token_accuracy": 0.5345448851585388, + "num_tokens": 1394270482.0, + "step": 2727 + }, + { + "epoch": 0.7376960519199567, + "grad_norm": 2.1495351791381836, + "learning_rate": 1.9356447248278555e-05, + "loss": 2.2727, + "mean_token_accuracy": 0.5077637434005737, + "num_tokens": 1394794751.0, + "step": 2728 + }, + { + "epoch": 0.7379664683612763, + "grad_norm": 3.290787935256958, + "learning_rate": 1.935586169545985e-05, + "loss": 1.8936, + "mean_token_accuracy": 0.5884757041931152, + "num_tokens": 1395318988.0, + "step": 2729 + }, + { + "epoch": 0.738236884802596, + "grad_norm": 2.222447395324707, + "learning_rate": 1.9355275886258734e-05, + "loss": 2.1689, + "mean_token_accuracy": 0.5037224292755127, + "num_tokens": 1395843242.0, + "step": 2730 + }, + { + "epoch": 0.7385073012439156, + "grad_norm": 1.6902565956115723, + "learning_rate": 1.9354689820693183e-05, + "loss": 2.0287, + "mean_token_accuracy": 0.5423320531845093, + "num_tokens": 1396331611.0, + "step": 2731 + }, + { + "epoch": 0.7387777176852353, + "grad_norm": 2.1576035022735596, + "learning_rate": 1.9354103498781182e-05, + "loss": 2.2551, + "mean_token_accuracy": 0.5149536728858948, + "num_tokens": 1396855768.0, + "step": 2732 + }, + { + "epoch": 0.7390481341265549, + "grad_norm": 1.757699728012085, + "learning_rate": 1.935351692054071e-05, + "loss": 1.9355, + "mean_token_accuracy": 0.5589607357978821, + "num_tokens": 1397322807.0, + "step": 2733 + }, + { + "epoch": 0.7393185505678745, + "grad_norm": 1.6477984189987183, + "learning_rate": 1.935293008598978e-05, + "loss": 2.0823, + "mean_token_accuracy": 0.525428056716919, + "num_tokens": 1397847022.0, + "step": 2734 + }, + { + "epoch": 0.7395889670091942, + "grad_norm": 1.992859125137329, + "learning_rate": 1.9352342995146388e-05, + "loss": 2.2205, + "mean_token_accuracy": 0.5254761576652527, + "num_tokens": 1398313825.0, + "step": 2735 + }, + { + "epoch": 0.7398593834505138, + "grad_norm": 1.8682501316070557, + "learning_rate": 1.9351755648028547e-05, + "loss": 2.335, + "mean_token_accuracy": 0.5077328681945801, + "num_tokens": 1398838093.0, + "step": 2736 + }, + { + "epoch": 0.7401297998918335, + "grad_norm": 1.6523387432098389, + "learning_rate": 1.9351168044654287e-05, + "loss": 2.139, + "mean_token_accuracy": 0.5264089703559875, + "num_tokens": 1399362288.0, + "step": 2737 + }, + { + "epoch": 0.740400216333153, + "grad_norm": 1.5541459321975708, + "learning_rate": 1.935058018504163e-05, + "loss": 2.2284, + "mean_token_accuracy": 0.5185156464576721, + "num_tokens": 1399854104.0, + "step": 2738 + }, + { + "epoch": 0.7406706327744726, + "grad_norm": 1.5556087493896484, + "learning_rate": 1.9349992069208617e-05, + "loss": 2.0814, + "mean_token_accuracy": 0.5278365612030029, + "num_tokens": 1400378390.0, + "step": 2739 + }, + { + "epoch": 0.7409410492157923, + "grad_norm": 2.1734132766723633, + "learning_rate": 1.934940369717329e-05, + "loss": 2.1118, + "mean_token_accuracy": 0.5478845834732056, + "num_tokens": 1400902589.0, + "step": 2740 + }, + { + "epoch": 0.7412114656571119, + "grad_norm": 0.7761396765708923, + "learning_rate": 1.93488150689537e-05, + "loss": 1.1681, + "mean_token_accuracy": 0.692666232585907, + "num_tokens": 1401426816.0, + "step": 2741 + }, + { + "epoch": 0.7414818820984316, + "grad_norm": 2.034691572189331, + "learning_rate": 1.9348226184567917e-05, + "loss": 2.2233, + "mean_token_accuracy": 0.5137996673583984, + "num_tokens": 1401951092.0, + "step": 2742 + }, + { + "epoch": 0.7417522985397512, + "grad_norm": 1.5438151359558105, + "learning_rate": 1.9347637044034003e-05, + "loss": 2.1883, + "mean_token_accuracy": 0.505845308303833, + "num_tokens": 1402475298.0, + "step": 2743 + }, + { + "epoch": 0.7420227149810709, + "grad_norm": 1.725797414779663, + "learning_rate": 1.9347047647370034e-05, + "loss": 2.2424, + "mean_token_accuracy": 0.5117804408073425, + "num_tokens": 1402999549.0, + "step": 2744 + }, + { + "epoch": 0.7422931314223905, + "grad_norm": 2.18229341506958, + "learning_rate": 1.9346457994594097e-05, + "loss": 2.2206, + "mean_token_accuracy": 0.5220274329185486, + "num_tokens": 1403479027.0, + "step": 2745 + }, + { + "epoch": 0.7425635478637101, + "grad_norm": 1.7732354402542114, + "learning_rate": 1.934586808572428e-05, + "loss": 2.2276, + "mean_token_accuracy": 0.5063325762748718, + "num_tokens": 1404003187.0, + "step": 2746 + }, + { + "epoch": 0.7428339643050298, + "grad_norm": 2.3736462593078613, + "learning_rate": 1.9345277920778695e-05, + "loss": 2.3403, + "mean_token_accuracy": 0.5108140110969543, + "num_tokens": 1404526391.0, + "step": 2747 + }, + { + "epoch": 0.7431043807463493, + "grad_norm": 2.0010995864868164, + "learning_rate": 1.9344687499775435e-05, + "loss": 2.2314, + "mean_token_accuracy": 0.5240415334701538, + "num_tokens": 1405050668.0, + "step": 2748 + }, + { + "epoch": 0.743374797187669, + "grad_norm": 1.7051366567611694, + "learning_rate": 1.9344096822732625e-05, + "loss": 2.2512, + "mean_token_accuracy": 0.5143165588378906, + "num_tokens": 1405574927.0, + "step": 2749 + }, + { + "epoch": 0.7436452136289886, + "grad_norm": 2.1987860202789307, + "learning_rate": 1.9343505889668384e-05, + "loss": 2.3415, + "mean_token_accuracy": 0.5055691003799438, + "num_tokens": 1406099197.0, + "step": 2750 + }, + { + "epoch": 0.7439156300703083, + "grad_norm": 1.7553472518920898, + "learning_rate": 1.9342914700600845e-05, + "loss": 2.296, + "mean_token_accuracy": 0.5282005071640015, + "num_tokens": 1406623369.0, + "step": 2751 + }, + { + "epoch": 0.7441860465116279, + "grad_norm": 1.7046483755111694, + "learning_rate": 1.9342323255548152e-05, + "loss": 2.2184, + "mean_token_accuracy": 0.5196512937545776, + "num_tokens": 1407126510.0, + "step": 2752 + }, + { + "epoch": 0.7444564629529475, + "grad_norm": 2.0597691535949707, + "learning_rate": 1.934173155452845e-05, + "loss": 2.3327, + "mean_token_accuracy": 0.505384624004364, + "num_tokens": 1407650712.0, + "step": 2753 + }, + { + "epoch": 0.7447268793942672, + "grad_norm": 1.6456762552261353, + "learning_rate": 1.9341139597559884e-05, + "loss": 2.1428, + "mean_token_accuracy": 0.5264963507652283, + "num_tokens": 1408142502.0, + "step": 2754 + }, + { + "epoch": 0.7449972958355868, + "grad_norm": 1.6415730714797974, + "learning_rate": 1.934054738466063e-05, + "loss": 2.2689, + "mean_token_accuracy": 0.5254163146018982, + "num_tokens": 1408617286.0, + "step": 2755 + }, + { + "epoch": 0.7452677122769065, + "grad_norm": 1.8030807971954346, + "learning_rate": 1.9339954915848854e-05, + "loss": 2.136, + "mean_token_accuracy": 0.5321518778800964, + "num_tokens": 1409141498.0, + "step": 2756 + }, + { + "epoch": 0.7455381287182261, + "grad_norm": 1.8457010984420776, + "learning_rate": 1.9339362191142735e-05, + "loss": 2.2884, + "mean_token_accuracy": 0.5112389326095581, + "num_tokens": 1409640455.0, + "step": 2757 + }, + { + "epoch": 0.7458085451595458, + "grad_norm": 1.7041586637496948, + "learning_rate": 1.9338769210560458e-05, + "loss": 2.0772, + "mean_token_accuracy": 0.5324855446815491, + "num_tokens": 1410164664.0, + "step": 2758 + }, + { + "epoch": 0.7460789616008653, + "grad_norm": 1.642318844795227, + "learning_rate": 1.9338175974120225e-05, + "loss": 2.2201, + "mean_token_accuracy": 0.5096694231033325, + "num_tokens": 1410642963.0, + "step": 2759 + }, + { + "epoch": 0.7463493780421849, + "grad_norm": 1.6948579549789429, + "learning_rate": 1.933758248184023e-05, + "loss": 2.1441, + "mean_token_accuracy": 0.5348008275032043, + "num_tokens": 1411088087.0, + "step": 2760 + }, + { + "epoch": 0.7466197944835046, + "grad_norm": 1.0134637355804443, + "learning_rate": 1.933698873373868e-05, + "loss": 1.2139, + "mean_token_accuracy": 0.6787028908729553, + "num_tokens": 1411612331.0, + "step": 2761 + }, + { + "epoch": 0.7468902109248242, + "grad_norm": 2.7131187915802, + "learning_rate": 1.93363947298338e-05, + "loss": 2.1334, + "mean_token_accuracy": 0.5154367685317993, + "num_tokens": 1412136437.0, + "step": 2762 + }, + { + "epoch": 0.7471606273661439, + "grad_norm": 2.488492250442505, + "learning_rate": 1.9335800470143817e-05, + "loss": 2.1992, + "mean_token_accuracy": 0.5170671939849854, + "num_tokens": 1412660567.0, + "step": 2763 + }, + { + "epoch": 0.7474310438074635, + "grad_norm": 1.7165721654891968, + "learning_rate": 1.9335205954686957e-05, + "loss": 2.2038, + "mean_token_accuracy": 0.5250973105430603, + "num_tokens": 1413141324.0, + "step": 2764 + }, + { + "epoch": 0.7477014602487831, + "grad_norm": 1.8410258293151855, + "learning_rate": 1.933461118348147e-05, + "loss": 1.9925, + "mean_token_accuracy": 0.5709640979766846, + "num_tokens": 1413665511.0, + "step": 2765 + }, + { + "epoch": 0.7479718766901028, + "grad_norm": 1.9385534524917603, + "learning_rate": 1.9334016156545602e-05, + "loss": 2.0466, + "mean_token_accuracy": 0.5287067294120789, + "num_tokens": 1414173358.0, + "step": 2766 + }, + { + "epoch": 0.7482422931314224, + "grad_norm": 2.247498035430908, + "learning_rate": 1.9333420873897605e-05, + "loss": 2.19, + "mean_token_accuracy": 0.527753472328186, + "num_tokens": 1414601455.0, + "step": 2767 + }, + { + "epoch": 0.7485127095727421, + "grad_norm": 1.9747555255889893, + "learning_rate": 1.9332825335555752e-05, + "loss": 2.1796, + "mean_token_accuracy": 0.5235813856124878, + "num_tokens": 1415124094.0, + "step": 2768 + }, + { + "epoch": 0.7487831260140616, + "grad_norm": 1.8930621147155762, + "learning_rate": 1.933222954153831e-05, + "loss": 2.1539, + "mean_token_accuracy": 0.5461639165878296, + "num_tokens": 1415614909.0, + "step": 2769 + }, + { + "epoch": 0.7490535424553812, + "grad_norm": 1.8463653326034546, + "learning_rate": 1.933163349186357e-05, + "loss": 2.3568, + "mean_token_accuracy": 0.5019195079803467, + "num_tokens": 1416105873.0, + "step": 2770 + }, + { + "epoch": 0.7493239588967009, + "grad_norm": 1.8377830982208252, + "learning_rate": 1.9331037186549804e-05, + "loss": 2.2025, + "mean_token_accuracy": 0.5200008749961853, + "num_tokens": 1416630022.0, + "step": 2771 + }, + { + "epoch": 0.7495943753380205, + "grad_norm": 2.408188581466675, + "learning_rate": 1.933044062561532e-05, + "loss": 2.1907, + "mean_token_accuracy": 0.5366557836532593, + "num_tokens": 1417115931.0, + "step": 2772 + }, + { + "epoch": 0.7498647917793402, + "grad_norm": 1.324980616569519, + "learning_rate": 1.932984380907842e-05, + "loss": 2.136, + "mean_token_accuracy": 0.5247119665145874, + "num_tokens": 1417640159.0, + "step": 2773 + }, + { + "epoch": 0.7501352082206598, + "grad_norm": 1.8758933544158936, + "learning_rate": 1.9329246736957415e-05, + "loss": 2.165, + "mean_token_accuracy": 0.527749240398407, + "num_tokens": 1418164341.0, + "step": 2774 + }, + { + "epoch": 0.7504056246619795, + "grad_norm": 1.8570802211761475, + "learning_rate": 1.9328649409270624e-05, + "loss": 2.2539, + "mean_token_accuracy": 0.4939698874950409, + "num_tokens": 1418688620.0, + "step": 2775 + }, + { + "epoch": 0.7506760411032991, + "grad_norm": 1.538385272026062, + "learning_rate": 1.9328051826036378e-05, + "loss": 2.0895, + "mean_token_accuracy": 0.5299953818321228, + "num_tokens": 1419212841.0, + "step": 2776 + }, + { + "epoch": 0.7509464575446188, + "grad_norm": 1.976723313331604, + "learning_rate": 1.932745398727301e-05, + "loss": 2.3549, + "mean_token_accuracy": 0.49252915382385254, + "num_tokens": 1419737094.0, + "step": 2777 + }, + { + "epoch": 0.7512168739859384, + "grad_norm": 1.7533795833587646, + "learning_rate": 1.9326855892998867e-05, + "loss": 2.162, + "mean_token_accuracy": 0.5004239082336426, + "num_tokens": 1420261215.0, + "step": 2778 + }, + { + "epoch": 0.7514872904272579, + "grad_norm": 1.741995930671692, + "learning_rate": 1.9326257543232293e-05, + "loss": 2.2172, + "mean_token_accuracy": 0.5184087753295898, + "num_tokens": 1420770820.0, + "step": 2779 + }, + { + "epoch": 0.7517577068685776, + "grad_norm": 1.7625404596328735, + "learning_rate": 1.932565893799165e-05, + "loss": 2.15, + "mean_token_accuracy": 0.5373152494430542, + "num_tokens": 1421242622.0, + "step": 2780 + }, + { + "epoch": 0.7520281233098972, + "grad_norm": 0.7578743696212769, + "learning_rate": 1.932506007729531e-05, + "loss": 1.1743, + "mean_token_accuracy": 0.6749497056007385, + "num_tokens": 1421766830.0, + "step": 2781 + }, + { + "epoch": 0.7522985397512169, + "grad_norm": 2.3436732292175293, + "learning_rate": 1.9324460961161646e-05, + "loss": 2.2677, + "mean_token_accuracy": 0.5274824500083923, + "num_tokens": 1422291006.0, + "step": 2782 + }, + { + "epoch": 0.7525689561925365, + "grad_norm": 2.0706467628479004, + "learning_rate": 1.9323861589609036e-05, + "loss": 2.0962, + "mean_token_accuracy": 0.5313628315925598, + "num_tokens": 1422815192.0, + "step": 2783 + }, + { + "epoch": 0.7528393726338561, + "grad_norm": 1.1242996454238892, + "learning_rate": 1.932326196265588e-05, + "loss": 2.0683, + "mean_token_accuracy": 0.5432077646255493, + "num_tokens": 1423297981.0, + "step": 2784 + }, + { + "epoch": 0.7531097890751758, + "grad_norm": 2.1227176189422607, + "learning_rate": 1.9322662080320563e-05, + "loss": 2.301, + "mean_token_accuracy": 0.49272364377975464, + "num_tokens": 1423822179.0, + "step": 2785 + }, + { + "epoch": 0.7533802055164954, + "grad_norm": 2.4013681411743164, + "learning_rate": 1.93220619426215e-05, + "loss": 2.2541, + "mean_token_accuracy": 0.5041956901550293, + "num_tokens": 1424346367.0, + "step": 2786 + }, + { + "epoch": 0.7536506219578151, + "grad_norm": 1.5398012399673462, + "learning_rate": 1.9321461549577102e-05, + "loss": 2.2719, + "mean_token_accuracy": 0.5153555274009705, + "num_tokens": 1424870635.0, + "step": 2787 + }, + { + "epoch": 0.7539210383991347, + "grad_norm": 1.9290344715118408, + "learning_rate": 1.9320860901205794e-05, + "loss": 2.1585, + "mean_token_accuracy": 0.5580198168754578, + "num_tokens": 1425335547.0, + "step": 2788 + }, + { + "epoch": 0.7541914548404542, + "grad_norm": 1.680161714553833, + "learning_rate": 1.9320259997526002e-05, + "loss": 2.1048, + "mean_token_accuracy": 0.5113760828971863, + "num_tokens": 1425807859.0, + "step": 2789 + }, + { + "epoch": 0.7544618712817739, + "grad_norm": 1.8846315145492554, + "learning_rate": 1.9319658838556163e-05, + "loss": 2.194, + "mean_token_accuracy": 0.5255911350250244, + "num_tokens": 1426283537.0, + "step": 2790 + }, + { + "epoch": 0.7547322877230935, + "grad_norm": 1.8844928741455078, + "learning_rate": 1.931905742431473e-05, + "loss": 2.2675, + "mean_token_accuracy": 0.5078182220458984, + "num_tokens": 1426807772.0, + "step": 2791 + }, + { + "epoch": 0.7550027041644132, + "grad_norm": 1.906058669090271, + "learning_rate": 1.9318455754820145e-05, + "loss": 2.1325, + "mean_token_accuracy": 0.5202142000198364, + "num_tokens": 1427293991.0, + "step": 2792 + }, + { + "epoch": 0.7552731206057328, + "grad_norm": 1.8237839937210083, + "learning_rate": 1.9317853830090877e-05, + "loss": 2.0362, + "mean_token_accuracy": 0.5575810670852661, + "num_tokens": 1427795470.0, + "step": 2793 + }, + { + "epoch": 0.7555435370470525, + "grad_norm": 1.675642728805542, + "learning_rate": 1.9317251650145392e-05, + "loss": 2.2469, + "mean_token_accuracy": 0.5201114416122437, + "num_tokens": 1428293742.0, + "step": 2794 + }, + { + "epoch": 0.7558139534883721, + "grad_norm": 1.7541320323944092, + "learning_rate": 1.9316649215002164e-05, + "loss": 2.0848, + "mean_token_accuracy": 0.5628855228424072, + "num_tokens": 1428818015.0, + "step": 2795 + }, + { + "epoch": 0.7560843699296917, + "grad_norm": 1.9219611883163452, + "learning_rate": 1.9316046524679687e-05, + "loss": 2.0625, + "mean_token_accuracy": 0.5385273694992065, + "num_tokens": 1429342176.0, + "step": 2796 + }, + { + "epoch": 0.7563547863710114, + "grad_norm": 1.5352131128311157, + "learning_rate": 1.931544357919644e-05, + "loss": 2.0226, + "mean_token_accuracy": 0.5637937784194946, + "num_tokens": 1429822587.0, + "step": 2797 + }, + { + "epoch": 0.756625202812331, + "grad_norm": 1.4730974435806274, + "learning_rate": 1.9314840378570933e-05, + "loss": 2.1981, + "mean_token_accuracy": 0.5056030750274658, + "num_tokens": 1430346603.0, + "step": 2798 + }, + { + "epoch": 0.7568956192536507, + "grad_norm": 1.415964126586914, + "learning_rate": 1.931423692282167e-05, + "loss": 2.1444, + "mean_token_accuracy": 0.5333524942398071, + "num_tokens": 1430870684.0, + "step": 2799 + }, + { + "epoch": 0.7571660356949702, + "grad_norm": 1.250576376914978, + "learning_rate": 1.9313633211967167e-05, + "loss": 2.256, + "mean_token_accuracy": 0.5183013677597046, + "num_tokens": 1431343657.0, + "step": 2800 + }, + { + "epoch": 0.7574364521362899, + "grad_norm": 0.8424605131149292, + "learning_rate": 1.9313029246025953e-05, + "loss": 1.242, + "mean_token_accuracy": 0.671241044998169, + "num_tokens": 1431867845.0, + "step": 2801 + }, + { + "epoch": 0.7577068685776095, + "grad_norm": 2.158961534500122, + "learning_rate": 1.931242502501655e-05, + "loss": 2.0625, + "mean_token_accuracy": 0.5428842306137085, + "num_tokens": 1432392104.0, + "step": 2802 + }, + { + "epoch": 0.7579772850189291, + "grad_norm": 2.023557186126709, + "learning_rate": 1.93118205489575e-05, + "loss": 2.3312, + "mean_token_accuracy": 0.49567344784736633, + "num_tokens": 1432916353.0, + "step": 2803 + }, + { + "epoch": 0.7582477014602488, + "grad_norm": 1.544243574142456, + "learning_rate": 1.931121581786736e-05, + "loss": 2.0282, + "mean_token_accuracy": 0.53724205493927, + "num_tokens": 1433440516.0, + "step": 2804 + }, + { + "epoch": 0.7585181179015684, + "grad_norm": 1.7703577280044556, + "learning_rate": 1.9310610831764672e-05, + "loss": 2.1498, + "mean_token_accuracy": 0.5187535285949707, + "num_tokens": 1433931021.0, + "step": 2805 + }, + { + "epoch": 0.7587885343428881, + "grad_norm": 1.9411542415618896, + "learning_rate": 1.9310005590668e-05, + "loss": 2.1368, + "mean_token_accuracy": 0.5087252259254456, + "num_tokens": 1434455199.0, + "step": 2806 + }, + { + "epoch": 0.7590589507842077, + "grad_norm": 1.7252707481384277, + "learning_rate": 1.9309400094595923e-05, + "loss": 2.2049, + "mean_token_accuracy": 0.5514475703239441, + "num_tokens": 1434892377.0, + "step": 2807 + }, + { + "epoch": 0.7593293672255274, + "grad_norm": 1.820135235786438, + "learning_rate": 1.9308794343567017e-05, + "loss": 2.1182, + "mean_token_accuracy": 0.5331310033798218, + "num_tokens": 1435361510.0, + "step": 2808 + }, + { + "epoch": 0.759599783666847, + "grad_norm": 1.753928542137146, + "learning_rate": 1.9308188337599865e-05, + "loss": 2.288, + "mean_token_accuracy": 0.5180338621139526, + "num_tokens": 1435885746.0, + "step": 2809 + }, + { + "epoch": 0.7598702001081665, + "grad_norm": 1.5447512865066528, + "learning_rate": 1.930758207671306e-05, + "loss": 2.2317, + "mean_token_accuracy": 0.5154139399528503, + "num_tokens": 1436410025.0, + "step": 2810 + }, + { + "epoch": 0.7601406165494862, + "grad_norm": 1.4926327466964722, + "learning_rate": 1.9306975560925202e-05, + "loss": 2.0081, + "mean_token_accuracy": 0.5277310609817505, + "num_tokens": 1436913854.0, + "step": 2811 + }, + { + "epoch": 0.7604110329908058, + "grad_norm": 2.0216970443725586, + "learning_rate": 1.930636879025491e-05, + "loss": 2.2237, + "mean_token_accuracy": 0.5333399772644043, + "num_tokens": 1437329725.0, + "step": 2812 + }, + { + "epoch": 0.7606814494321255, + "grad_norm": 1.826431155204773, + "learning_rate": 1.9305761764720797e-05, + "loss": 2.2976, + "mean_token_accuracy": 0.5067939162254333, + "num_tokens": 1437853917.0, + "step": 2813 + }, + { + "epoch": 0.7609518658734451, + "grad_norm": 1.9016863107681274, + "learning_rate": 1.9305154484341483e-05, + "loss": 2.2681, + "mean_token_accuracy": 0.512819766998291, + "num_tokens": 1438350878.0, + "step": 2814 + }, + { + "epoch": 0.7612222823147647, + "grad_norm": 1.8556488752365112, + "learning_rate": 1.930454694913561e-05, + "loss": 2.2332, + "mean_token_accuracy": 0.5192396640777588, + "num_tokens": 1438875073.0, + "step": 2815 + }, + { + "epoch": 0.7614926987560844, + "grad_norm": 1.6484742164611816, + "learning_rate": 1.930393915912181e-05, + "loss": 2.1747, + "mean_token_accuracy": 0.5209321975708008, + "num_tokens": 1439399225.0, + "step": 2816 + }, + { + "epoch": 0.761763115197404, + "grad_norm": 1.6525442600250244, + "learning_rate": 1.9303331114318738e-05, + "loss": 2.1558, + "mean_token_accuracy": 0.5222989320755005, + "num_tokens": 1439867994.0, + "step": 2817 + }, + { + "epoch": 0.7620335316387237, + "grad_norm": 1.5450516939163208, + "learning_rate": 1.9302722814745048e-05, + "loss": 2.275, + "mean_token_accuracy": 0.5217602252960205, + "num_tokens": 1440392078.0, + "step": 2818 + }, + { + "epoch": 0.7623039480800433, + "grad_norm": 1.4938470125198364, + "learning_rate": 1.9302114260419407e-05, + "loss": 2.2211, + "mean_token_accuracy": 0.5233793258666992, + "num_tokens": 1440916269.0, + "step": 2819 + }, + { + "epoch": 0.7625743645213628, + "grad_norm": 1.6109932661056519, + "learning_rate": 1.9301505451360485e-05, + "loss": 2.0605, + "mean_token_accuracy": 0.5280459523200989, + "num_tokens": 1441440466.0, + "step": 2820 + }, + { + "epoch": 0.7628447809626825, + "grad_norm": 0.9005704522132874, + "learning_rate": 1.9300896387586965e-05, + "loss": 1.0914, + "mean_token_accuracy": 0.6923714280128479, + "num_tokens": 1441964665.0, + "step": 2821 + }, + { + "epoch": 0.7631151974040021, + "grad_norm": 2.067051887512207, + "learning_rate": 1.930028706911753e-05, + "loss": 2.0943, + "mean_token_accuracy": 0.5236109495162964, + "num_tokens": 1442464583.0, + "step": 2822 + }, + { + "epoch": 0.7633856138453218, + "grad_norm": 1.9576245546340942, + "learning_rate": 1.929967749597088e-05, + "loss": 2.2448, + "mean_token_accuracy": 0.5224920511245728, + "num_tokens": 1442988852.0, + "step": 2823 + }, + { + "epoch": 0.7636560302866414, + "grad_norm": 1.6304889917373657, + "learning_rate": 1.929906766816571e-05, + "loss": 2.235, + "mean_token_accuracy": 0.5198402404785156, + "num_tokens": 1443513079.0, + "step": 2824 + }, + { + "epoch": 0.7639264467279611, + "grad_norm": 1.5756680965423584, + "learning_rate": 1.9298457585720746e-05, + "loss": 2.2759, + "mean_token_accuracy": 0.5009980201721191, + "num_tokens": 1444037119.0, + "step": 2825 + }, + { + "epoch": 0.7641968631692807, + "grad_norm": 1.5283002853393555, + "learning_rate": 1.9297847248654696e-05, + "loss": 2.1872, + "mean_token_accuracy": 0.5212076902389526, + "num_tokens": 1444553315.0, + "step": 2826 + }, + { + "epoch": 0.7644672796106003, + "grad_norm": 1.5200674533843994, + "learning_rate": 1.929723665698629e-05, + "loss": 2.2126, + "mean_token_accuracy": 0.5100520849227905, + "num_tokens": 1445077593.0, + "step": 2827 + }, + { + "epoch": 0.76473769605192, + "grad_norm": 1.2158812284469604, + "learning_rate": 1.9296625810734264e-05, + "loss": 1.9975, + "mean_token_accuracy": 0.5448517799377441, + "num_tokens": 1445601767.0, + "step": 2828 + }, + { + "epoch": 0.7650081124932396, + "grad_norm": 1.4631661176681519, + "learning_rate": 1.929601470991736e-05, + "loss": 2.23, + "mean_token_accuracy": 0.5128458738327026, + "num_tokens": 1446126022.0, + "step": 2829 + }, + { + "epoch": 0.7652785289345592, + "grad_norm": 1.498935341835022, + "learning_rate": 1.9295403354554324e-05, + "loss": 2.2218, + "mean_token_accuracy": 0.5117813944816589, + "num_tokens": 1446650219.0, + "step": 2830 + }, + { + "epoch": 0.7655489453758788, + "grad_norm": 1.3108066320419312, + "learning_rate": 1.929479174466392e-05, + "loss": 2.209, + "mean_token_accuracy": 0.5246172547340393, + "num_tokens": 1447174498.0, + "step": 2831 + }, + { + "epoch": 0.7658193618171985, + "grad_norm": 1.345410943031311, + "learning_rate": 1.929417988026491e-05, + "loss": 2.2879, + "mean_token_accuracy": 0.5264021754264832, + "num_tokens": 1447698717.0, + "step": 2832 + }, + { + "epoch": 0.7660897782585181, + "grad_norm": 1.8036255836486816, + "learning_rate": 1.929356776137607e-05, + "loss": 2.3159, + "mean_token_accuracy": 0.4737052321434021, + "num_tokens": 1448222861.0, + "step": 2833 + }, + { + "epoch": 0.7663601946998377, + "grad_norm": 1.5993982553482056, + "learning_rate": 1.9292955388016188e-05, + "loss": 2.322, + "mean_token_accuracy": 0.5144942998886108, + "num_tokens": 1448690867.0, + "step": 2834 + }, + { + "epoch": 0.7666306111411574, + "grad_norm": 1.5004414319992065, + "learning_rate": 1.929234276020404e-05, + "loss": 2.2335, + "mean_token_accuracy": 0.5035849213600159, + "num_tokens": 1449214763.0, + "step": 2835 + }, + { + "epoch": 0.766901027582477, + "grad_norm": 1.3880819082260132, + "learning_rate": 1.929172987795843e-05, + "loss": 2.3087, + "mean_token_accuracy": 0.5120522975921631, + "num_tokens": 1449738912.0, + "step": 2836 + }, + { + "epoch": 0.7671714440237967, + "grad_norm": 1.3506712913513184, + "learning_rate": 1.9291116741298166e-05, + "loss": 2.1667, + "mean_token_accuracy": 0.5243909955024719, + "num_tokens": 1450262981.0, + "step": 2837 + }, + { + "epoch": 0.7674418604651163, + "grad_norm": 1.5315035581588745, + "learning_rate": 1.9290503350242055e-05, + "loss": 2.2089, + "mean_token_accuracy": 0.5227367877960205, + "num_tokens": 1450787095.0, + "step": 2838 + }, + { + "epoch": 0.767712276906436, + "grad_norm": 1.529209852218628, + "learning_rate": 1.9289889704808922e-05, + "loss": 2.3256, + "mean_token_accuracy": 0.5084881782531738, + "num_tokens": 1451259932.0, + "step": 2839 + }, + { + "epoch": 0.7679826933477556, + "grad_norm": 1.2349447011947632, + "learning_rate": 1.9289275805017592e-05, + "loss": 2.1596, + "mean_token_accuracy": 0.5286936163902283, + "num_tokens": 1451732810.0, + "step": 2840 + }, + { + "epoch": 0.7682531097890751, + "grad_norm": 0.8597829341888428, + "learning_rate": 1.9288661650886904e-05, + "loss": 1.2484, + "mean_token_accuracy": 0.6627370715141296, + "num_tokens": 1452257029.0, + "step": 2841 + }, + { + "epoch": 0.7685235262303948, + "grad_norm": 2.3061373233795166, + "learning_rate": 1.9288047242435704e-05, + "loss": 2.0703, + "mean_token_accuracy": 0.5390133857727051, + "num_tokens": 1452781280.0, + "step": 2842 + }, + { + "epoch": 0.7687939426717144, + "grad_norm": 1.7358335256576538, + "learning_rate": 1.9287432579682836e-05, + "loss": 2.1479, + "mean_token_accuracy": 0.5289407968521118, + "num_tokens": 1453305429.0, + "step": 2843 + }, + { + "epoch": 0.7690643591130341, + "grad_norm": 1.820744514465332, + "learning_rate": 1.928681766264717e-05, + "loss": 2.2968, + "mean_token_accuracy": 0.5033144950866699, + "num_tokens": 1453829688.0, + "step": 2844 + }, + { + "epoch": 0.7693347755543537, + "grad_norm": 1.6128456592559814, + "learning_rate": 1.9286202491347564e-05, + "loss": 2.1423, + "mean_token_accuracy": 0.5217627286911011, + "num_tokens": 1454353796.0, + "step": 2845 + }, + { + "epoch": 0.7696051919956733, + "grad_norm": 1.982367992401123, + "learning_rate": 1.92855870658029e-05, + "loss": 2.3345, + "mean_token_accuracy": 0.4962896406650543, + "num_tokens": 1454878075.0, + "step": 2846 + }, + { + "epoch": 0.769875608436993, + "grad_norm": 2.3156626224517822, + "learning_rate": 1.9284971386032057e-05, + "loss": 2.2827, + "mean_token_accuracy": 0.5233322381973267, + "num_tokens": 1455402241.0, + "step": 2847 + }, + { + "epoch": 0.7701460248783126, + "grad_norm": 2.060638904571533, + "learning_rate": 1.928435545205393e-05, + "loss": 2.2078, + "mean_token_accuracy": 0.5058469176292419, + "num_tokens": 1455926440.0, + "step": 2848 + }, + { + "epoch": 0.7704164413196323, + "grad_norm": 2.0953269004821777, + "learning_rate": 1.9283739263887412e-05, + "loss": 2.2845, + "mean_token_accuracy": 0.5022789239883423, + "num_tokens": 1456450719.0, + "step": 2849 + }, + { + "epoch": 0.7706868577609519, + "grad_norm": 1.8056893348693848, + "learning_rate": 1.9283122821551413e-05, + "loss": 2.2819, + "mean_token_accuracy": 0.49857276678085327, + "num_tokens": 1456966243.0, + "step": 2850 + }, + { + "epoch": 0.7709572742022714, + "grad_norm": 1.7418729066848755, + "learning_rate": 1.928250612506485e-05, + "loss": 2.1186, + "mean_token_accuracy": 0.5268675088882446, + "num_tokens": 1457471871.0, + "step": 2851 + }, + { + "epoch": 0.7712276906435911, + "grad_norm": 1.6890026330947876, + "learning_rate": 1.9281889174446638e-05, + "loss": 2.2057, + "mean_token_accuracy": 0.5270786285400391, + "num_tokens": 1457995991.0, + "step": 2852 + }, + { + "epoch": 0.7714981070849107, + "grad_norm": 2.1209583282470703, + "learning_rate": 1.928127196971571e-05, + "loss": 2.2179, + "mean_token_accuracy": 0.5135262608528137, + "num_tokens": 1458520194.0, + "step": 2853 + }, + { + "epoch": 0.7717685235262304, + "grad_norm": 2.000781297683716, + "learning_rate": 1.9280654510891006e-05, + "loss": 2.1604, + "mean_token_accuracy": 0.517595648765564, + "num_tokens": 1459044402.0, + "step": 2854 + }, + { + "epoch": 0.77203893996755, + "grad_norm": 1.6187231540679932, + "learning_rate": 1.928003679799147e-05, + "loss": 2.1832, + "mean_token_accuracy": 0.5236225724220276, + "num_tokens": 1459563928.0, + "step": 2855 + }, + { + "epoch": 0.7723093564088697, + "grad_norm": 1.9426559209823608, + "learning_rate": 1.9279418831036055e-05, + "loss": 2.1404, + "mean_token_accuracy": 0.5345020294189453, + "num_tokens": 1460088008.0, + "step": 2856 + }, + { + "epoch": 0.7725797728501893, + "grad_norm": 1.3528618812561035, + "learning_rate": 1.9278800610043722e-05, + "loss": 2.2968, + "mean_token_accuracy": 0.49486106634140015, + "num_tokens": 1460612199.0, + "step": 2857 + }, + { + "epoch": 0.772850189291509, + "grad_norm": 1.6004738807678223, + "learning_rate": 1.9278182135033442e-05, + "loss": 2.1491, + "mean_token_accuracy": 0.5403763055801392, + "num_tokens": 1461124945.0, + "step": 2858 + }, + { + "epoch": 0.7731206057328286, + "grad_norm": 2.9794228076934814, + "learning_rate": 1.9277563406024188e-05, + "loss": 2.2551, + "mean_token_accuracy": 0.5215765237808228, + "num_tokens": 1461595439.0, + "step": 2859 + }, + { + "epoch": 0.7733910221741482, + "grad_norm": 1.6752219200134277, + "learning_rate": 1.927694442303494e-05, + "loss": 2.1914, + "mean_token_accuracy": 0.5232423543930054, + "num_tokens": 1462036282.0, + "step": 2860 + }, + { + "epoch": 0.7736614386154678, + "grad_norm": 0.8013840913772583, + "learning_rate": 1.9276325186084704e-05, + "loss": 1.1681, + "mean_token_accuracy": 0.6898350715637207, + "num_tokens": 1462560539.0, + "step": 2861 + }, + { + "epoch": 0.7739318550567874, + "grad_norm": 2.7999467849731445, + "learning_rate": 1.927570569519247e-05, + "loss": 2.0799, + "mean_token_accuracy": 0.5342811346054077, + "num_tokens": 1463065022.0, + "step": 2862 + }, + { + "epoch": 0.774202271498107, + "grad_norm": 2.1290030479431152, + "learning_rate": 1.9275085950377252e-05, + "loss": 2.2095, + "mean_token_accuracy": 0.5155829191207886, + "num_tokens": 1463589203.0, + "step": 2863 + }, + { + "epoch": 0.7744726879394267, + "grad_norm": 1.468196153640747, + "learning_rate": 1.9274465951658055e-05, + "loss": 2.2475, + "mean_token_accuracy": 0.5198218822479248, + "num_tokens": 1464113396.0, + "step": 2864 + }, + { + "epoch": 0.7747431043807463, + "grad_norm": 2.0592775344848633, + "learning_rate": 1.927384569905391e-05, + "loss": 2.3542, + "mean_token_accuracy": 0.5054872035980225, + "num_tokens": 1464637652.0, + "step": 2865 + }, + { + "epoch": 0.775013520822066, + "grad_norm": 2.0591089725494385, + "learning_rate": 1.9273225192583852e-05, + "loss": 2.0819, + "mean_token_accuracy": 0.5431224703788757, + "num_tokens": 1465161864.0, + "step": 2866 + }, + { + "epoch": 0.7752839372633856, + "grad_norm": 1.665498971939087, + "learning_rate": 1.927260443226691e-05, + "loss": 2.2967, + "mean_token_accuracy": 0.5133489370346069, + "num_tokens": 1465686057.0, + "step": 2867 + }, + { + "epoch": 0.7755543537047053, + "grad_norm": 1.4920860528945923, + "learning_rate": 1.927198341812214e-05, + "loss": 2.0749, + "mean_token_accuracy": 0.5305405855178833, + "num_tokens": 1466157730.0, + "step": 2868 + }, + { + "epoch": 0.7758247701460249, + "grad_norm": 53.46586608886719, + "learning_rate": 1.9271362150168587e-05, + "loss": 2.0134, + "mean_token_accuracy": 0.5683057904243469, + "num_tokens": 1466681946.0, + "step": 2869 + }, + { + "epoch": 0.7760951865873446, + "grad_norm": 3.317659378051758, + "learning_rate": 1.9270740628425322e-05, + "loss": 2.2051, + "mean_token_accuracy": 0.5284333229064941, + "num_tokens": 1467136493.0, + "step": 2870 + }, + { + "epoch": 0.7763656030286642, + "grad_norm": 3.199970245361328, + "learning_rate": 1.9270118852911413e-05, + "loss": 2.3222, + "mean_token_accuracy": 0.5209552049636841, + "num_tokens": 1467605446.0, + "step": 2871 + }, + { + "epoch": 0.7766360194699837, + "grad_norm": 1.8609086275100708, + "learning_rate": 1.9269496823645932e-05, + "loss": 2.1257, + "mean_token_accuracy": 0.5323668122291565, + "num_tokens": 1468129719.0, + "step": 2872 + }, + { + "epoch": 0.7769064359113034, + "grad_norm": 1.8151848316192627, + "learning_rate": 1.9268874540647974e-05, + "loss": 2.0696, + "mean_token_accuracy": 0.5478093028068542, + "num_tokens": 1468653959.0, + "step": 2873 + }, + { + "epoch": 0.777176852352623, + "grad_norm": 2.105897903442383, + "learning_rate": 1.9268252003936628e-05, + "loss": 2.1547, + "mean_token_accuracy": 0.5271254777908325, + "num_tokens": 1469178214.0, + "step": 2874 + }, + { + "epoch": 0.7774472687939427, + "grad_norm": 2.330996036529541, + "learning_rate": 1.9267629213530993e-05, + "loss": 2.2305, + "mean_token_accuracy": 0.5181589126586914, + "num_tokens": 1469702366.0, + "step": 2875 + }, + { + "epoch": 0.7777176852352623, + "grad_norm": 1.658593773841858, + "learning_rate": 1.926700616945018e-05, + "loss": 2.1967, + "mean_token_accuracy": 0.5180413722991943, + "num_tokens": 1470226614.0, + "step": 2876 + }, + { + "epoch": 0.777988101676582, + "grad_norm": 1.4990426301956177, + "learning_rate": 1.926638287171331e-05, + "loss": 2.1378, + "mean_token_accuracy": 0.5280419588088989, + "num_tokens": 1470750874.0, + "step": 2877 + }, + { + "epoch": 0.7782585181179016, + "grad_norm": 13.958029747009277, + "learning_rate": 1.9265759320339498e-05, + "loss": 2.0488, + "mean_token_accuracy": 0.5542311668395996, + "num_tokens": 1471275076.0, + "step": 2878 + }, + { + "epoch": 0.7785289345592212, + "grad_norm": 2.389885187149048, + "learning_rate": 1.9265135515347885e-05, + "loss": 2.1995, + "mean_token_accuracy": 0.5089861154556274, + "num_tokens": 1471799142.0, + "step": 2879 + }, + { + "epoch": 0.7787993510005409, + "grad_norm": 2.1826589107513428, + "learning_rate": 1.926451145675761e-05, + "loss": 2.2057, + "mean_token_accuracy": 0.5102906227111816, + "num_tokens": 1472323421.0, + "step": 2880 + }, + { + "epoch": 0.7790697674418605, + "grad_norm": 0.7953854203224182, + "learning_rate": 1.9263887144587817e-05, + "loss": 1.2357, + "mean_token_accuracy": 0.6749305725097656, + "num_tokens": 1472847555.0, + "step": 2881 + }, + { + "epoch": 0.77934018388318, + "grad_norm": 3.1790366172790527, + "learning_rate": 1.9263262578857663e-05, + "loss": 2.2582, + "mean_token_accuracy": 0.5108902454376221, + "num_tokens": 1473371824.0, + "step": 2882 + }, + { + "epoch": 0.7796106003244997, + "grad_norm": 2.616812229156494, + "learning_rate": 1.9262637759586315e-05, + "loss": 2.2583, + "mean_token_accuracy": 0.5144633650779724, + "num_tokens": 1473896076.0, + "step": 2883 + }, + { + "epoch": 0.7798810167658193, + "grad_norm": 1.9348783493041992, + "learning_rate": 1.926201268679294e-05, + "loss": 2.2046, + "mean_token_accuracy": 0.5294284224510193, + "num_tokens": 1474389676.0, + "step": 2884 + }, + { + "epoch": 0.780151433207139, + "grad_norm": 2.173959255218506, + "learning_rate": 1.926138736049672e-05, + "loss": 2.2291, + "mean_token_accuracy": 0.5052053332328796, + "num_tokens": 1474913928.0, + "step": 2885 + }, + { + "epoch": 0.7804218496484586, + "grad_norm": 1.8255642652511597, + "learning_rate": 1.9260761780716837e-05, + "loss": 2.2267, + "mean_token_accuracy": 0.5137723684310913, + "num_tokens": 1475438206.0, + "step": 2886 + }, + { + "epoch": 0.7806922660897783, + "grad_norm": 2.2837233543395996, + "learning_rate": 1.9260135947472492e-05, + "loss": 2.2555, + "mean_token_accuracy": 0.5171563029289246, + "num_tokens": 1475962370.0, + "step": 2887 + }, + { + "epoch": 0.7809626825310979, + "grad_norm": 1.9278199672698975, + "learning_rate": 1.9259509860782886e-05, + "loss": 2.0541, + "mean_token_accuracy": 0.5645776391029358, + "num_tokens": 1476421215.0, + "step": 2888 + }, + { + "epoch": 0.7812330989724176, + "grad_norm": 1.4397410154342651, + "learning_rate": 1.9258883520667226e-05, + "loss": 2.1525, + "mean_token_accuracy": 0.5347422361373901, + "num_tokens": 1476937288.0, + "step": 2889 + }, + { + "epoch": 0.7815035154137372, + "grad_norm": 1.3418279886245728, + "learning_rate": 1.9258256927144732e-05, + "loss": 2.0951, + "mean_token_accuracy": 0.5255023837089539, + "num_tokens": 1477461323.0, + "step": 2890 + }, + { + "epoch": 0.7817739318550568, + "grad_norm": 1.5180716514587402, + "learning_rate": 1.925763008023463e-05, + "loss": 2.3237, + "mean_token_accuracy": 0.5157989263534546, + "num_tokens": 1477961308.0, + "step": 2891 + }, + { + "epoch": 0.7820443482963764, + "grad_norm": 1.3710167407989502, + "learning_rate": 1.925700297995615e-05, + "loss": 2.2574, + "mean_token_accuracy": 0.5118633508682251, + "num_tokens": 1478485404.0, + "step": 2892 + }, + { + "epoch": 0.782314764737696, + "grad_norm": 1.2406529188156128, + "learning_rate": 1.925637562632854e-05, + "loss": 2.1238, + "mean_token_accuracy": 0.544037938117981, + "num_tokens": 1478998190.0, + "step": 2893 + }, + { + "epoch": 0.7825851811790157, + "grad_norm": 1.3793164491653442, + "learning_rate": 1.9255748019371044e-05, + "loss": 2.2359, + "mean_token_accuracy": 0.515317976474762, + "num_tokens": 1479522273.0, + "step": 2894 + }, + { + "epoch": 0.7828555976203353, + "grad_norm": 1.4842078685760498, + "learning_rate": 1.925512015910292e-05, + "loss": 2.1183, + "mean_token_accuracy": 0.5231686234474182, + "num_tokens": 1480046548.0, + "step": 2895 + }, + { + "epoch": 0.7831260140616549, + "grad_norm": 1.230409026145935, + "learning_rate": 1.9254492045543435e-05, + "loss": 2.1232, + "mean_token_accuracy": 0.5239608287811279, + "num_tokens": 1480545516.0, + "step": 2896 + }, + { + "epoch": 0.7833964305029746, + "grad_norm": 1.4827913045883179, + "learning_rate": 1.925386367871186e-05, + "loss": 2.1518, + "mean_token_accuracy": 0.5302932262420654, + "num_tokens": 1481069761.0, + "step": 2897 + }, + { + "epoch": 0.7836668469442942, + "grad_norm": 1.613511323928833, + "learning_rate": 1.925323505862747e-05, + "loss": 2.2249, + "mean_token_accuracy": 0.5228427648544312, + "num_tokens": 1481567750.0, + "step": 2898 + }, + { + "epoch": 0.7839372633856139, + "grad_norm": 1.709930658340454, + "learning_rate": 1.9252606185309558e-05, + "loss": 2.228, + "mean_token_accuracy": 0.5293518900871277, + "num_tokens": 1482091991.0, + "step": 2899 + }, + { + "epoch": 0.7842076798269335, + "grad_norm": 1.535893440246582, + "learning_rate": 1.925197705877742e-05, + "loss": 2.2728, + "mean_token_accuracy": 0.5084378719329834, + "num_tokens": 1482616275.0, + "step": 2900 + }, + { + "epoch": 0.7844780962682532, + "grad_norm": 0.6763321757316589, + "learning_rate": 1.925134767905036e-05, + "loss": 0.9409, + "mean_token_accuracy": 0.7579436898231506, + "num_tokens": 1483016448.0, + "step": 2901 + }, + { + "epoch": 0.7847485127095727, + "grad_norm": 2.0628724098205566, + "learning_rate": 1.9250718046147684e-05, + "loss": 2.1403, + "mean_token_accuracy": 0.5040013194084167, + "num_tokens": 1483540518.0, + "step": 2902 + }, + { + "epoch": 0.7850189291508923, + "grad_norm": 2.0049445629119873, + "learning_rate": 1.9250088160088717e-05, + "loss": 2.2646, + "mean_token_accuracy": 0.4998849630355835, + "num_tokens": 1484034378.0, + "step": 2903 + }, + { + "epoch": 0.785289345592212, + "grad_norm": 1.4805481433868408, + "learning_rate": 1.924945802089278e-05, + "loss": 2.2443, + "mean_token_accuracy": 0.5155399441719055, + "num_tokens": 1484523881.0, + "step": 2904 + }, + { + "epoch": 0.7855597620335316, + "grad_norm": 1.4884332418441772, + "learning_rate": 1.9248827628579215e-05, + "loss": 2.1269, + "mean_token_accuracy": 0.527633547782898, + "num_tokens": 1485048108.0, + "step": 2905 + }, + { + "epoch": 0.7858301784748513, + "grad_norm": 1.4168486595153809, + "learning_rate": 1.924819698316736e-05, + "loss": 2.2234, + "mean_token_accuracy": 0.5298317670822144, + "num_tokens": 1485522349.0, + "step": 2906 + }, + { + "epoch": 0.7861005949161709, + "grad_norm": 1.5732828378677368, + "learning_rate": 1.9247566084676565e-05, + "loss": 2.0608, + "mean_token_accuracy": 0.5350906848907471, + "num_tokens": 1486046485.0, + "step": 2907 + }, + { + "epoch": 0.7863710113574905, + "grad_norm": 1.5613981485366821, + "learning_rate": 1.9246934933126186e-05, + "loss": 2.1597, + "mean_token_accuracy": 0.5374040603637695, + "num_tokens": 1486570663.0, + "step": 2908 + }, + { + "epoch": 0.7866414277988102, + "grad_norm": 1.656746745109558, + "learning_rate": 1.9246303528535593e-05, + "loss": 2.204, + "mean_token_accuracy": 0.5084028244018555, + "num_tokens": 1487094913.0, + "step": 2909 + }, + { + "epoch": 0.7869118442401298, + "grad_norm": 1.3790664672851562, + "learning_rate": 1.9245671870924157e-05, + "loss": 2.2144, + "mean_token_accuracy": 0.5035777688026428, + "num_tokens": 1487619173.0, + "step": 2910 + }, + { + "epoch": 0.7871822606814495, + "grad_norm": 1.6130332946777344, + "learning_rate": 1.9245039960311257e-05, + "loss": 2.278, + "mean_token_accuracy": 0.5122401118278503, + "num_tokens": 1488143441.0, + "step": 2911 + }, + { + "epoch": 0.7874526771227691, + "grad_norm": 1.5550397634506226, + "learning_rate": 1.9244407796716284e-05, + "loss": 2.1043, + "mean_token_accuracy": 0.5271894931793213, + "num_tokens": 1488659317.0, + "step": 2912 + }, + { + "epoch": 0.7877230935640886, + "grad_norm": 1.7380379438400269, + "learning_rate": 1.9243775380158634e-05, + "loss": 2.1881, + "mean_token_accuracy": 0.524970531463623, + "num_tokens": 1489183549.0, + "step": 2913 + }, + { + "epoch": 0.7879935100054083, + "grad_norm": 1.4030719995498657, + "learning_rate": 1.9243142710657712e-05, + "loss": 2.2029, + "mean_token_accuracy": 0.4828759431838989, + "num_tokens": 1489707737.0, + "step": 2914 + }, + { + "epoch": 0.7882639264467279, + "grad_norm": 2.1672942638397217, + "learning_rate": 1.9242509788232932e-05, + "loss": 2.1734, + "mean_token_accuracy": 0.5145875215530396, + "num_tokens": 1490231901.0, + "step": 2915 + }, + { + "epoch": 0.7885343428880476, + "grad_norm": 1.5477409362792969, + "learning_rate": 1.924187661290371e-05, + "loss": 1.9273, + "mean_token_accuracy": 0.5778495073318481, + "num_tokens": 1490756111.0, + "step": 2916 + }, + { + "epoch": 0.7888047593293672, + "grad_norm": 1.4332228899002075, + "learning_rate": 1.924124318468948e-05, + "loss": 2.1639, + "mean_token_accuracy": 0.5461599230766296, + "num_tokens": 1491169204.0, + "step": 2917 + }, + { + "epoch": 0.7890751757706869, + "grad_norm": 1.5180702209472656, + "learning_rate": 1.9240609503609668e-05, + "loss": 2.1573, + "mean_token_accuracy": 0.5248773097991943, + "num_tokens": 1491693381.0, + "step": 2918 + }, + { + "epoch": 0.7893455922120065, + "grad_norm": 1.4209150075912476, + "learning_rate": 1.9239975569683725e-05, + "loss": 2.2218, + "mean_token_accuracy": 0.5222276449203491, + "num_tokens": 1492217544.0, + "step": 2919 + }, + { + "epoch": 0.7896160086533262, + "grad_norm": 1.627369999885559, + "learning_rate": 1.92393413829311e-05, + "loss": 2.3663, + "mean_token_accuracy": 0.5041543245315552, + "num_tokens": 1492681954.0, + "step": 2920 + }, + { + "epoch": 0.7898864250946458, + "grad_norm": 0.7047639489173889, + "learning_rate": 1.9238706943371248e-05, + "loss": 1.1685, + "mean_token_accuracy": 0.6912800073623657, + "num_tokens": 1493206230.0, + "step": 2921 + }, + { + "epoch": 0.7901568415359654, + "grad_norm": 1.535727858543396, + "learning_rate": 1.923807225102364e-05, + "loss": 2.1647, + "mean_token_accuracy": 0.5242955684661865, + "num_tokens": 1493730406.0, + "step": 2922 + }, + { + "epoch": 0.790427257977285, + "grad_norm": 1.4825924634933472, + "learning_rate": 1.923743730590775e-05, + "loss": 2.1996, + "mean_token_accuracy": 0.5198556184768677, + "num_tokens": 1494254634.0, + "step": 2923 + }, + { + "epoch": 0.7906976744186046, + "grad_norm": 1.285538673400879, + "learning_rate": 1.9236802108043058e-05, + "loss": 2.2106, + "mean_token_accuracy": 0.5046542286872864, + "num_tokens": 1494778905.0, + "step": 2924 + }, + { + "epoch": 0.7909680908599243, + "grad_norm": 1.4719092845916748, + "learning_rate": 1.9236166657449055e-05, + "loss": 2.1665, + "mean_token_accuracy": 0.5217546224594116, + "num_tokens": 1495303096.0, + "step": 2925 + }, + { + "epoch": 0.7912385073012439, + "grad_norm": 1.686125636100769, + "learning_rate": 1.9235530954145235e-05, + "loss": 2.1847, + "mean_token_accuracy": 0.5242372751235962, + "num_tokens": 1495823929.0, + "step": 2926 + }, + { + "epoch": 0.7915089237425635, + "grad_norm": 1.6181615591049194, + "learning_rate": 1.923489499815111e-05, + "loss": 2.2708, + "mean_token_accuracy": 0.5332293510437012, + "num_tokens": 1496348210.0, + "step": 2927 + }, + { + "epoch": 0.7917793401838832, + "grad_norm": 1.5409728288650513, + "learning_rate": 1.9234258789486188e-05, + "loss": 2.2113, + "mean_token_accuracy": 0.5235283374786377, + "num_tokens": 1496872461.0, + "step": 2928 + }, + { + "epoch": 0.7920497566252028, + "grad_norm": 1.389814853668213, + "learning_rate": 1.9233622328169988e-05, + "loss": 2.2234, + "mean_token_accuracy": 0.526120662689209, + "num_tokens": 1497396710.0, + "step": 2929 + }, + { + "epoch": 0.7923201730665225, + "grad_norm": 1.44599449634552, + "learning_rate": 1.9232985614222048e-05, + "loss": 2.2401, + "mean_token_accuracy": 0.5110691785812378, + "num_tokens": 1497920814.0, + "step": 2930 + }, + { + "epoch": 0.7925905895078421, + "grad_norm": 1.495490550994873, + "learning_rate": 1.923234864766189e-05, + "loss": 2.1303, + "mean_token_accuracy": 0.505866527557373, + "num_tokens": 1498444936.0, + "step": 2931 + }, + { + "epoch": 0.7928610059491618, + "grad_norm": 1.69108247756958, + "learning_rate": 1.9231711428509073e-05, + "loss": 2.2102, + "mean_token_accuracy": 0.5224519968032837, + "num_tokens": 1498926418.0, + "step": 2932 + }, + { + "epoch": 0.7931314223904813, + "grad_norm": 1.732498288154602, + "learning_rate": 1.9231073956783137e-05, + "loss": 2.2507, + "mean_token_accuracy": 0.5153942108154297, + "num_tokens": 1499450604.0, + "step": 2933 + }, + { + "epoch": 0.7934018388318009, + "grad_norm": 1.462684988975525, + "learning_rate": 1.9230436232503646e-05, + "loss": 2.1374, + "mean_token_accuracy": 0.51832515001297, + "num_tokens": 1499974743.0, + "step": 2934 + }, + { + "epoch": 0.7936722552731206, + "grad_norm": 1.5762158632278442, + "learning_rate": 1.9229798255690168e-05, + "loss": 2.2207, + "mean_token_accuracy": 0.5240989923477173, + "num_tokens": 1500498899.0, + "step": 2935 + }, + { + "epoch": 0.7939426717144402, + "grad_norm": 1.5893634557724, + "learning_rate": 1.922916002636228e-05, + "loss": 2.1629, + "mean_token_accuracy": 0.5208847522735596, + "num_tokens": 1501023028.0, + "step": 2936 + }, + { + "epoch": 0.7942130881557599, + "grad_norm": 1.8124794960021973, + "learning_rate": 1.922852154453956e-05, + "loss": 2.2774, + "mean_token_accuracy": 0.511349081993103, + "num_tokens": 1501547102.0, + "step": 2937 + }, + { + "epoch": 0.7944835045970795, + "grad_norm": 1.8044850826263428, + "learning_rate": 1.9227882810241597e-05, + "loss": 2.1553, + "mean_token_accuracy": 0.5283539295196533, + "num_tokens": 1502071355.0, + "step": 2938 + }, + { + "epoch": 0.7947539210383991, + "grad_norm": 1.7713884115219116, + "learning_rate": 1.9227243823487998e-05, + "loss": 1.9645, + "mean_token_accuracy": 0.5570166707038879, + "num_tokens": 1502595502.0, + "step": 2939 + }, + { + "epoch": 0.7950243374797188, + "grad_norm": 2.003009557723999, + "learning_rate": 1.922660458429836e-05, + "loss": 2.3129, + "mean_token_accuracy": 0.4970276951789856, + "num_tokens": 1503119786.0, + "step": 2940 + }, + { + "epoch": 0.7952947539210384, + "grad_norm": 0.6903308629989624, + "learning_rate": 1.9225965092692302e-05, + "loss": 0.9879, + "mean_token_accuracy": 0.7252504825592041, + "num_tokens": 1503643941.0, + "step": 2941 + }, + { + "epoch": 0.7955651703623581, + "grad_norm": 2.5793275833129883, + "learning_rate": 1.9225325348689446e-05, + "loss": 2.2758, + "mean_token_accuracy": 0.5209342241287231, + "num_tokens": 1504138178.0, + "step": 2942 + }, + { + "epoch": 0.7958355868036776, + "grad_norm": 1.8012590408325195, + "learning_rate": 1.9224685352309417e-05, + "loss": 2.1136, + "mean_token_accuracy": 0.522160530090332, + "num_tokens": 1504662351.0, + "step": 2943 + }, + { + "epoch": 0.7961060032449973, + "grad_norm": 1.803697943687439, + "learning_rate": 1.9224045103571856e-05, + "loss": 2.205, + "mean_token_accuracy": 0.5160662531852722, + "num_tokens": 1505186439.0, + "step": 2944 + }, + { + "epoch": 0.7963764196863169, + "grad_norm": 1.752705454826355, + "learning_rate": 1.9223404602496403e-05, + "loss": 2.181, + "mean_token_accuracy": 0.5020315051078796, + "num_tokens": 1505710586.0, + "step": 2945 + }, + { + "epoch": 0.7966468361276365, + "grad_norm": 1.4963828325271606, + "learning_rate": 1.9222763849102713e-05, + "loss": 2.1134, + "mean_token_accuracy": 0.5429192185401917, + "num_tokens": 1506234788.0, + "step": 2946 + }, + { + "epoch": 0.7969172525689562, + "grad_norm": 1.5797154903411865, + "learning_rate": 1.922212284341045e-05, + "loss": 2.1703, + "mean_token_accuracy": 0.5362938046455383, + "num_tokens": 1506690225.0, + "step": 2947 + }, + { + "epoch": 0.7971876690102758, + "grad_norm": 1.796881914138794, + "learning_rate": 1.9221481585439276e-05, + "loss": 2.2291, + "mean_token_accuracy": 0.509967565536499, + "num_tokens": 1507205582.0, + "step": 2948 + }, + { + "epoch": 0.7974580854515955, + "grad_norm": 1.7470511198043823, + "learning_rate": 1.922084007520887e-05, + "loss": 2.1005, + "mean_token_accuracy": 0.5392032265663147, + "num_tokens": 1507729863.0, + "step": 2949 + }, + { + "epoch": 0.7977285018929151, + "grad_norm": 2.2897167205810547, + "learning_rate": 1.9220198312738913e-05, + "loss": 2.3104, + "mean_token_accuracy": 0.5003081560134888, + "num_tokens": 1508254106.0, + "step": 2950 + }, + { + "epoch": 0.7979989183342348, + "grad_norm": 1.7170947790145874, + "learning_rate": 1.92195562980491e-05, + "loss": 2.1066, + "mean_token_accuracy": 0.5246098041534424, + "num_tokens": 1508778371.0, + "step": 2951 + }, + { + "epoch": 0.7982693347755544, + "grad_norm": 1.6667150259017944, + "learning_rate": 1.9218914031159125e-05, + "loss": 2.2076, + "mean_token_accuracy": 0.5296638607978821, + "num_tokens": 1509302630.0, + "step": 2952 + }, + { + "epoch": 0.798539751216874, + "grad_norm": 1.2766919136047363, + "learning_rate": 1.92182715120887e-05, + "loss": 2.2459, + "mean_token_accuracy": 0.5163782835006714, + "num_tokens": 1509826874.0, + "step": 2953 + }, + { + "epoch": 0.7988101676581936, + "grad_norm": 1.3604322671890259, + "learning_rate": 1.9217628740857537e-05, + "loss": 2.1347, + "mean_token_accuracy": 0.5267298221588135, + "num_tokens": 1510351153.0, + "step": 2954 + }, + { + "epoch": 0.7990805840995132, + "grad_norm": 1.6835724115371704, + "learning_rate": 1.9216985717485358e-05, + "loss": 2.1777, + "mean_token_accuracy": 0.5542593002319336, + "num_tokens": 1510875240.0, + "step": 2955 + }, + { + "epoch": 0.7993510005408329, + "grad_norm": 1.3758718967437744, + "learning_rate": 1.9216342441991888e-05, + "loss": 2.0657, + "mean_token_accuracy": 0.5526669025421143, + "num_tokens": 1511399440.0, + "step": 2956 + }, + { + "epoch": 0.7996214169821525, + "grad_norm": 2.09769868850708, + "learning_rate": 1.9215698914396872e-05, + "loss": 2.1591, + "mean_token_accuracy": 0.5157003402709961, + "num_tokens": 1511923621.0, + "step": 2957 + }, + { + "epoch": 0.7998918334234721, + "grad_norm": 2.13220477104187, + "learning_rate": 1.9215055134720054e-05, + "loss": 2.3937, + "mean_token_accuracy": 0.4996081590652466, + "num_tokens": 1512401611.0, + "step": 2958 + }, + { + "epoch": 0.8001622498647918, + "grad_norm": 1.8713133335113525, + "learning_rate": 1.9214411102981185e-05, + "loss": 2.0867, + "mean_token_accuracy": 0.5318746566772461, + "num_tokens": 1512920254.0, + "step": 2959 + }, + { + "epoch": 0.8004326663061114, + "grad_norm": 1.8760477304458618, + "learning_rate": 1.9213766819200027e-05, + "loss": 2.306, + "mean_token_accuracy": 0.5020864009857178, + "num_tokens": 1513444449.0, + "step": 2960 + }, + { + "epoch": 0.8007030827474311, + "grad_norm": 0.6622839570045471, + "learning_rate": 1.9213122283396344e-05, + "loss": 1.2206, + "mean_token_accuracy": 0.6778841018676758, + "num_tokens": 1513954818.0, + "step": 2961 + }, + { + "epoch": 0.8009734991887507, + "grad_norm": 2.1515870094299316, + "learning_rate": 1.9212477495589917e-05, + "loss": 2.2281, + "mean_token_accuracy": 0.5090149641036987, + "num_tokens": 1514456718.0, + "step": 2962 + }, + { + "epoch": 0.8012439156300704, + "grad_norm": 2.8587560653686523, + "learning_rate": 1.9211832455800533e-05, + "loss": 2.2552, + "mean_token_accuracy": 0.5291112661361694, + "num_tokens": 1514929555.0, + "step": 2963 + }, + { + "epoch": 0.8015143320713899, + "grad_norm": 1.337034821510315, + "learning_rate": 1.921118716404798e-05, + "loss": 2.2004, + "mean_token_accuracy": 0.5128475427627563, + "num_tokens": 1515453808.0, + "step": 2964 + }, + { + "epoch": 0.8017847485127095, + "grad_norm": 2.2415926456451416, + "learning_rate": 1.9210541620352054e-05, + "loss": 2.1229, + "mean_token_accuracy": 0.5621730089187622, + "num_tokens": 1515906210.0, + "step": 2965 + }, + { + "epoch": 0.8020551649540292, + "grad_norm": 1.4176832437515259, + "learning_rate": 1.9209895824732566e-05, + "loss": 2.1149, + "mean_token_accuracy": 0.506890594959259, + "num_tokens": 1516430336.0, + "step": 2966 + }, + { + "epoch": 0.8023255813953488, + "grad_norm": 1.8124696016311646, + "learning_rate": 1.9209249777209328e-05, + "loss": 2.2998, + "mean_token_accuracy": 0.5087864995002747, + "num_tokens": 1516954516.0, + "step": 2967 + }, + { + "epoch": 0.8025959978366685, + "grad_norm": 1.594586968421936, + "learning_rate": 1.9208603477802167e-05, + "loss": 2.1708, + "mean_token_accuracy": 0.5355467796325684, + "num_tokens": 1517461911.0, + "step": 2968 + }, + { + "epoch": 0.8028664142779881, + "grad_norm": 1.5007050037384033, + "learning_rate": 1.9207956926530912e-05, + "loss": 2.0417, + "mean_token_accuracy": 0.5224905014038086, + "num_tokens": 1517986160.0, + "step": 2969 + }, + { + "epoch": 0.8031368307193077, + "grad_norm": 1.4570461511611938, + "learning_rate": 1.9207310123415403e-05, + "loss": 2.1283, + "mean_token_accuracy": 0.5317803621292114, + "num_tokens": 1518453238.0, + "step": 2970 + }, + { + "epoch": 0.8034072471606274, + "grad_norm": 1.7292784452438354, + "learning_rate": 1.920666306847548e-05, + "loss": 2.2449, + "mean_token_accuracy": 0.5001092553138733, + "num_tokens": 1518977408.0, + "step": 2971 + }, + { + "epoch": 0.803677663601947, + "grad_norm": 1.369895100593567, + "learning_rate": 1.9206015761731e-05, + "loss": 2.182, + "mean_token_accuracy": 0.5331180691719055, + "num_tokens": 1519485392.0, + "step": 2972 + }, + { + "epoch": 0.8039480800432667, + "grad_norm": 1.7154661417007446, + "learning_rate": 1.9205368203201825e-05, + "loss": 2.2865, + "mean_token_accuracy": 0.4983501732349396, + "num_tokens": 1520009518.0, + "step": 2973 + }, + { + "epoch": 0.8042184964845862, + "grad_norm": 1.424544334411621, + "learning_rate": 1.9204720392907824e-05, + "loss": 2.024, + "mean_token_accuracy": 0.5258234143257141, + "num_tokens": 1520533646.0, + "step": 2974 + }, + { + "epoch": 0.8044889129259059, + "grad_norm": 1.5487865209579468, + "learning_rate": 1.9204072330868872e-05, + "loss": 2.2044, + "mean_token_accuracy": 0.5175096988677979, + "num_tokens": 1520999793.0, + "step": 2975 + }, + { + "epoch": 0.8047593293672255, + "grad_norm": 1.5601147413253784, + "learning_rate": 1.9203424017104853e-05, + "loss": 2.2458, + "mean_token_accuracy": 0.5204190611839294, + "num_tokens": 1521524061.0, + "step": 2976 + }, + { + "epoch": 0.8050297458085451, + "grad_norm": 1.334111213684082, + "learning_rate": 1.9202775451635664e-05, + "loss": 2.0207, + "mean_token_accuracy": 0.5556819438934326, + "num_tokens": 1522048242.0, + "step": 2977 + }, + { + "epoch": 0.8053001622498648, + "grad_norm": 1.2835087776184082, + "learning_rate": 1.9202126634481198e-05, + "loss": 2.2408, + "mean_token_accuracy": 0.5195366144180298, + "num_tokens": 1522572301.0, + "step": 2978 + }, + { + "epoch": 0.8055705786911844, + "grad_norm": 1.7267935276031494, + "learning_rate": 1.9201477565661368e-05, + "loss": 2.1519, + "mean_token_accuracy": 0.5338690876960754, + "num_tokens": 1523067422.0, + "step": 2979 + }, + { + "epoch": 0.8058409951325041, + "grad_norm": 1.4552147388458252, + "learning_rate": 1.9200828245196083e-05, + "loss": 2.1824, + "mean_token_accuracy": 0.5121617913246155, + "num_tokens": 1523591693.0, + "step": 2980 + }, + { + "epoch": 0.8061114115738237, + "grad_norm": 0.876873791217804, + "learning_rate": 1.9200178673105275e-05, + "loss": 1.2083, + "mean_token_accuracy": 0.6806430220603943, + "num_tokens": 1524094727.0, + "step": 2981 + }, + { + "epoch": 0.8063818280151434, + "grad_norm": 2.767321825027466, + "learning_rate": 1.9199528849408866e-05, + "loss": 2.1858, + "mean_token_accuracy": 0.5252306461334229, + "num_tokens": 1524618788.0, + "step": 2982 + }, + { + "epoch": 0.806652244456463, + "grad_norm": 2.2306418418884277, + "learning_rate": 1.9198878774126802e-05, + "loss": 2.244, + "mean_token_accuracy": 0.523762583732605, + "num_tokens": 1525090292.0, + "step": 2983 + }, + { + "epoch": 0.8069226608977826, + "grad_norm": 1.5432687997817993, + "learning_rate": 1.9198228447279026e-05, + "loss": 2.2454, + "mean_token_accuracy": 0.5257366299629211, + "num_tokens": 1525555321.0, + "step": 2984 + }, + { + "epoch": 0.8071930773391022, + "grad_norm": 1.874511480331421, + "learning_rate": 1.919757786888549e-05, + "loss": 2.1654, + "mean_token_accuracy": 0.5137531757354736, + "num_tokens": 1526079507.0, + "step": 2985 + }, + { + "epoch": 0.8074634937804218, + "grad_norm": 2.0578577518463135, + "learning_rate": 1.919692703896616e-05, + "loss": 2.2893, + "mean_token_accuracy": 0.4942573308944702, + "num_tokens": 1526603561.0, + "step": 2986 + }, + { + "epoch": 0.8077339102217415, + "grad_norm": 1.6377894878387451, + "learning_rate": 1.9196275957541e-05, + "loss": 2.2572, + "mean_token_accuracy": 0.5063691139221191, + "num_tokens": 1527127834.0, + "step": 2987 + }, + { + "epoch": 0.8080043266630611, + "grad_norm": 1.856582760810852, + "learning_rate": 1.9195624624629992e-05, + "loss": 2.1822, + "mean_token_accuracy": 0.5237991809844971, + "num_tokens": 1527592039.0, + "step": 2988 + }, + { + "epoch": 0.8082747431043807, + "grad_norm": 1.664801836013794, + "learning_rate": 1.9194973040253118e-05, + "loss": 2.1515, + "mean_token_accuracy": 0.5303992629051208, + "num_tokens": 1528116272.0, + "step": 2989 + }, + { + "epoch": 0.8085451595457004, + "grad_norm": 1.7050788402557373, + "learning_rate": 1.919432120443037e-05, + "loss": 2.1886, + "mean_token_accuracy": 0.524692714214325, + "num_tokens": 1528633592.0, + "step": 2990 + }, + { + "epoch": 0.80881557598702, + "grad_norm": 1.7779067754745483, + "learning_rate": 1.9193669117181754e-05, + "loss": 2.0235, + "mean_token_accuracy": 0.5284867286682129, + "num_tokens": 1529157832.0, + "step": 2991 + }, + { + "epoch": 0.8090859924283397, + "grad_norm": 1.9272651672363281, + "learning_rate": 1.9193016778527268e-05, + "loss": 2.2666, + "mean_token_accuracy": 0.5063455700874329, + "num_tokens": 1529682102.0, + "step": 2992 + }, + { + "epoch": 0.8093564088696593, + "grad_norm": 1.7404500246047974, + "learning_rate": 1.9192364188486936e-05, + "loss": 2.1147, + "mean_token_accuracy": 0.5314252376556396, + "num_tokens": 1530180469.0, + "step": 2993 + }, + { + "epoch": 0.809626825310979, + "grad_norm": 1.947279691696167, + "learning_rate": 1.919171134708078e-05, + "loss": 2.2324, + "mean_token_accuracy": 0.5160045623779297, + "num_tokens": 1530672523.0, + "step": 2994 + }, + { + "epoch": 0.8098972417522985, + "grad_norm": 1.687666416168213, + "learning_rate": 1.9191058254328827e-05, + "loss": 2.1816, + "mean_token_accuracy": 0.5204278230667114, + "num_tokens": 1531196624.0, + "step": 2995 + }, + { + "epoch": 0.8101676581936181, + "grad_norm": 1.602015733718872, + "learning_rate": 1.919040491025112e-05, + "loss": 2.0413, + "mean_token_accuracy": 0.5427844524383545, + "num_tokens": 1531676559.0, + "step": 2996 + }, + { + "epoch": 0.8104380746349378, + "grad_norm": 2.1049563884735107, + "learning_rate": 1.91897513148677e-05, + "loss": 2.3301, + "mean_token_accuracy": 0.4846424460411072, + "num_tokens": 1532200718.0, + "step": 2997 + }, + { + "epoch": 0.8107084910762574, + "grad_norm": 2.180758476257324, + "learning_rate": 1.9189097468198628e-05, + "loss": 2.1894, + "mean_token_accuracy": 0.510259747505188, + "num_tokens": 1532724958.0, + "step": 2998 + }, + { + "epoch": 0.8109789075175771, + "grad_norm": 1.5569876432418823, + "learning_rate": 1.9188443370263965e-05, + "loss": 2.2222, + "mean_token_accuracy": 0.5144679546356201, + "num_tokens": 1533214569.0, + "step": 2999 + }, + { + "epoch": 0.8112493239588967, + "grad_norm": 1.518999695777893, + "learning_rate": 1.9187789021083774e-05, + "loss": 2.0651, + "mean_token_accuracy": 0.5163427591323853, + "num_tokens": 1533738801.0, + "step": 3000 + }, + { + "epoch": 0.8115197404002163, + "grad_norm": 0.8071084022521973, + "learning_rate": 1.9187134420678145e-05, + "loss": 1.0826, + "mean_token_accuracy": 0.7119253873825073, + "num_tokens": 1534259549.0, + "step": 3001 + }, + { + "epoch": 0.811790156841536, + "grad_norm": 2.8559210300445557, + "learning_rate": 1.918647956906715e-05, + "loss": 2.202, + "mean_token_accuracy": 0.5209182500839233, + "num_tokens": 1534783713.0, + "step": 3002 + }, + { + "epoch": 0.8120605732828556, + "grad_norm": 2.110699415206909, + "learning_rate": 1.9185824466270883e-05, + "loss": 2.1826, + "mean_token_accuracy": 0.5216629505157471, + "num_tokens": 1535307972.0, + "step": 3003 + }, + { + "epoch": 0.8123309897241753, + "grad_norm": 1.7807605266571045, + "learning_rate": 1.9185169112309453e-05, + "loss": 2.2174, + "mean_token_accuracy": 0.5086511373519897, + "num_tokens": 1535832213.0, + "step": 3004 + }, + { + "epoch": 0.8126014061654948, + "grad_norm": 1.865066409111023, + "learning_rate": 1.9184513507202964e-05, + "loss": 2.1901, + "mean_token_accuracy": 0.5307043790817261, + "num_tokens": 1536319191.0, + "step": 3005 + }, + { + "epoch": 0.8128718226068145, + "grad_norm": 1.6516770124435425, + "learning_rate": 1.9183857650971532e-05, + "loss": 2.2176, + "mean_token_accuracy": 0.5159552097320557, + "num_tokens": 1536843350.0, + "step": 3006 + }, + { + "epoch": 0.8131422390481341, + "grad_norm": 1.6852082014083862, + "learning_rate": 1.9183201543635275e-05, + "loss": 2.1545, + "mean_token_accuracy": 0.5291831493377686, + "num_tokens": 1537367369.0, + "step": 3007 + }, + { + "epoch": 0.8134126554894537, + "grad_norm": 4.971508502960205, + "learning_rate": 1.9182545185214336e-05, + "loss": 2.0666, + "mean_token_accuracy": 0.5584995746612549, + "num_tokens": 1537891508.0, + "step": 3008 + }, + { + "epoch": 0.8136830719307734, + "grad_norm": 1.8486106395721436, + "learning_rate": 1.9181888575728844e-05, + "loss": 2.178, + "mean_token_accuracy": 0.5279475450515747, + "num_tokens": 1538384383.0, + "step": 3009 + }, + { + "epoch": 0.813953488372093, + "grad_norm": 74.04293060302734, + "learning_rate": 1.918123171519895e-05, + "loss": 2.136, + "mean_token_accuracy": 0.5445772409439087, + "num_tokens": 1538908517.0, + "step": 3010 + }, + { + "epoch": 0.8142239048134127, + "grad_norm": 1.8473860025405884, + "learning_rate": 1.918057460364481e-05, + "loss": 2.1116, + "mean_token_accuracy": 0.5255906581878662, + "num_tokens": 1539432762.0, + "step": 3011 + }, + { + "epoch": 0.8144943212547323, + "grad_norm": 1.832921028137207, + "learning_rate": 1.917991724108658e-05, + "loss": 2.1541, + "mean_token_accuracy": 0.5408111810684204, + "num_tokens": 1539893205.0, + "step": 3012 + }, + { + "epoch": 0.814764737696052, + "grad_norm": 1.6224769353866577, + "learning_rate": 1.9179259627544433e-05, + "loss": 1.9571, + "mean_token_accuracy": 0.5565099716186523, + "num_tokens": 1540406047.0, + "step": 3013 + }, + { + "epoch": 0.8150351541373716, + "grad_norm": 2.2582356929779053, + "learning_rate": 1.917860176303855e-05, + "loss": 2.2398, + "mean_token_accuracy": 0.5148437023162842, + "num_tokens": 1540916144.0, + "step": 3014 + }, + { + "epoch": 0.8153055705786911, + "grad_norm": 20.659948348999023, + "learning_rate": 1.917794364758911e-05, + "loss": 2.1797, + "mean_token_accuracy": 0.5304989814758301, + "num_tokens": 1541440425.0, + "step": 3015 + }, + { + "epoch": 0.8155759870200108, + "grad_norm": 2.3235511779785156, + "learning_rate": 1.917728528121631e-05, + "loss": 1.9927, + "mean_token_accuracy": 0.5794143676757812, + "num_tokens": 1541964640.0, + "step": 3016 + }, + { + "epoch": 0.8158464034613304, + "grad_norm": 1.9597867727279663, + "learning_rate": 1.9176626663940355e-05, + "loss": 2.1154, + "mean_token_accuracy": 0.5303336381912231, + "num_tokens": 1542488743.0, + "step": 3017 + }, + { + "epoch": 0.8161168199026501, + "grad_norm": 1.660814881324768, + "learning_rate": 1.9175967795781444e-05, + "loss": 2.1402, + "mean_token_accuracy": 0.5282350778579712, + "num_tokens": 1543012994.0, + "step": 3018 + }, + { + "epoch": 0.8163872363439697, + "grad_norm": 1.9401156902313232, + "learning_rate": 1.9175308676759793e-05, + "loss": 2.0734, + "mean_token_accuracy": 0.5372225642204285, + "num_tokens": 1543537047.0, + "step": 3019 + }, + { + "epoch": 0.8166576527852893, + "grad_norm": 1.7952547073364258, + "learning_rate": 1.9174649306895638e-05, + "loss": 2.2076, + "mean_token_accuracy": 0.5114045143127441, + "num_tokens": 1544061282.0, + "step": 3020 + }, + { + "epoch": 0.816928069226609, + "grad_norm": 0.7939530611038208, + "learning_rate": 1.9173989686209193e-05, + "loss": 1.0699, + "mean_token_accuracy": 0.7047719359397888, + "num_tokens": 1544537590.0, + "step": 3021 + }, + { + "epoch": 0.8171984856679286, + "grad_norm": 2.38476300239563, + "learning_rate": 1.9173329814720715e-05, + "loss": 2.1042, + "mean_token_accuracy": 0.5199391841888428, + "num_tokens": 1545061777.0, + "step": 3022 + }, + { + "epoch": 0.8174689021092483, + "grad_norm": 2.223146438598633, + "learning_rate": 1.9172669692450436e-05, + "loss": 2.1145, + "mean_token_accuracy": 0.5300801396369934, + "num_tokens": 1545574866.0, + "step": 3023 + }, + { + "epoch": 0.8177393185505679, + "grad_norm": 1.7061160802841187, + "learning_rate": 1.9172009319418618e-05, + "loss": 2.1224, + "mean_token_accuracy": 0.517949104309082, + "num_tokens": 1546099044.0, + "step": 3024 + }, + { + "epoch": 0.8180097349918876, + "grad_norm": 2.177414894104004, + "learning_rate": 1.9171348695645525e-05, + "loss": 2.1679, + "mean_token_accuracy": 0.5334187150001526, + "num_tokens": 1546567356.0, + "step": 3025 + }, + { + "epoch": 0.8182801514332071, + "grad_norm": 1.9237070083618164, + "learning_rate": 1.917068782115142e-05, + "loss": 2.0949, + "mean_token_accuracy": 0.5431907176971436, + "num_tokens": 1547091345.0, + "step": 3026 + }, + { + "epoch": 0.8185505678745267, + "grad_norm": 1.8864684104919434, + "learning_rate": 1.9170026695956587e-05, + "loss": 2.2357, + "mean_token_accuracy": 0.5202405452728271, + "num_tokens": 1547592037.0, + "step": 3027 + }, + { + "epoch": 0.8188209843158464, + "grad_norm": 2.2111823558807373, + "learning_rate": 1.9169365320081304e-05, + "loss": 2.2212, + "mean_token_accuracy": 0.502843976020813, + "num_tokens": 1548116238.0, + "step": 3028 + }, + { + "epoch": 0.819091400757166, + "grad_norm": 2.0382113456726074, + "learning_rate": 1.916870369354587e-05, + "loss": 2.2506, + "mean_token_accuracy": 0.5166586637496948, + "num_tokens": 1548640519.0, + "step": 3029 + }, + { + "epoch": 0.8193618171984857, + "grad_norm": 1.674344778060913, + "learning_rate": 1.9168041816370587e-05, + "loss": 2.1989, + "mean_token_accuracy": 0.5157294273376465, + "num_tokens": 1549164739.0, + "step": 3030 + }, + { + "epoch": 0.8196322336398053, + "grad_norm": 1.5445736646652222, + "learning_rate": 1.9167379688575758e-05, + "loss": 2.0912, + "mean_token_accuracy": 0.5317858457565308, + "num_tokens": 1549672078.0, + "step": 3031 + }, + { + "epoch": 0.819902650081125, + "grad_norm": 1.99692964553833, + "learning_rate": 1.91667173101817e-05, + "loss": 2.149, + "mean_token_accuracy": 0.5434532165527344, + "num_tokens": 1550196255.0, + "step": 3032 + }, + { + "epoch": 0.8201730665224446, + "grad_norm": 1.7037383317947388, + "learning_rate": 1.9166054681208745e-05, + "loss": 2.17, + "mean_token_accuracy": 0.5196126103401184, + "num_tokens": 1550720411.0, + "step": 3033 + }, + { + "epoch": 0.8204434829637642, + "grad_norm": 1.3199973106384277, + "learning_rate": 1.9165391801677213e-05, + "loss": 2.0802, + "mean_token_accuracy": 0.5340321660041809, + "num_tokens": 1551244522.0, + "step": 3034 + }, + { + "epoch": 0.8207138994050839, + "grad_norm": 1.8096323013305664, + "learning_rate": 1.9164728671607443e-05, + "loss": 2.1138, + "mean_token_accuracy": 0.509402871131897, + "num_tokens": 1551768693.0, + "step": 3035 + }, + { + "epoch": 0.8209843158464034, + "grad_norm": 1.7649822235107422, + "learning_rate": 1.9164065291019795e-05, + "loss": 2.1451, + "mean_token_accuracy": 0.5405940413475037, + "num_tokens": 1552252073.0, + "step": 3036 + }, + { + "epoch": 0.821254732287723, + "grad_norm": 2.1683459281921387, + "learning_rate": 1.916340165993461e-05, + "loss": 2.1819, + "mean_token_accuracy": 0.5312862992286682, + "num_tokens": 1552776176.0, + "step": 3037 + }, + { + "epoch": 0.8215251487290427, + "grad_norm": 1.81108820438385, + "learning_rate": 1.9162737778372253e-05, + "loss": 2.1346, + "mean_token_accuracy": 0.517224907875061, + "num_tokens": 1553300391.0, + "step": 3038 + }, + { + "epoch": 0.8217955651703623, + "grad_norm": 1.5360926389694214, + "learning_rate": 1.9162073646353102e-05, + "loss": 2.1762, + "mean_token_accuracy": 0.5208234190940857, + "num_tokens": 1553824543.0, + "step": 3039 + }, + { + "epoch": 0.822065981611682, + "grad_norm": 1.7472542524337769, + "learning_rate": 1.9161409263897523e-05, + "loss": 2.1014, + "mean_token_accuracy": 0.5252307057380676, + "num_tokens": 1554348750.0, + "step": 3040 + }, + { + "epoch": 0.8223363980530016, + "grad_norm": 0.7322084903717041, + "learning_rate": 1.916074463102591e-05, + "loss": 1.2599, + "mean_token_accuracy": 0.6694782376289368, + "num_tokens": 1554873034.0, + "step": 3041 + }, + { + "epoch": 0.8226068144943213, + "grad_norm": 1.9786758422851562, + "learning_rate": 1.9160079747758652e-05, + "loss": 2.1587, + "mean_token_accuracy": 0.5298800468444824, + "num_tokens": 1555397220.0, + "step": 3042 + }, + { + "epoch": 0.8228772309356409, + "grad_norm": 1.555931806564331, + "learning_rate": 1.9159414614116153e-05, + "loss": 2.089, + "mean_token_accuracy": 0.5472861528396606, + "num_tokens": 1555921332.0, + "step": 3043 + }, + { + "epoch": 0.8231476473769606, + "grad_norm": 1.5438368320465088, + "learning_rate": 1.9158749230118812e-05, + "loss": 2.143, + "mean_token_accuracy": 0.5316650867462158, + "num_tokens": 1556445602.0, + "step": 3044 + }, + { + "epoch": 0.8234180638182802, + "grad_norm": 1.5313223600387573, + "learning_rate": 1.9158083595787053e-05, + "loss": 2.1042, + "mean_token_accuracy": 0.527502179145813, + "num_tokens": 1556963950.0, + "step": 3045 + }, + { + "epoch": 0.8236884802595997, + "grad_norm": 1.9357340335845947, + "learning_rate": 1.91574177111413e-05, + "loss": 2.1693, + "mean_token_accuracy": 0.49347829818725586, + "num_tokens": 1557444200.0, + "step": 3046 + }, + { + "epoch": 0.8239588967009194, + "grad_norm": 1.8588933944702148, + "learning_rate": 1.915675157620198e-05, + "loss": 2.1957, + "mean_token_accuracy": 0.5304391980171204, + "num_tokens": 1557910475.0, + "step": 3047 + }, + { + "epoch": 0.824229313142239, + "grad_norm": 1.3303364515304565, + "learning_rate": 1.9156085190989538e-05, + "loss": 2.108, + "mean_token_accuracy": 0.5268733501434326, + "num_tokens": 1558434728.0, + "step": 3048 + }, + { + "epoch": 0.8244997295835587, + "grad_norm": 1.8471568822860718, + "learning_rate": 1.9155418555524416e-05, + "loss": 2.2719, + "mean_token_accuracy": 0.48877137899398804, + "num_tokens": 1558958908.0, + "step": 3049 + }, + { + "epoch": 0.8247701460248783, + "grad_norm": 1.6800175905227661, + "learning_rate": 1.9154751669827066e-05, + "loss": 2.1392, + "mean_token_accuracy": 0.5367015600204468, + "num_tokens": 1559483128.0, + "step": 3050 + }, + { + "epoch": 0.825040562466198, + "grad_norm": 1.3505754470825195, + "learning_rate": 1.9154084533917955e-05, + "loss": 2.1885, + "mean_token_accuracy": 0.5295156240463257, + "num_tokens": 1559950800.0, + "step": 3051 + }, + { + "epoch": 0.8253109789075176, + "grad_norm": 1.7517653703689575, + "learning_rate": 1.915341714781755e-05, + "loss": 2.2456, + "mean_token_accuracy": 0.5134899020195007, + "num_tokens": 1560420668.0, + "step": 3052 + }, + { + "epoch": 0.8255813953488372, + "grad_norm": 3.653545379638672, + "learning_rate": 1.9152749511546332e-05, + "loss": 1.9815, + "mean_token_accuracy": 0.5903929471969604, + "num_tokens": 1560855941.0, + "step": 3053 + }, + { + "epoch": 0.8258518117901569, + "grad_norm": 1.6680530309677124, + "learning_rate": 1.915208162512478e-05, + "loss": 2.1396, + "mean_token_accuracy": 0.529380202293396, + "num_tokens": 1561342373.0, + "step": 3054 + }, + { + "epoch": 0.8261222282314765, + "grad_norm": 1.7348239421844482, + "learning_rate": 1.915141348857339e-05, + "loss": 2.287, + "mean_token_accuracy": 0.5129331350326538, + "num_tokens": 1561866647.0, + "step": 3055 + }, + { + "epoch": 0.826392644672796, + "grad_norm": 1.6473596096038818, + "learning_rate": 1.9150745101912666e-05, + "loss": 2.2887, + "mean_token_accuracy": 0.503459632396698, + "num_tokens": 1562390927.0, + "step": 3056 + }, + { + "epoch": 0.8266630611141157, + "grad_norm": 1.5035945177078247, + "learning_rate": 1.915007646516311e-05, + "loss": 2.2275, + "mean_token_accuracy": 0.5113818049430847, + "num_tokens": 1562915110.0, + "step": 3057 + }, + { + "epoch": 0.8269334775554353, + "grad_norm": 1.4797347784042358, + "learning_rate": 1.914940757834524e-05, + "loss": 2.2254, + "mean_token_accuracy": 0.5047715902328491, + "num_tokens": 1563435537.0, + "step": 3058 + }, + { + "epoch": 0.827203893996755, + "grad_norm": 1.3446723222732544, + "learning_rate": 1.9148738441479582e-05, + "loss": 2.1155, + "mean_token_accuracy": 0.5420520305633545, + "num_tokens": 1563959790.0, + "step": 3059 + }, + { + "epoch": 0.8274743104380746, + "grad_norm": 1.6154940128326416, + "learning_rate": 1.914806905458666e-05, + "loss": 2.2118, + "mean_token_accuracy": 0.4872049391269684, + "num_tokens": 1564484060.0, + "step": 3060 + }, + { + "epoch": 0.8277447268793943, + "grad_norm": 0.7314753532409668, + "learning_rate": 1.9147399417687023e-05, + "loss": 1.1366, + "mean_token_accuracy": 0.7067223787307739, + "num_tokens": 1565008305.0, + "step": 3061 + }, + { + "epoch": 0.8280151433207139, + "grad_norm": 2.5417659282684326, + "learning_rate": 1.914672953080121e-05, + "loss": 2.2256, + "mean_token_accuracy": 0.5190209150314331, + "num_tokens": 1565524004.0, + "step": 3062 + }, + { + "epoch": 0.8282855597620336, + "grad_norm": 1.931990385055542, + "learning_rate": 1.9146059393949776e-05, + "loss": 2.1835, + "mean_token_accuracy": 0.5131645202636719, + "num_tokens": 1566048096.0, + "step": 3063 + }, + { + "epoch": 0.8285559762033532, + "grad_norm": 1.5715996026992798, + "learning_rate": 1.9145389007153286e-05, + "loss": 2.0862, + "mean_token_accuracy": 0.5247892737388611, + "num_tokens": 1566572371.0, + "step": 3064 + }, + { + "epoch": 0.8288263926446728, + "grad_norm": 2.0357561111450195, + "learning_rate": 1.9144718370432305e-05, + "loss": 2.1109, + "mean_token_accuracy": 0.5367531180381775, + "num_tokens": 1567057679.0, + "step": 3065 + }, + { + "epoch": 0.8290968090859925, + "grad_norm": 1.416254997253418, + "learning_rate": 1.9144047483807412e-05, + "loss": 2.1856, + "mean_token_accuracy": 0.517872154712677, + "num_tokens": 1567581819.0, + "step": 3066 + }, + { + "epoch": 0.829367225527312, + "grad_norm": 1.4485563039779663, + "learning_rate": 1.9143376347299193e-05, + "loss": 2.1521, + "mean_token_accuracy": 0.5277303457260132, + "num_tokens": 1568096218.0, + "step": 3067 + }, + { + "epoch": 0.8296376419686317, + "grad_norm": 1.90717351436615, + "learning_rate": 1.9142704960928236e-05, + "loss": 2.0629, + "mean_token_accuracy": 0.5434178113937378, + "num_tokens": 1568620455.0, + "step": 3068 + }, + { + "epoch": 0.8299080584099513, + "grad_norm": 1.5551124811172485, + "learning_rate": 1.9142033324715153e-05, + "loss": 2.2841, + "mean_token_accuracy": 0.5191774964332581, + "num_tokens": 1569144586.0, + "step": 3069 + }, + { + "epoch": 0.8301784748512709, + "grad_norm": 1.9906070232391357, + "learning_rate": 1.9141361438680538e-05, + "loss": 2.2141, + "mean_token_accuracy": 0.5254793167114258, + "num_tokens": 1569655484.0, + "step": 3070 + }, + { + "epoch": 0.8304488912925906, + "grad_norm": 1.7853071689605713, + "learning_rate": 1.9140689302845014e-05, + "loss": 2.2017, + "mean_token_accuracy": 0.5343232154846191, + "num_tokens": 1570127846.0, + "step": 3071 + }, + { + "epoch": 0.8307193077339102, + "grad_norm": 2.3042337894439697, + "learning_rate": 1.9140016917229197e-05, + "loss": 2.1734, + "mean_token_accuracy": 0.5298929214477539, + "num_tokens": 1570652124.0, + "step": 3072 + }, + { + "epoch": 0.8309897241752299, + "grad_norm": 1.73373544216156, + "learning_rate": 1.9139344281853725e-05, + "loss": 2.0836, + "mean_token_accuracy": 0.5389831066131592, + "num_tokens": 1571156053.0, + "step": 3073 + }, + { + "epoch": 0.8312601406165495, + "grad_norm": 1.678112506866455, + "learning_rate": 1.913867139673924e-05, + "loss": 2.2829, + "mean_token_accuracy": 0.5146366357803345, + "num_tokens": 1571680323.0, + "step": 3074 + }, + { + "epoch": 0.8315305570578692, + "grad_norm": 2.0439181327819824, + "learning_rate": 1.9137998261906375e-05, + "loss": 2.0706, + "mean_token_accuracy": 0.5329731702804565, + "num_tokens": 1572204574.0, + "step": 3075 + }, + { + "epoch": 0.8318009734991888, + "grad_norm": 1.7083262205123901, + "learning_rate": 1.9137324877375795e-05, + "loss": 2.2181, + "mean_token_accuracy": 0.5005086660385132, + "num_tokens": 1572728853.0, + "step": 3076 + }, + { + "epoch": 0.8320713899405083, + "grad_norm": 1.597989559173584, + "learning_rate": 1.9136651243168155e-05, + "loss": 2.1826, + "mean_token_accuracy": 0.5165729522705078, + "num_tokens": 1573216999.0, + "step": 3077 + }, + { + "epoch": 0.832341806381828, + "grad_norm": 1.4499703645706177, + "learning_rate": 1.9135977359304127e-05, + "loss": 2.1361, + "mean_token_accuracy": 0.5432965755462646, + "num_tokens": 1573694449.0, + "step": 3078 + }, + { + "epoch": 0.8326122228231476, + "grad_norm": 1.3898063898086548, + "learning_rate": 1.9135303225804385e-05, + "loss": 2.0665, + "mean_token_accuracy": 0.5333842635154724, + "num_tokens": 1574218676.0, + "step": 3079 + }, + { + "epoch": 0.8328826392644673, + "grad_norm": 1.6776597499847412, + "learning_rate": 1.913462884268962e-05, + "loss": 2.2154, + "mean_token_accuracy": 0.5166066288948059, + "num_tokens": 1574661091.0, + "step": 3080 + }, + { + "epoch": 0.8331530557057869, + "grad_norm": 0.9018259644508362, + "learning_rate": 1.913395420998052e-05, + "loss": 1.2163, + "mean_token_accuracy": 0.6769991517066956, + "num_tokens": 1575185209.0, + "step": 3081 + }, + { + "epoch": 0.8334234721471065, + "grad_norm": 1.687093734741211, + "learning_rate": 1.9133279327697784e-05, + "loss": 2.1901, + "mean_token_accuracy": 0.5213579535484314, + "num_tokens": 1575709479.0, + "step": 3082 + }, + { + "epoch": 0.8336938885884262, + "grad_norm": 1.5733870267868042, + "learning_rate": 1.9132604195862117e-05, + "loss": 2.0691, + "mean_token_accuracy": 0.5083342790603638, + "num_tokens": 1576233695.0, + "step": 3083 + }, + { + "epoch": 0.8339643050297458, + "grad_norm": 1.300407886505127, + "learning_rate": 1.913192881449424e-05, + "loss": 1.9548, + "mean_token_accuracy": 0.5614482164382935, + "num_tokens": 1576757880.0, + "step": 3084 + }, + { + "epoch": 0.8342347214710655, + "grad_norm": 1.453201174736023, + "learning_rate": 1.9131253183614868e-05, + "loss": 2.1654, + "mean_token_accuracy": 0.5256737470626831, + "num_tokens": 1577282072.0, + "step": 3085 + }, + { + "epoch": 0.8345051379123851, + "grad_norm": 1.3737695217132568, + "learning_rate": 1.913057730324474e-05, + "loss": 2.0268, + "mean_token_accuracy": 0.5360413789749146, + "num_tokens": 1577806256.0, + "step": 3086 + }, + { + "epoch": 0.8347755543537047, + "grad_norm": 1.9339377880096436, + "learning_rate": 1.912990117340459e-05, + "loss": 2.2407, + "mean_token_accuracy": 0.5245174765586853, + "num_tokens": 1578326218.0, + "step": 3087 + }, + { + "epoch": 0.8350459707950243, + "grad_norm": 1.4734255075454712, + "learning_rate": 1.9129224794115162e-05, + "loss": 2.2125, + "mean_token_accuracy": 0.5214383602142334, + "num_tokens": 1578850485.0, + "step": 3088 + }, + { + "epoch": 0.8353163872363439, + "grad_norm": 1.6214666366577148, + "learning_rate": 1.912854816539721e-05, + "loss": 2.1229, + "mean_token_accuracy": 0.5427889823913574, + "num_tokens": 1579374646.0, + "step": 3089 + }, + { + "epoch": 0.8355868036776636, + "grad_norm": 1.9720451831817627, + "learning_rate": 1.91278712872715e-05, + "loss": 2.3339, + "mean_token_accuracy": 0.5155732035636902, + "num_tokens": 1579898852.0, + "step": 3090 + }, + { + "epoch": 0.8358572201189832, + "grad_norm": 1.5027227401733398, + "learning_rate": 1.9127194159758792e-05, + "loss": 2.2103, + "mean_token_accuracy": 0.5376877784729004, + "num_tokens": 1580381904.0, + "step": 3091 + }, + { + "epoch": 0.8361276365603029, + "grad_norm": 1.5272996425628662, + "learning_rate": 1.9126516782879865e-05, + "loss": 2.1401, + "mean_token_accuracy": 0.5601872205734253, + "num_tokens": 1580858910.0, + "step": 3092 + }, + { + "epoch": 0.8363980530016225, + "grad_norm": 1.4270168542861938, + "learning_rate": 1.9125839156655507e-05, + "loss": 2.0674, + "mean_token_accuracy": 0.5510798096656799, + "num_tokens": 1581383031.0, + "step": 3093 + }, + { + "epoch": 0.8366684694429422, + "grad_norm": 1.5995821952819824, + "learning_rate": 1.912516128110651e-05, + "loss": 2.1787, + "mean_token_accuracy": 0.5276225805282593, + "num_tokens": 1581907305.0, + "step": 3094 + }, + { + "epoch": 0.8369388858842618, + "grad_norm": 1.526404619216919, + "learning_rate": 1.912448315625367e-05, + "loss": 2.1408, + "mean_token_accuracy": 0.5231369137763977, + "num_tokens": 1582431535.0, + "step": 3095 + }, + { + "epoch": 0.8372093023255814, + "grad_norm": 5.372806549072266, + "learning_rate": 1.912380478211779e-05, + "loss": 2.0742, + "mean_token_accuracy": 0.5436117053031921, + "num_tokens": 1582955743.0, + "step": 3096 + }, + { + "epoch": 0.8374797187669011, + "grad_norm": 2.019021987915039, + "learning_rate": 1.912312615871969e-05, + "loss": 2.1979, + "mean_token_accuracy": 0.523020327091217, + "num_tokens": 1583479955.0, + "step": 3097 + }, + { + "epoch": 0.8377501352082206, + "grad_norm": 1.5942257642745972, + "learning_rate": 1.912244728608019e-05, + "loss": 2.1478, + "mean_token_accuracy": 0.536700427532196, + "num_tokens": 1583946009.0, + "step": 3098 + }, + { + "epoch": 0.8380205516495403, + "grad_norm": 1.4018869400024414, + "learning_rate": 1.912176816422012e-05, + "loss": 2.0075, + "mean_token_accuracy": 0.5406582951545715, + "num_tokens": 1584470171.0, + "step": 3099 + }, + { + "epoch": 0.8382909680908599, + "grad_norm": 1.3654356002807617, + "learning_rate": 1.9121088793160326e-05, + "loss": 2.0652, + "mean_token_accuracy": 0.5278287529945374, + "num_tokens": 1584978992.0, + "step": 3100 + }, + { + "epoch": 0.8385613845321795, + "grad_norm": 0.7224537134170532, + "learning_rate": 1.912040917292164e-05, + "loss": 1.1684, + "mean_token_accuracy": 0.6882117986679077, + "num_tokens": 1585503080.0, + "step": 3101 + }, + { + "epoch": 0.8388318009734992, + "grad_norm": 2.1115005016326904, + "learning_rate": 1.9119729303524918e-05, + "loss": 2.1337, + "mean_token_accuracy": 0.5285142660140991, + "num_tokens": 1585929687.0, + "step": 3102 + }, + { + "epoch": 0.8391022174148188, + "grad_norm": 1.8594391345977783, + "learning_rate": 1.9119049184991025e-05, + "loss": 2.2112, + "mean_token_accuracy": 0.5185881853103638, + "num_tokens": 1586453821.0, + "step": 3103 + }, + { + "epoch": 0.8393726338561385, + "grad_norm": 1.4696553945541382, + "learning_rate": 1.9118368817340826e-05, + "loss": 2.1098, + "mean_token_accuracy": 0.5210840702056885, + "num_tokens": 1586978086.0, + "step": 3104 + }, + { + "epoch": 0.8396430502974581, + "grad_norm": 1.3332104682922363, + "learning_rate": 1.9117688200595195e-05, + "loss": 2.0411, + "mean_token_accuracy": 0.5278831720352173, + "num_tokens": 1587502257.0, + "step": 3105 + }, + { + "epoch": 0.8399134667387778, + "grad_norm": 1.801061987876892, + "learning_rate": 1.911700733477502e-05, + "loss": 2.0806, + "mean_token_accuracy": 0.5262324810028076, + "num_tokens": 1588026514.0, + "step": 3106 + }, + { + "epoch": 0.8401838831800974, + "grad_norm": 1.649311900138855, + "learning_rate": 1.9116326219901188e-05, + "loss": 2.0935, + "mean_token_accuracy": 0.5292889475822449, + "num_tokens": 1588550703.0, + "step": 3107 + }, + { + "epoch": 0.8404542996214169, + "grad_norm": 1.4306615591049194, + "learning_rate": 1.91156448559946e-05, + "loss": 2.116, + "mean_token_accuracy": 0.5354213714599609, + "num_tokens": 1589074878.0, + "step": 3108 + }, + { + "epoch": 0.8407247160627366, + "grad_norm": 1.456963062286377, + "learning_rate": 1.9114963243076163e-05, + "loss": 2.1574, + "mean_token_accuracy": 0.5237948894500732, + "num_tokens": 1589559314.0, + "step": 3109 + }, + { + "epoch": 0.8409951325040562, + "grad_norm": 1.4406505823135376, + "learning_rate": 1.9114281381166787e-05, + "loss": 1.9856, + "mean_token_accuracy": 0.5464087128639221, + "num_tokens": 1590083534.0, + "step": 3110 + }, + { + "epoch": 0.8412655489453759, + "grad_norm": 1.6153695583343506, + "learning_rate": 1.9113599270287394e-05, + "loss": 2.2146, + "mean_token_accuracy": 0.5070135593414307, + "num_tokens": 1590607705.0, + "step": 3111 + }, + { + "epoch": 0.8415359653866955, + "grad_norm": 1.4504969120025635, + "learning_rate": 1.911291691045892e-05, + "loss": 2.1874, + "mean_token_accuracy": 0.5350950956344604, + "num_tokens": 1591087369.0, + "step": 3112 + }, + { + "epoch": 0.8418063818280151, + "grad_norm": 1.4607131481170654, + "learning_rate": 1.911223430170229e-05, + "loss": 2.1394, + "mean_token_accuracy": 0.5137184858322144, + "num_tokens": 1591611613.0, + "step": 3113 + }, + { + "epoch": 0.8420767982693348, + "grad_norm": 1.2625350952148438, + "learning_rate": 1.911155144403846e-05, + "loss": 2.0794, + "mean_token_accuracy": 0.537237286567688, + "num_tokens": 1592114730.0, + "step": 3114 + }, + { + "epoch": 0.8423472147106544, + "grad_norm": 1.167140245437622, + "learning_rate": 1.9110868337488375e-05, + "loss": 2.1147, + "mean_token_accuracy": 0.5147099494934082, + "num_tokens": 1592638962.0, + "step": 3115 + }, + { + "epoch": 0.8426176311519741, + "grad_norm": 1.5134888887405396, + "learning_rate": 1.9110184982072996e-05, + "loss": 2.1473, + "mean_token_accuracy": 0.5273872017860413, + "num_tokens": 1593163133.0, + "step": 3116 + }, + { + "epoch": 0.8428880475932937, + "grad_norm": 1.5333616733551025, + "learning_rate": 1.9109501377813296e-05, + "loss": 2.1388, + "mean_token_accuracy": 0.5331262350082397, + "num_tokens": 1593658382.0, + "step": 3117 + }, + { + "epoch": 0.8431584640346133, + "grad_norm": 1.4197767972946167, + "learning_rate": 1.910881752473024e-05, + "loss": 2.1995, + "mean_token_accuracy": 0.5306646227836609, + "num_tokens": 1594182497.0, + "step": 3118 + }, + { + "epoch": 0.8434288804759329, + "grad_norm": 1.5220040082931519, + "learning_rate": 1.910813342284482e-05, + "loss": 2.2356, + "mean_token_accuracy": 0.5279404520988464, + "num_tokens": 1594706779.0, + "step": 3119 + }, + { + "epoch": 0.8436992969172525, + "grad_norm": 1.3706190586090088, + "learning_rate": 1.910744907217802e-05, + "loss": 2.1892, + "mean_token_accuracy": 0.513505756855011, + "num_tokens": 1595230998.0, + "step": 3120 + }, + { + "epoch": 0.8439697133585722, + "grad_norm": 0.7377224564552307, + "learning_rate": 1.910676447275084e-05, + "loss": 1.0748, + "mean_token_accuracy": 0.7117986083030701, + "num_tokens": 1595755130.0, + "step": 3121 + }, + { + "epoch": 0.8442401297998918, + "grad_norm": 2.080348491668701, + "learning_rate": 1.9106079624584283e-05, + "loss": 2.1047, + "mean_token_accuracy": 0.5246002078056335, + "num_tokens": 1596279353.0, + "step": 3122 + }, + { + "epoch": 0.8445105462412115, + "grad_norm": 1.8954294919967651, + "learning_rate": 1.9105394527699367e-05, + "loss": 2.0869, + "mean_token_accuracy": 0.5353215932846069, + "num_tokens": 1596803331.0, + "step": 3123 + }, + { + "epoch": 0.8447809626825311, + "grad_norm": 1.5679514408111572, + "learning_rate": 1.910470918211711e-05, + "loss": 2.2511, + "mean_token_accuracy": 0.4920603632926941, + "num_tokens": 1597327573.0, + "step": 3124 + }, + { + "epoch": 0.8450513791238508, + "grad_norm": 1.677425742149353, + "learning_rate": 1.9104023587858543e-05, + "loss": 2.0805, + "mean_token_accuracy": 0.5239757299423218, + "num_tokens": 1597851849.0, + "step": 3125 + }, + { + "epoch": 0.8453217955651704, + "grad_norm": 1.5755207538604736, + "learning_rate": 1.9103337744944696e-05, + "loss": 2.0833, + "mean_token_accuracy": 0.519959568977356, + "num_tokens": 1598376119.0, + "step": 3126 + }, + { + "epoch": 0.84559221200649, + "grad_norm": 1.4432557821273804, + "learning_rate": 1.9102651653396623e-05, + "loss": 2.0655, + "mean_token_accuracy": 0.5285055041313171, + "num_tokens": 1598900240.0, + "step": 3127 + }, + { + "epoch": 0.8458626284478096, + "grad_norm": 1.4669448137283325, + "learning_rate": 1.910196531323536e-05, + "loss": 2.1711, + "mean_token_accuracy": 0.5230278968811035, + "num_tokens": 1599424492.0, + "step": 3128 + }, + { + "epoch": 0.8461330448891292, + "grad_norm": 1.442421793937683, + "learning_rate": 1.910127872448198e-05, + "loss": 2.2218, + "mean_token_accuracy": 0.5240143537521362, + "num_tokens": 1599948593.0, + "step": 3129 + }, + { + "epoch": 0.8464034613304489, + "grad_norm": 1.76181960105896, + "learning_rate": 1.9100591887157545e-05, + "loss": 1.9888, + "mean_token_accuracy": 0.5569729208946228, + "num_tokens": 1600472845.0, + "step": 3130 + }, + { + "epoch": 0.8466738777717685, + "grad_norm": 1.454270362854004, + "learning_rate": 1.909990480128313e-05, + "loss": 2.17, + "mean_token_accuracy": 0.5230475068092346, + "num_tokens": 1600997108.0, + "step": 3131 + }, + { + "epoch": 0.8469442942130881, + "grad_norm": 1.3958051204681396, + "learning_rate": 1.9099217466879812e-05, + "loss": 2.0184, + "mean_token_accuracy": 0.5282706022262573, + "num_tokens": 1601521346.0, + "step": 3132 + }, + { + "epoch": 0.8472147106544078, + "grad_norm": 2.6404640674591064, + "learning_rate": 1.9098529883968686e-05, + "loss": 1.9346, + "mean_token_accuracy": 0.5830709934234619, + "num_tokens": 1601985140.0, + "step": 3133 + }, + { + "epoch": 0.8474851270957274, + "grad_norm": 2.031980276107788, + "learning_rate": 1.9097842052570846e-05, + "loss": 2.2398, + "mean_token_accuracy": 0.5317518711090088, + "num_tokens": 1602495773.0, + "step": 3134 + }, + { + "epoch": 0.8477555435370471, + "grad_norm": 1.6027755737304688, + "learning_rate": 1.9097153972707397e-05, + "loss": 2.143, + "mean_token_accuracy": 0.5300835967063904, + "num_tokens": 1603008651.0, + "step": 3135 + }, + { + "epoch": 0.8480259599783667, + "grad_norm": 1.944615364074707, + "learning_rate": 1.9096465644399454e-05, + "loss": 2.2, + "mean_token_accuracy": 0.5320522785186768, + "num_tokens": 1603501274.0, + "step": 3136 + }, + { + "epoch": 0.8482963764196864, + "grad_norm": 2.008500576019287, + "learning_rate": 1.9095777067668132e-05, + "loss": 2.2734, + "mean_token_accuracy": 0.5027891993522644, + "num_tokens": 1604025501.0, + "step": 3137 + }, + { + "epoch": 0.848566792861006, + "grad_norm": 1.8000619411468506, + "learning_rate": 1.9095088242534562e-05, + "loss": 2.2717, + "mean_token_accuracy": 0.5045648217201233, + "num_tokens": 1604549703.0, + "step": 3138 + }, + { + "epoch": 0.8488372093023255, + "grad_norm": 1.8788894414901733, + "learning_rate": 1.9094399169019884e-05, + "loss": 2.1129, + "mean_token_accuracy": 0.5321779847145081, + "num_tokens": 1605073851.0, + "step": 3139 + }, + { + "epoch": 0.8491076257436452, + "grad_norm": 1.5117542743682861, + "learning_rate": 1.909370984714523e-05, + "loss": 2.161, + "mean_token_accuracy": 0.5348155498504639, + "num_tokens": 1605567844.0, + "step": 3140 + }, + { + "epoch": 0.8493780421849648, + "grad_norm": 0.9084751605987549, + "learning_rate": 1.9093020276931757e-05, + "loss": 1.162, + "mean_token_accuracy": 0.6976551413536072, + "num_tokens": 1606031663.0, + "step": 3141 + }, + { + "epoch": 0.8496484586262845, + "grad_norm": 3.1089117527008057, + "learning_rate": 1.909233045840062e-05, + "loss": 2.0755, + "mean_token_accuracy": 0.5321809649467468, + "num_tokens": 1606530531.0, + "step": 3142 + }, + { + "epoch": 0.8499188750676041, + "grad_norm": 2.708055257797241, + "learning_rate": 1.909164039157299e-05, + "loss": 2.151, + "mean_token_accuracy": 0.5524985194206238, + "num_tokens": 1606916016.0, + "step": 3143 + }, + { + "epoch": 0.8501892915089238, + "grad_norm": 2.1812984943389893, + "learning_rate": 1.9090950076470037e-05, + "loss": 1.8316, + "mean_token_accuracy": 0.5907837152481079, + "num_tokens": 1607440159.0, + "step": 3144 + }, + { + "epoch": 0.8504597079502434, + "grad_norm": 2.386626720428467, + "learning_rate": 1.909025951311294e-05, + "loss": 2.2335, + "mean_token_accuracy": 0.4949881136417389, + "num_tokens": 1607964369.0, + "step": 3145 + }, + { + "epoch": 0.850730124391563, + "grad_norm": 2.667607069015503, + "learning_rate": 1.9089568701522885e-05, + "loss": 2.1087, + "mean_token_accuracy": 0.5271506309509277, + "num_tokens": 1608488638.0, + "step": 3146 + }, + { + "epoch": 0.8510005408328827, + "grad_norm": 1.8503683805465698, + "learning_rate": 1.9088877641721078e-05, + "loss": 2.1315, + "mean_token_accuracy": 0.5364640951156616, + "num_tokens": 1609012849.0, + "step": 3147 + }, + { + "epoch": 0.8512709572742023, + "grad_norm": 2.1376872062683105, + "learning_rate": 1.9088186333728718e-05, + "loss": 2.0598, + "mean_token_accuracy": 0.5427930355072021, + "num_tokens": 1609505177.0, + "step": 3148 + }, + { + "epoch": 0.8515413737155219, + "grad_norm": 2.4247992038726807, + "learning_rate": 1.908749477756701e-05, + "loss": 2.12, + "mean_token_accuracy": 0.5319451093673706, + "num_tokens": 1610029397.0, + "step": 3149 + }, + { + "epoch": 0.8518117901568415, + "grad_norm": 1.6370618343353271, + "learning_rate": 1.9086802973257184e-05, + "loss": 2.1677, + "mean_token_accuracy": 0.5149735808372498, + "num_tokens": 1610553598.0, + "step": 3150 + }, + { + "epoch": 0.8520822065981611, + "grad_norm": 1.5318591594696045, + "learning_rate": 1.9086110920820458e-05, + "loss": 2.2351, + "mean_token_accuracy": 0.5201668739318848, + "num_tokens": 1611077864.0, + "step": 3151 + }, + { + "epoch": 0.8523526230394808, + "grad_norm": 1.8640631437301636, + "learning_rate": 1.9085418620278068e-05, + "loss": 2.1169, + "mean_token_accuracy": 0.5309929847717285, + "num_tokens": 1611602005.0, + "step": 3152 + }, + { + "epoch": 0.8526230394808004, + "grad_norm": 1.5467609167099, + "learning_rate": 1.9084726071651257e-05, + "loss": 2.1957, + "mean_token_accuracy": 0.5352299213409424, + "num_tokens": 1612055205.0, + "step": 3153 + }, + { + "epoch": 0.8528934559221201, + "grad_norm": 1.935638666152954, + "learning_rate": 1.9084033274961276e-05, + "loss": 2.307, + "mean_token_accuracy": 0.4902316629886627, + "num_tokens": 1612579421.0, + "step": 3154 + }, + { + "epoch": 0.8531638723634397, + "grad_norm": 2.300675868988037, + "learning_rate": 1.9083340230229383e-05, + "loss": 2.2169, + "mean_token_accuracy": 0.5326868295669556, + "num_tokens": 1613103699.0, + "step": 3155 + }, + { + "epoch": 0.8534342888047594, + "grad_norm": 1.7671074867248535, + "learning_rate": 1.908264693747684e-05, + "loss": 2.2764, + "mean_token_accuracy": 0.5013143420219421, + "num_tokens": 1613627755.0, + "step": 3156 + }, + { + "epoch": 0.853704705246079, + "grad_norm": 1.9898138046264648, + "learning_rate": 1.9081953396724916e-05, + "loss": 2.1076, + "mean_token_accuracy": 0.5239374041557312, + "num_tokens": 1614151962.0, + "step": 3157 + }, + { + "epoch": 0.8539751216873986, + "grad_norm": 42.25894546508789, + "learning_rate": 1.9081259607994897e-05, + "loss": 2.5913, + "mean_token_accuracy": 0.5063967108726501, + "num_tokens": 1614676196.0, + "step": 3158 + }, + { + "epoch": 0.8542455381287182, + "grad_norm": 2.140423536300659, + "learning_rate": 1.9080565571308066e-05, + "loss": 2.1133, + "mean_token_accuracy": 0.5292004942893982, + "num_tokens": 1615200374.0, + "step": 3159 + }, + { + "epoch": 0.8545159545700378, + "grad_norm": 1.6917823553085327, + "learning_rate": 1.907987128668572e-05, + "loss": 1.9323, + "mean_token_accuracy": 0.5407809615135193, + "num_tokens": 1615724599.0, + "step": 3160 + }, + { + "epoch": 0.8547863710113575, + "grad_norm": 1.0086396932601929, + "learning_rate": 1.9079176754149164e-05, + "loss": 1.1285, + "mean_token_accuracy": 0.6913131475448608, + "num_tokens": 1616248798.0, + "step": 3161 + }, + { + "epoch": 0.8550567874526771, + "grad_norm": 1.9245657920837402, + "learning_rate": 1.9078481973719703e-05, + "loss": 2.1868, + "mean_token_accuracy": 0.5134856700897217, + "num_tokens": 1616773062.0, + "step": 3162 + }, + { + "epoch": 0.8553272038939967, + "grad_norm": 1.7609713077545166, + "learning_rate": 1.907778694541866e-05, + "loss": 2.0562, + "mean_token_accuracy": 0.5277715921401978, + "num_tokens": 1617297300.0, + "step": 3163 + }, + { + "epoch": 0.8555976203353164, + "grad_norm": 1.3341152667999268, + "learning_rate": 1.9077091669267355e-05, + "loss": 2.0076, + "mean_token_accuracy": 0.5309654474258423, + "num_tokens": 1617821463.0, + "step": 3164 + }, + { + "epoch": 0.855868036776636, + "grad_norm": 1.4896719455718994, + "learning_rate": 1.9076396145287125e-05, + "loss": 2.056, + "mean_token_accuracy": 0.5326027870178223, + "num_tokens": 1618345493.0, + "step": 3165 + }, + { + "epoch": 0.8561384532179557, + "grad_norm": 1.740841031074524, + "learning_rate": 1.907570037349931e-05, + "loss": 2.2341, + "mean_token_accuracy": 0.523500382900238, + "num_tokens": 1618869729.0, + "step": 3166 + }, + { + "epoch": 0.8564088696592753, + "grad_norm": 1.660088300704956, + "learning_rate": 1.9075004353925264e-05, + "loss": 2.0879, + "mean_token_accuracy": 0.5433833599090576, + "num_tokens": 1619393698.0, + "step": 3167 + }, + { + "epoch": 0.856679286100595, + "grad_norm": 1.6366828680038452, + "learning_rate": 1.9074308086586334e-05, + "loss": 2.0913, + "mean_token_accuracy": 0.5314833521842957, + "num_tokens": 1619917801.0, + "step": 3168 + }, + { + "epoch": 0.8569497025419145, + "grad_norm": 2.1016452312469482, + "learning_rate": 1.9073611571503883e-05, + "loss": 2.1366, + "mean_token_accuracy": 0.5393602848052979, + "num_tokens": 1620442085.0, + "step": 3169 + }, + { + "epoch": 0.8572201189832341, + "grad_norm": 1.6769403219223022, + "learning_rate": 1.907291480869929e-05, + "loss": 2.213, + "mean_token_accuracy": 0.5289733409881592, + "num_tokens": 1620966224.0, + "step": 3170 + }, + { + "epoch": 0.8574905354245538, + "grad_norm": 1.6190142631530762, + "learning_rate": 1.907221779819393e-05, + "loss": 2.2028, + "mean_token_accuracy": 0.5290113687515259, + "num_tokens": 1621490309.0, + "step": 3171 + }, + { + "epoch": 0.8577609518658734, + "grad_norm": 1.4144723415374756, + "learning_rate": 1.9071520540009186e-05, + "loss": 1.9547, + "mean_token_accuracy": 0.559267520904541, + "num_tokens": 1621977026.0, + "step": 3172 + }, + { + "epoch": 0.8580313683071931, + "grad_norm": 1.964215874671936, + "learning_rate": 1.907082303416646e-05, + "loss": 2.118, + "mean_token_accuracy": 0.5198613405227661, + "num_tokens": 1622501226.0, + "step": 3173 + }, + { + "epoch": 0.8583017847485127, + "grad_norm": 1.983967900276184, + "learning_rate": 1.9070125280687143e-05, + "loss": 2.2816, + "mean_token_accuracy": 0.5292027592658997, + "num_tokens": 1622979223.0, + "step": 3174 + }, + { + "epoch": 0.8585722011898324, + "grad_norm": 1.5963835716247559, + "learning_rate": 1.9069427279592655e-05, + "loss": 2.2063, + "mean_token_accuracy": 0.5182717442512512, + "num_tokens": 1623503468.0, + "step": 3175 + }, + { + "epoch": 0.858842617631152, + "grad_norm": 1.5803347826004028, + "learning_rate": 1.9068729030904406e-05, + "loss": 2.1487, + "mean_token_accuracy": 0.5260375738143921, + "num_tokens": 1624027712.0, + "step": 3176 + }, + { + "epoch": 0.8591130340724716, + "grad_norm": 1.5247318744659424, + "learning_rate": 1.906803053464382e-05, + "loss": 2.2744, + "mean_token_accuracy": 0.5190484523773193, + "num_tokens": 1624546227.0, + "step": 3177 + }, + { + "epoch": 0.8593834505137913, + "grad_norm": 1.4968844652175903, + "learning_rate": 1.906733179083233e-05, + "loss": 2.1378, + "mean_token_accuracy": 0.540393590927124, + "num_tokens": 1625057030.0, + "step": 3178 + }, + { + "epoch": 0.8596538669551109, + "grad_norm": 1.4811025857925415, + "learning_rate": 1.9066632799491375e-05, + "loss": 2.1903, + "mean_token_accuracy": 0.5286545753479004, + "num_tokens": 1625581156.0, + "step": 3179 + }, + { + "epoch": 0.8599242833964305, + "grad_norm": 1.5332282781600952, + "learning_rate": 1.906593356064241e-05, + "loss": 2.1563, + "mean_token_accuracy": 0.5217998623847961, + "num_tokens": 1626105420.0, + "step": 3180 + }, + { + "epoch": 0.8601946998377501, + "grad_norm": 0.6982324123382568, + "learning_rate": 1.9065234074306878e-05, + "loss": 1.0652, + "mean_token_accuracy": 0.7161045074462891, + "num_tokens": 1626627803.0, + "step": 3181 + }, + { + "epoch": 0.8604651162790697, + "grad_norm": 1.8953348398208618, + "learning_rate": 1.9064534340506245e-05, + "loss": 2.0747, + "mean_token_accuracy": 0.5322533249855042, + "num_tokens": 1627151995.0, + "step": 3182 + }, + { + "epoch": 0.8607355327203894, + "grad_norm": 1.5152615308761597, + "learning_rate": 1.906383435926199e-05, + "loss": 2.0897, + "mean_token_accuracy": 0.5222108960151672, + "num_tokens": 1627676201.0, + "step": 3183 + }, + { + "epoch": 0.861005949161709, + "grad_norm": 1.4434514045715332, + "learning_rate": 1.9063134130595574e-05, + "loss": 1.9461, + "mean_token_accuracy": 0.5609846115112305, + "num_tokens": 1628200416.0, + "step": 3184 + }, + { + "epoch": 0.8612763656030287, + "grad_norm": 1.6623238325119019, + "learning_rate": 1.906243365452849e-05, + "loss": 2.0747, + "mean_token_accuracy": 0.5325440168380737, + "num_tokens": 1628679868.0, + "step": 3185 + }, + { + "epoch": 0.8615467820443483, + "grad_norm": 1.5331083536148071, + "learning_rate": 1.906173293108224e-05, + "loss": 2.0825, + "mean_token_accuracy": 0.5336595773696899, + "num_tokens": 1629204139.0, + "step": 3186 + }, + { + "epoch": 0.861817198485668, + "grad_norm": 1.3931379318237305, + "learning_rate": 1.906103196027831e-05, + "loss": 2.1637, + "mean_token_accuracy": 0.5194077491760254, + "num_tokens": 1629629630.0, + "step": 3187 + }, + { + "epoch": 0.8620876149269876, + "grad_norm": 1.5541179180145264, + "learning_rate": 1.906033074213821e-05, + "loss": 2.099, + "mean_token_accuracy": 0.5308583378791809, + "num_tokens": 1630153844.0, + "step": 3188 + }, + { + "epoch": 0.8623580313683072, + "grad_norm": 1.4161561727523804, + "learning_rate": 1.9059629276683464e-05, + "loss": 1.9383, + "mean_token_accuracy": 0.5572459101676941, + "num_tokens": 1630669262.0, + "step": 3189 + }, + { + "epoch": 0.8626284478096268, + "grad_norm": 1.4686418771743774, + "learning_rate": 1.9058927563935592e-05, + "loss": 2.1312, + "mean_token_accuracy": 0.5284469127655029, + "num_tokens": 1631193453.0, + "step": 3190 + }, + { + "epoch": 0.8628988642509464, + "grad_norm": 1.687522053718567, + "learning_rate": 1.905822560391612e-05, + "loss": 2.1962, + "mean_token_accuracy": 0.527838945388794, + "num_tokens": 1631717465.0, + "step": 3191 + }, + { + "epoch": 0.8631692806922661, + "grad_norm": 1.3249844312667847, + "learning_rate": 1.9057523396646587e-05, + "loss": 2.1776, + "mean_token_accuracy": 0.5205957889556885, + "num_tokens": 1632241626.0, + "step": 3192 + }, + { + "epoch": 0.8634396971335857, + "grad_norm": 1.6049326658248901, + "learning_rate": 1.9056820942148542e-05, + "loss": 2.129, + "mean_token_accuracy": 0.5001600384712219, + "num_tokens": 1632765590.0, + "step": 3193 + }, + { + "epoch": 0.8637101135749053, + "grad_norm": 1.3955336809158325, + "learning_rate": 1.9056118240443537e-05, + "loss": 2.0349, + "mean_token_accuracy": 0.5405848026275635, + "num_tokens": 1633289868.0, + "step": 3194 + }, + { + "epoch": 0.863980530016225, + "grad_norm": 1.6652737855911255, + "learning_rate": 1.9055415291553133e-05, + "loss": 2.2131, + "mean_token_accuracy": 0.5220987796783447, + "num_tokens": 1633814102.0, + "step": 3195 + }, + { + "epoch": 0.8642509464575446, + "grad_norm": 1.574709415435791, + "learning_rate": 1.9054712095498898e-05, + "loss": 2.1289, + "mean_token_accuracy": 0.5232770442962646, + "num_tokens": 1634338355.0, + "step": 3196 + }, + { + "epoch": 0.8645213628988643, + "grad_norm": 1.6456891298294067, + "learning_rate": 1.905400865230241e-05, + "loss": 2.144, + "mean_token_accuracy": 0.5265499353408813, + "num_tokens": 1634862603.0, + "step": 3197 + }, + { + "epoch": 0.8647917793401839, + "grad_norm": 1.5469337701797485, + "learning_rate": 1.905330496198525e-05, + "loss": 2.1169, + "mean_token_accuracy": 0.550636887550354, + "num_tokens": 1635351596.0, + "step": 3198 + }, + { + "epoch": 0.8650621957815036, + "grad_norm": 1.3436473608016968, + "learning_rate": 1.905260102456901e-05, + "loss": 2.1081, + "mean_token_accuracy": 0.5379206538200378, + "num_tokens": 1635833883.0, + "step": 3199 + }, + { + "epoch": 0.8653326122228231, + "grad_norm": 1.5868263244628906, + "learning_rate": 1.9051896840075292e-05, + "loss": 2.1366, + "mean_token_accuracy": 0.5338678359985352, + "num_tokens": 1636292422.0, + "step": 3200 + }, + { + "epoch": 0.8656030286641427, + "grad_norm": 0.6883298754692078, + "learning_rate": 1.9051192408525698e-05, + "loss": 1.214, + "mean_token_accuracy": 0.6877666115760803, + "num_tokens": 1636816628.0, + "step": 3201 + }, + { + "epoch": 0.8658734451054624, + "grad_norm": 2.028073787689209, + "learning_rate": 1.905048772994184e-05, + "loss": 2.1918, + "mean_token_accuracy": 0.5282242298126221, + "num_tokens": 1637268156.0, + "step": 3202 + }, + { + "epoch": 0.866143861546782, + "grad_norm": 1.7990387678146362, + "learning_rate": 1.904978280434535e-05, + "loss": 2.2971, + "mean_token_accuracy": 0.5005111694335938, + "num_tokens": 1637792422.0, + "step": 3203 + }, + { + "epoch": 0.8664142779881017, + "grad_norm": 1.1747812032699585, + "learning_rate": 1.9049077631757847e-05, + "loss": 2.2093, + "mean_token_accuracy": 0.517953097820282, + "num_tokens": 1638316707.0, + "step": 3204 + }, + { + "epoch": 0.8666846944294213, + "grad_norm": 1.3948066234588623, + "learning_rate": 1.9048372212200973e-05, + "loss": 2.1007, + "mean_token_accuracy": 0.5309860706329346, + "num_tokens": 1638783226.0, + "step": 3205 + }, + { + "epoch": 0.866955110870741, + "grad_norm": 1.3352117538452148, + "learning_rate": 1.904766654569637e-05, + "loss": 2.0697, + "mean_token_accuracy": 0.5063292980194092, + "num_tokens": 1639307280.0, + "step": 3206 + }, + { + "epoch": 0.8672255273120606, + "grad_norm": 1.8152804374694824, + "learning_rate": 1.904696063226569e-05, + "loss": 1.9685, + "mean_token_accuracy": 0.565610408782959, + "num_tokens": 1639738280.0, + "step": 3207 + }, + { + "epoch": 0.8674959437533802, + "grad_norm": 1.6441054344177246, + "learning_rate": 1.9046254471930595e-05, + "loss": 2.1487, + "mean_token_accuracy": 0.5216021537780762, + "num_tokens": 1640262329.0, + "step": 3208 + }, + { + "epoch": 0.8677663601946999, + "grad_norm": 1.3256644010543823, + "learning_rate": 1.904554806471275e-05, + "loss": 2.0465, + "mean_token_accuracy": 0.546785831451416, + "num_tokens": 1640786495.0, + "step": 3209 + }, + { + "epoch": 0.8680367766360195, + "grad_norm": 1.4263827800750732, + "learning_rate": 1.904484141063383e-05, + "loss": 2.1516, + "mean_token_accuracy": 0.521571934223175, + "num_tokens": 1641251773.0, + "step": 3210 + }, + { + "epoch": 0.8683071930773391, + "grad_norm": 33.031246185302734, + "learning_rate": 1.9044134509715516e-05, + "loss": 2.0289, + "mean_token_accuracy": 0.5216419100761414, + "num_tokens": 1641775942.0, + "step": 3211 + }, + { + "epoch": 0.8685776095186587, + "grad_norm": 2.6376657485961914, + "learning_rate": 1.90434273619795e-05, + "loss": 1.941, + "mean_token_accuracy": 0.5821059346199036, + "num_tokens": 1642300165.0, + "step": 3212 + }, + { + "epoch": 0.8688480259599783, + "grad_norm": 1.7843352556228638, + "learning_rate": 1.9042719967447475e-05, + "loss": 1.9932, + "mean_token_accuracy": 0.5431210398674011, + "num_tokens": 1642824372.0, + "step": 3213 + }, + { + "epoch": 0.869118442401298, + "grad_norm": 2.1521308422088623, + "learning_rate": 1.9042012326141157e-05, + "loss": 2.1605, + "mean_token_accuracy": 0.5214024782180786, + "num_tokens": 1643347673.0, + "step": 3214 + }, + { + "epoch": 0.8693888588426176, + "grad_norm": 1.5073347091674805, + "learning_rate": 1.9041304438082246e-05, + "loss": 2.0853, + "mean_token_accuracy": 0.5343065857887268, + "num_tokens": 1643860391.0, + "step": 3215 + }, + { + "epoch": 0.8696592752839373, + "grad_norm": 1.8553047180175781, + "learning_rate": 1.9040596303292464e-05, + "loss": 2.0746, + "mean_token_accuracy": 0.5143896341323853, + "num_tokens": 1644384516.0, + "step": 3216 + }, + { + "epoch": 0.8699296917252569, + "grad_norm": 1.2196576595306396, + "learning_rate": 1.9039887921793544e-05, + "loss": 2.063, + "mean_token_accuracy": 0.505172610282898, + "num_tokens": 1644908688.0, + "step": 3217 + }, + { + "epoch": 0.8702001081665766, + "grad_norm": 1.5985050201416016, + "learning_rate": 1.9039179293607222e-05, + "loss": 2.0781, + "mean_token_accuracy": 0.5346366167068481, + "num_tokens": 1645432845.0, + "step": 3218 + }, + { + "epoch": 0.8704705246078962, + "grad_norm": 1.429444670677185, + "learning_rate": 1.9038470418755236e-05, + "loss": 2.0644, + "mean_token_accuracy": 0.5243058204650879, + "num_tokens": 1645957069.0, + "step": 3219 + }, + { + "epoch": 0.8707409410492158, + "grad_norm": 1.388048768043518, + "learning_rate": 1.9037761297259334e-05, + "loss": 1.968, + "mean_token_accuracy": 0.5575254559516907, + "num_tokens": 1646474570.0, + "step": 3220 + }, + { + "epoch": 0.8710113574905354, + "grad_norm": 0.8036924004554749, + "learning_rate": 1.9037051929141282e-05, + "loss": 1.1956, + "mean_token_accuracy": 0.6916208863258362, + "num_tokens": 1646930689.0, + "step": 3221 + }, + { + "epoch": 0.871281773931855, + "grad_norm": 2.453169584274292, + "learning_rate": 1.903634231442284e-05, + "loss": 2.1738, + "mean_token_accuracy": 0.5260593891143799, + "num_tokens": 1647454766.0, + "step": 3222 + }, + { + "epoch": 0.8715521903731747, + "grad_norm": 1.8506579399108887, + "learning_rate": 1.9035632453125783e-05, + "loss": 2.0434, + "mean_token_accuracy": 0.5394012331962585, + "num_tokens": 1647978966.0, + "step": 3223 + }, + { + "epoch": 0.8718226068144943, + "grad_norm": 1.5264393091201782, + "learning_rate": 1.903492234527189e-05, + "loss": 2.0784, + "mean_token_accuracy": 0.5349017977714539, + "num_tokens": 1648503173.0, + "step": 3224 + }, + { + "epoch": 0.872093023255814, + "grad_norm": 2.048784017562866, + "learning_rate": 1.903421199088295e-05, + "loss": 2.2033, + "mean_token_accuracy": 0.5215390920639038, + "num_tokens": 1649027428.0, + "step": 3225 + }, + { + "epoch": 0.8723634396971336, + "grad_norm": 1.6777337789535522, + "learning_rate": 1.9033501389980765e-05, + "loss": 2.0848, + "mean_token_accuracy": 0.5512753129005432, + "num_tokens": 1649551574.0, + "step": 3226 + }, + { + "epoch": 0.8726338561384532, + "grad_norm": 1.4575144052505493, + "learning_rate": 1.903279054258712e-05, + "loss": 2.1382, + "mean_token_accuracy": 0.5207399129867554, + "num_tokens": 1650075771.0, + "step": 3227 + }, + { + "epoch": 0.8729042725797729, + "grad_norm": 1.5315454006195068, + "learning_rate": 1.9032079448723852e-05, + "loss": 2.0844, + "mean_token_accuracy": 0.5321866273880005, + "num_tokens": 1650599954.0, + "step": 3228 + }, + { + "epoch": 0.8731746890210925, + "grad_norm": 1.5131165981292725, + "learning_rate": 1.9031368108412754e-05, + "loss": 2.0669, + "mean_token_accuracy": 0.5275248289108276, + "num_tokens": 1651124074.0, + "step": 3229 + }, + { + "epoch": 0.8734451054624122, + "grad_norm": 1.3430471420288086, + "learning_rate": 1.9030656521675673e-05, + "loss": 2.1338, + "mean_token_accuracy": 0.5335515737533569, + "num_tokens": 1651623273.0, + "step": 3230 + }, + { + "epoch": 0.8737155219037317, + "grad_norm": 1.5182379484176636, + "learning_rate": 1.902994468853443e-05, + "loss": 2.2012, + "mean_token_accuracy": 0.5163933634757996, + "num_tokens": 1652147448.0, + "step": 3231 + }, + { + "epoch": 0.8739859383450513, + "grad_norm": 1.7572675943374634, + "learning_rate": 1.9029232609010866e-05, + "loss": 2.161, + "mean_token_accuracy": 0.5353212356567383, + "num_tokens": 1652671608.0, + "step": 3232 + }, + { + "epoch": 0.874256354786371, + "grad_norm": 1.4876466989517212, + "learning_rate": 1.9028520283126838e-05, + "loss": 2.2313, + "mean_token_accuracy": 0.5103812217712402, + "num_tokens": 1653171493.0, + "step": 3233 + }, + { + "epoch": 0.8745267712276906, + "grad_norm": 1.6541962623596191, + "learning_rate": 1.9027807710904192e-05, + "loss": 2.1059, + "mean_token_accuracy": 0.5359803438186646, + "num_tokens": 1653695657.0, + "step": 3234 + }, + { + "epoch": 0.8747971876690103, + "grad_norm": 1.5180749893188477, + "learning_rate": 1.9027094892364805e-05, + "loss": 2.1603, + "mean_token_accuracy": 0.5209958553314209, + "num_tokens": 1654204048.0, + "step": 3235 + }, + { + "epoch": 0.8750676041103299, + "grad_norm": 1.4264920949935913, + "learning_rate": 1.9026381827530535e-05, + "loss": 2.1099, + "mean_token_accuracy": 0.5336133241653442, + "num_tokens": 1654728102.0, + "step": 3236 + }, + { + "epoch": 0.8753380205516496, + "grad_norm": 1.4152966737747192, + "learning_rate": 1.9025668516423268e-05, + "loss": 2.1093, + "mean_token_accuracy": 0.5333299040794373, + "num_tokens": 1655252377.0, + "step": 3237 + }, + { + "epoch": 0.8756084369929692, + "grad_norm": 1.7573885917663574, + "learning_rate": 1.902495495906489e-05, + "loss": 2.1506, + "mean_token_accuracy": 0.5374822616577148, + "num_tokens": 1655776570.0, + "step": 3238 + }, + { + "epoch": 0.8758788534342888, + "grad_norm": 1.4579367637634277, + "learning_rate": 1.902424115547729e-05, + "loss": 2.158, + "mean_token_accuracy": 0.5201456546783447, + "num_tokens": 1656300826.0, + "step": 3239 + }, + { + "epoch": 0.8761492698756085, + "grad_norm": 1.7382866144180298, + "learning_rate": 1.9023527105682375e-05, + "loss": 2.1816, + "mean_token_accuracy": 0.5204137563705444, + "num_tokens": 1656825006.0, + "step": 3240 + }, + { + "epoch": 0.876419686316928, + "grad_norm": 0.8407219052314758, + "learning_rate": 1.9022812809702054e-05, + "loss": 1.1948, + "mean_token_accuracy": 0.6884245872497559, + "num_tokens": 1657311913.0, + "step": 3241 + }, + { + "epoch": 0.8766901027582477, + "grad_norm": 1.8433376550674438, + "learning_rate": 1.9022098267558243e-05, + "loss": 2.1066, + "mean_token_accuracy": 0.5302565097808838, + "num_tokens": 1657836141.0, + "step": 3242 + }, + { + "epoch": 0.8769605191995673, + "grad_norm": 1.636946439743042, + "learning_rate": 1.9021383479272865e-05, + "loss": 2.1319, + "mean_token_accuracy": 0.5232928991317749, + "num_tokens": 1658360359.0, + "step": 3243 + }, + { + "epoch": 0.8772309356408869, + "grad_norm": 1.5356703996658325, + "learning_rate": 1.9020668444867855e-05, + "loss": 2.2591, + "mean_token_accuracy": 0.522484302520752, + "num_tokens": 1658884634.0, + "step": 3244 + }, + { + "epoch": 0.8775013520822066, + "grad_norm": 1.6088563203811646, + "learning_rate": 1.9019953164365147e-05, + "loss": 2.248, + "mean_token_accuracy": 0.502951979637146, + "num_tokens": 1659376133.0, + "step": 3245 + }, + { + "epoch": 0.8777717685235262, + "grad_norm": 2.1490044593811035, + "learning_rate": 1.901923763778669e-05, + "loss": 2.1253, + "mean_token_accuracy": 0.5444620847702026, + "num_tokens": 1659894840.0, + "step": 3246 + }, + { + "epoch": 0.8780421849648459, + "grad_norm": 1.8686856031417847, + "learning_rate": 1.901852186515444e-05, + "loss": 2.1273, + "mean_token_accuracy": 0.5655754208564758, + "num_tokens": 1660364320.0, + "step": 3247 + }, + { + "epoch": 0.8783126014061655, + "grad_norm": 4.039051055908203, + "learning_rate": 1.9017805846490358e-05, + "loss": 1.9645, + "mean_token_accuracy": 0.564423680305481, + "num_tokens": 1660874089.0, + "step": 3248 + }, + { + "epoch": 0.8785830178474852, + "grad_norm": 2.979260206222534, + "learning_rate": 1.9017089581816412e-05, + "loss": 2.2168, + "mean_token_accuracy": 0.5168936252593994, + "num_tokens": 1661398237.0, + "step": 3249 + }, + { + "epoch": 0.8788534342888048, + "grad_norm": 2.257073402404785, + "learning_rate": 1.901637307115459e-05, + "loss": 2.2081, + "mean_token_accuracy": 0.5234878659248352, + "num_tokens": 1661922498.0, + "step": 3250 + }, + { + "epoch": 0.8791238507301244, + "grad_norm": 1.7244930267333984, + "learning_rate": 1.901565631452686e-05, + "loss": 2.2594, + "mean_token_accuracy": 0.5232315063476562, + "num_tokens": 1662416055.0, + "step": 3251 + }, + { + "epoch": 0.879394267171444, + "grad_norm": 2.092893600463867, + "learning_rate": 1.901493931195522e-05, + "loss": 2.1491, + "mean_token_accuracy": 0.5242464542388916, + "num_tokens": 1662940322.0, + "step": 3252 + }, + { + "epoch": 0.8796646836127636, + "grad_norm": 1.9523205757141113, + "learning_rate": 1.9014222063461672e-05, + "loss": 2.1079, + "mean_token_accuracy": 0.5353926420211792, + "num_tokens": 1663430053.0, + "step": 3253 + }, + { + "epoch": 0.8799351000540833, + "grad_norm": 1.4117342233657837, + "learning_rate": 1.9013504569068228e-05, + "loss": 2.121, + "mean_token_accuracy": 0.5103268623352051, + "num_tokens": 1663954218.0, + "step": 3254 + }, + { + "epoch": 0.8802055164954029, + "grad_norm": 1.5124568939208984, + "learning_rate": 1.901278682879689e-05, + "loss": 2.1951, + "mean_token_accuracy": 0.5254464149475098, + "num_tokens": 1664475497.0, + "step": 3255 + }, + { + "epoch": 0.8804759329367225, + "grad_norm": 1.5935618877410889, + "learning_rate": 1.9012068842669697e-05, + "loss": 2.2096, + "mean_token_accuracy": 0.537678062915802, + "num_tokens": 1664960729.0, + "step": 3256 + }, + { + "epoch": 0.8807463493780422, + "grad_norm": 1.5829899311065674, + "learning_rate": 1.9011350610708663e-05, + "loss": 2.1672, + "mean_token_accuracy": 0.5273723602294922, + "num_tokens": 1665484925.0, + "step": 3257 + }, + { + "epoch": 0.8810167658193618, + "grad_norm": 1.5039933919906616, + "learning_rate": 1.9010632132935836e-05, + "loss": 2.2503, + "mean_token_accuracy": 0.5158757567405701, + "num_tokens": 1666009097.0, + "step": 3258 + }, + { + "epoch": 0.8812871822606815, + "grad_norm": 1.7279727458953857, + "learning_rate": 1.9009913409373257e-05, + "loss": 2.213, + "mean_token_accuracy": 0.5202317833900452, + "num_tokens": 1666533335.0, + "step": 3259 + }, + { + "epoch": 0.8815575987020011, + "grad_norm": 1.419427752494812, + "learning_rate": 1.900919444004298e-05, + "loss": 1.9353, + "mean_token_accuracy": 0.5759384632110596, + "num_tokens": 1667057614.0, + "step": 3260 + }, + { + "epoch": 0.8818280151433208, + "grad_norm": 0.7466105222702026, + "learning_rate": 1.900847522496706e-05, + "loss": 1.1413, + "mean_token_accuracy": 0.6969797015190125, + "num_tokens": 1667581822.0, + "step": 3261 + }, + { + "epoch": 0.8820984315846403, + "grad_norm": 1.612399697303772, + "learning_rate": 1.900775576416757e-05, + "loss": 2.0299, + "mean_token_accuracy": 0.5398379564285278, + "num_tokens": 1668054447.0, + "step": 3262 + }, + { + "epoch": 0.8823688480259599, + "grad_norm": 1.5857666730880737, + "learning_rate": 1.9007036057666587e-05, + "loss": 2.2269, + "mean_token_accuracy": 0.5261431932449341, + "num_tokens": 1668568646.0, + "step": 3263 + }, + { + "epoch": 0.8826392644672796, + "grad_norm": 1.339415431022644, + "learning_rate": 1.900631610548619e-05, + "loss": 2.2026, + "mean_token_accuracy": 0.5280871391296387, + "num_tokens": 1669092923.0, + "step": 3264 + }, + { + "epoch": 0.8829096809085992, + "grad_norm": 1.667464256286621, + "learning_rate": 1.9005595907648468e-05, + "loss": 2.2479, + "mean_token_accuracy": 0.4991876184940338, + "num_tokens": 1669617142.0, + "step": 3265 + }, + { + "epoch": 0.8831800973499189, + "grad_norm": 1.6526750326156616, + "learning_rate": 1.9004875464175522e-05, + "loss": 2.2594, + "mean_token_accuracy": 0.5396242141723633, + "num_tokens": 1670101798.0, + "step": 3266 + }, + { + "epoch": 0.8834505137912385, + "grad_norm": 1.5985091924667358, + "learning_rate": 1.9004154775089453e-05, + "loss": 2.1634, + "mean_token_accuracy": 0.5525082349777222, + "num_tokens": 1670520193.0, + "step": 3267 + }, + { + "epoch": 0.8837209302325582, + "grad_norm": 1.6578978300094604, + "learning_rate": 1.900343384041238e-05, + "loss": 2.2165, + "mean_token_accuracy": 0.525216281414032, + "num_tokens": 1671044455.0, + "step": 3268 + }, + { + "epoch": 0.8839913466738778, + "grad_norm": 1.8279049396514893, + "learning_rate": 1.9002712660166417e-05, + "loss": 2.2444, + "mean_token_accuracy": 0.5220087766647339, + "num_tokens": 1671568730.0, + "step": 3269 + }, + { + "epoch": 0.8842617631151974, + "grad_norm": 1.4706748723983765, + "learning_rate": 1.90019912343737e-05, + "loss": 2.0033, + "mean_token_accuracy": 0.5488289594650269, + "num_tokens": 1672092900.0, + "step": 3270 + }, + { + "epoch": 0.8845321795565171, + "grad_norm": 1.452735185623169, + "learning_rate": 1.900126956305636e-05, + "loss": 2.2051, + "mean_token_accuracy": 0.5174098610877991, + "num_tokens": 1672617079.0, + "step": 3271 + }, + { + "epoch": 0.8848025959978366, + "grad_norm": 1.6608498096466064, + "learning_rate": 1.9000547646236534e-05, + "loss": 2.1026, + "mean_token_accuracy": 0.5410302877426147, + "num_tokens": 1673141350.0, + "step": 3272 + }, + { + "epoch": 0.8850730124391563, + "grad_norm": 1.5340676307678223, + "learning_rate": 1.8999825483936376e-05, + "loss": 2.0266, + "mean_token_accuracy": 0.5399760007858276, + "num_tokens": 1673615023.0, + "step": 3273 + }, + { + "epoch": 0.8853434288804759, + "grad_norm": 1.5970752239227295, + "learning_rate": 1.899910307617805e-05, + "loss": 2.1297, + "mean_token_accuracy": 0.530010998249054, + "num_tokens": 1674139289.0, + "step": 3274 + }, + { + "epoch": 0.8856138453217955, + "grad_norm": 1.3114893436431885, + "learning_rate": 1.8998380422983723e-05, + "loss": 2.1308, + "mean_token_accuracy": 0.5465285778045654, + "num_tokens": 1674603358.0, + "step": 3275 + }, + { + "epoch": 0.8858842617631152, + "grad_norm": 1.747964859008789, + "learning_rate": 1.8997657524375558e-05, + "loss": 2.1721, + "mean_token_accuracy": 0.52359938621521, + "num_tokens": 1675127536.0, + "step": 3276 + }, + { + "epoch": 0.8861546782044348, + "grad_norm": 1.491047739982605, + "learning_rate": 1.899693438037574e-05, + "loss": 2.1792, + "mean_token_accuracy": 0.5455535650253296, + "num_tokens": 1675651805.0, + "step": 3277 + }, + { + "epoch": 0.8864250946457545, + "grad_norm": 1.6417593955993652, + "learning_rate": 1.8996210991006458e-05, + "loss": 2.1691, + "mean_token_accuracy": 0.5417659282684326, + "num_tokens": 1676120189.0, + "step": 3278 + }, + { + "epoch": 0.8866955110870741, + "grad_norm": 2.7084462642669678, + "learning_rate": 1.8995487356289907e-05, + "loss": 1.8742, + "mean_token_accuracy": 0.5964856743812561, + "num_tokens": 1676583526.0, + "step": 3279 + }, + { + "epoch": 0.8869659275283938, + "grad_norm": 1.8030097484588623, + "learning_rate": 1.8994763476248295e-05, + "loss": 2.1695, + "mean_token_accuracy": 0.5293769836425781, + "num_tokens": 1677107799.0, + "step": 3280 + }, + { + "epoch": 0.8872363439697134, + "grad_norm": 0.8872600793838501, + "learning_rate": 1.8994039350903827e-05, + "loss": 1.2317, + "mean_token_accuracy": 0.6802483201026917, + "num_tokens": 1677632028.0, + "step": 3281 + }, + { + "epoch": 0.8875067604110329, + "grad_norm": 1.5594545602798462, + "learning_rate": 1.899331498027872e-05, + "loss": 2.1384, + "mean_token_accuracy": 0.538915753364563, + "num_tokens": 1678156123.0, + "step": 3282 + }, + { + "epoch": 0.8877771768523526, + "grad_norm": 1.4511662721633911, + "learning_rate": 1.89925903643952e-05, + "loss": 2.1305, + "mean_token_accuracy": 0.5297475457191467, + "num_tokens": 1678680269.0, + "step": 3283 + }, + { + "epoch": 0.8880475932936722, + "grad_norm": 0.9990242719650269, + "learning_rate": 1.899186550327551e-05, + "loss": 2.1048, + "mean_token_accuracy": 0.523471474647522, + "num_tokens": 1679204498.0, + "step": 3284 + }, + { + "epoch": 0.8883180097349919, + "grad_norm": 1.4697989225387573, + "learning_rate": 1.899114039694188e-05, + "loss": 2.0906, + "mean_token_accuracy": 0.5486829876899719, + "num_tokens": 1679707306.0, + "step": 3285 + }, + { + "epoch": 0.8885884261763115, + "grad_norm": 1.3152687549591064, + "learning_rate": 1.8990415045416563e-05, + "loss": 2.1242, + "mean_token_accuracy": 0.5394425392150879, + "num_tokens": 1680187603.0, + "step": 3286 + }, + { + "epoch": 0.8888588426176312, + "grad_norm": 1.6189724206924438, + "learning_rate": 1.8989689448721815e-05, + "loss": 2.0544, + "mean_token_accuracy": 0.5279932022094727, + "num_tokens": 1680711757.0, + "step": 3287 + }, + { + "epoch": 0.8891292590589508, + "grad_norm": 1.3553009033203125, + "learning_rate": 1.8988963606879895e-05, + "loss": 2.0604, + "mean_token_accuracy": 0.5238977074623108, + "num_tokens": 1681236026.0, + "step": 3288 + }, + { + "epoch": 0.8893996755002704, + "grad_norm": 1.590701699256897, + "learning_rate": 1.898823751991308e-05, + "loss": 2.1204, + "mean_token_accuracy": 0.5312274694442749, + "num_tokens": 1681760234.0, + "step": 3289 + }, + { + "epoch": 0.8896700919415901, + "grad_norm": 1.3891716003417969, + "learning_rate": 1.8987511187843645e-05, + "loss": 2.1553, + "mean_token_accuracy": 0.517246425151825, + "num_tokens": 1682284471.0, + "step": 3290 + }, + { + "epoch": 0.8899405083829097, + "grad_norm": 1.6126587390899658, + "learning_rate": 1.898678461069388e-05, + "loss": 2.2273, + "mean_token_accuracy": 0.5199968814849854, + "num_tokens": 1682808562.0, + "step": 3291 + }, + { + "epoch": 0.8902109248242294, + "grad_norm": 1.5493894815444946, + "learning_rate": 1.8986057788486074e-05, + "loss": 2.137, + "mean_token_accuracy": 0.5363516807556152, + "num_tokens": 1683332628.0, + "step": 3292 + }, + { + "epoch": 0.8904813412655489, + "grad_norm": 1.5340819358825684, + "learning_rate": 1.898533072124253e-05, + "loss": 2.0442, + "mean_token_accuracy": 0.5401300191879272, + "num_tokens": 1683856838.0, + "step": 3293 + }, + { + "epoch": 0.8907517577068685, + "grad_norm": 1.6332353353500366, + "learning_rate": 1.8984603408985555e-05, + "loss": 2.1697, + "mean_token_accuracy": 0.5155630707740784, + "num_tokens": 1684364056.0, + "step": 3294 + }, + { + "epoch": 0.8910221741481882, + "grad_norm": 1.3600378036499023, + "learning_rate": 1.8983875851737464e-05, + "loss": 2.1573, + "mean_token_accuracy": 0.5214183330535889, + "num_tokens": 1684888321.0, + "step": 3295 + }, + { + "epoch": 0.8912925905895078, + "grad_norm": 1.4491345882415771, + "learning_rate": 1.8983148049520588e-05, + "loss": 2.0712, + "mean_token_accuracy": 0.5158506035804749, + "num_tokens": 1685412517.0, + "step": 3296 + }, + { + "epoch": 0.8915630070308275, + "grad_norm": 1.4333903789520264, + "learning_rate": 1.898242000235725e-05, + "loss": 2.0584, + "mean_token_accuracy": 0.5325069427490234, + "num_tokens": 1685936765.0, + "step": 3297 + }, + { + "epoch": 0.8918334234721471, + "grad_norm": 1.116274356842041, + "learning_rate": 1.898169171026979e-05, + "loss": 2.1558, + "mean_token_accuracy": 0.5414159893989563, + "num_tokens": 1686433886.0, + "step": 3298 + }, + { + "epoch": 0.8921038399134668, + "grad_norm": 1.386544108390808, + "learning_rate": 1.8980963173280558e-05, + "loss": 2.1156, + "mean_token_accuracy": 0.5394917130470276, + "num_tokens": 1686958004.0, + "step": 3299 + }, + { + "epoch": 0.8923742563547864, + "grad_norm": 1.5748142004013062, + "learning_rate": 1.8980234391411904e-05, + "loss": 2.1543, + "mean_token_accuracy": 0.5335731506347656, + "num_tokens": 1687465880.0, + "step": 3300 + }, + { + "epoch": 0.892644672796106, + "grad_norm": 1.2016125917434692, + "learning_rate": 1.8979505364686194e-05, + "loss": 1.2654, + "mean_token_accuracy": 0.6635416746139526, + "num_tokens": 1687990074.0, + "step": 3301 + }, + { + "epoch": 0.8929150892374257, + "grad_norm": 2.2199978828430176, + "learning_rate": 1.897877609312579e-05, + "loss": 2.132, + "mean_token_accuracy": 0.5384634733200073, + "num_tokens": 1688469941.0, + "step": 3302 + }, + { + "epoch": 0.8931855056787452, + "grad_norm": 1.489335536956787, + "learning_rate": 1.897804657675307e-05, + "loss": 2.0546, + "mean_token_accuracy": 0.5305145978927612, + "num_tokens": 1688948583.0, + "step": 3303 + }, + { + "epoch": 0.8934559221200649, + "grad_norm": 1.3318266868591309, + "learning_rate": 1.8977316815590426e-05, + "loss": 2.1531, + "mean_token_accuracy": 0.5327754616737366, + "num_tokens": 1689432395.0, + "step": 3304 + }, + { + "epoch": 0.8937263385613845, + "grad_norm": 1.4309576749801636, + "learning_rate": 1.8976586809660236e-05, + "loss": 2.0751, + "mean_token_accuracy": 0.5352158546447754, + "num_tokens": 1689931914.0, + "step": 3305 + }, + { + "epoch": 0.8939967550027041, + "grad_norm": 1.293602466583252, + "learning_rate": 1.8975856558984908e-05, + "loss": 1.968, + "mean_token_accuracy": 0.5570103526115417, + "num_tokens": 1690415866.0, + "step": 3306 + }, + { + "epoch": 0.8942671714440238, + "grad_norm": 1.5835446119308472, + "learning_rate": 1.897512606358684e-05, + "loss": 2.0861, + "mean_token_accuracy": 0.5419412851333618, + "num_tokens": 1690892915.0, + "step": 3307 + }, + { + "epoch": 0.8945375878853434, + "grad_norm": 1.3185702562332153, + "learning_rate": 1.8974395323488454e-05, + "loss": 2.1202, + "mean_token_accuracy": 0.5409334897994995, + "num_tokens": 1691395385.0, + "step": 3308 + }, + { + "epoch": 0.8948080043266631, + "grad_norm": 1.289047360420227, + "learning_rate": 1.897366433871217e-05, + "loss": 2.1203, + "mean_token_accuracy": 0.5376955270767212, + "num_tokens": 1691862367.0, + "step": 3309 + }, + { + "epoch": 0.8950784207679827, + "grad_norm": 1.3873484134674072, + "learning_rate": 1.8972933109280416e-05, + "loss": 2.0698, + "mean_token_accuracy": 0.5358260869979858, + "num_tokens": 1692386486.0, + "step": 3310 + }, + { + "epoch": 0.8953488372093024, + "grad_norm": 1.537113070487976, + "learning_rate": 1.8972201635215625e-05, + "loss": 2.1897, + "mean_token_accuracy": 0.5312743782997131, + "num_tokens": 1692863921.0, + "step": 3311 + }, + { + "epoch": 0.895619253650622, + "grad_norm": 1.7689588069915771, + "learning_rate": 1.8971469916540245e-05, + "loss": 2.2187, + "mean_token_accuracy": 0.525603175163269, + "num_tokens": 1693353717.0, + "step": 3312 + }, + { + "epoch": 0.8958896700919415, + "grad_norm": 1.869495153427124, + "learning_rate": 1.8970737953276723e-05, + "loss": 2.1555, + "mean_token_accuracy": 0.5180287957191467, + "num_tokens": 1693877869.0, + "step": 3313 + }, + { + "epoch": 0.8961600865332612, + "grad_norm": 1.3782882690429688, + "learning_rate": 1.897000574544752e-05, + "loss": 2.1604, + "mean_token_accuracy": 0.5377712249755859, + "num_tokens": 1694370856.0, + "step": 3314 + }, + { + "epoch": 0.8964305029745808, + "grad_norm": 1.2842590808868408, + "learning_rate": 1.8969273293075105e-05, + "loss": 2.0999, + "mean_token_accuracy": 0.5199004411697388, + "num_tokens": 1694895041.0, + "step": 3315 + }, + { + "epoch": 0.8967009194159005, + "grad_norm": 1.447857141494751, + "learning_rate": 1.8968540596181947e-05, + "loss": 2.1745, + "mean_token_accuracy": 0.5149269104003906, + "num_tokens": 1695419271.0, + "step": 3316 + }, + { + "epoch": 0.8969713358572201, + "grad_norm": 1.5313407182693481, + "learning_rate": 1.896780765479053e-05, + "loss": 2.1628, + "mean_token_accuracy": 0.5311118364334106, + "num_tokens": 1695943418.0, + "step": 3317 + }, + { + "epoch": 0.8972417522985398, + "grad_norm": 1.3419835567474365, + "learning_rate": 1.8967074468923344e-05, + "loss": 2.0262, + "mean_token_accuracy": 0.5261580944061279, + "num_tokens": 1696467627.0, + "step": 3318 + }, + { + "epoch": 0.8975121687398594, + "grad_norm": 1.7792840003967285, + "learning_rate": 1.8966341038602883e-05, + "loss": 2.1495, + "mean_token_accuracy": 0.5433116555213928, + "num_tokens": 1696950949.0, + "step": 3319 + }, + { + "epoch": 0.897782585181179, + "grad_norm": 1.563547134399414, + "learning_rate": 1.896560736385165e-05, + "loss": 2.1695, + "mean_token_accuracy": 0.5381946563720703, + "num_tokens": 1697436670.0, + "step": 3320 + }, + { + "epoch": 0.8980530016224987, + "grad_norm": 0.7194002270698547, + "learning_rate": 1.8964873444692163e-05, + "loss": 1.1675, + "mean_token_accuracy": 0.6916494369506836, + "num_tokens": 1697960876.0, + "step": 3321 + }, + { + "epoch": 0.8983234180638183, + "grad_norm": 2.3596560955047607, + "learning_rate": 1.896413928114693e-05, + "loss": 2.2289, + "mean_token_accuracy": 0.5169621706008911, + "num_tokens": 1698480394.0, + "step": 3322 + }, + { + "epoch": 0.898593834505138, + "grad_norm": 1.813738226890564, + "learning_rate": 1.8963404873238486e-05, + "loss": 2.111, + "mean_token_accuracy": 0.5199013948440552, + "num_tokens": 1699004479.0, + "step": 3323 + }, + { + "epoch": 0.8988642509464575, + "grad_norm": 1.5462764501571655, + "learning_rate": 1.896267022098936e-05, + "loss": 2.0272, + "mean_token_accuracy": 0.5469118356704712, + "num_tokens": 1699468290.0, + "step": 3324 + }, + { + "epoch": 0.8991346673877771, + "grad_norm": 1.6021064519882202, + "learning_rate": 1.896193532442209e-05, + "loss": 2.0835, + "mean_token_accuracy": 0.5318729877471924, + "num_tokens": 1699966113.0, + "step": 3325 + }, + { + "epoch": 0.8994050838290968, + "grad_norm": 1.4009041786193848, + "learning_rate": 1.896120018355924e-05, + "loss": 1.9146, + "mean_token_accuracy": 0.5368547439575195, + "num_tokens": 1700490329.0, + "step": 3326 + }, + { + "epoch": 0.8996755002704164, + "grad_norm": 1.4760527610778809, + "learning_rate": 1.8960464798423347e-05, + "loss": 2.1189, + "mean_token_accuracy": 0.5231729745864868, + "num_tokens": 1700993044.0, + "step": 3327 + }, + { + "epoch": 0.8999459167117361, + "grad_norm": 1.6452497243881226, + "learning_rate": 1.8959729169036985e-05, + "loss": 2.0495, + "mean_token_accuracy": 0.5463835000991821, + "num_tokens": 1701517277.0, + "step": 3328 + }, + { + "epoch": 0.9002163331530557, + "grad_norm": 1.425177812576294, + "learning_rate": 1.895899329542273e-05, + "loss": 2.0883, + "mean_token_accuracy": 0.5359330773353577, + "num_tokens": 1702041474.0, + "step": 3329 + }, + { + "epoch": 0.9004867495943754, + "grad_norm": 1.3279136419296265, + "learning_rate": 1.895825717760315e-05, + "loss": 2.0511, + "mean_token_accuracy": 0.545490026473999, + "num_tokens": 1702524933.0, + "step": 3330 + }, + { + "epoch": 0.900757166035695, + "grad_norm": 1.7444472312927246, + "learning_rate": 1.895752081560084e-05, + "loss": 2.1332, + "mean_token_accuracy": 0.536223828792572, + "num_tokens": 1703049172.0, + "step": 3331 + }, + { + "epoch": 0.9010275824770146, + "grad_norm": 1.4716740846633911, + "learning_rate": 1.8956784209438387e-05, + "loss": 2.082, + "mean_token_accuracy": 0.5193668007850647, + "num_tokens": 1703541320.0, + "step": 3332 + }, + { + "epoch": 0.9012979989183343, + "grad_norm": 1.752985954284668, + "learning_rate": 1.8956047359138393e-05, + "loss": 2.1062, + "mean_token_accuracy": 0.5261455774307251, + "num_tokens": 1704065576.0, + "step": 3333 + }, + { + "epoch": 0.9015684153596538, + "grad_norm": 1.5940099954605103, + "learning_rate": 1.8955310264723473e-05, + "loss": 2.172, + "mean_token_accuracy": 0.5223581790924072, + "num_tokens": 1704589718.0, + "step": 3334 + }, + { + "epoch": 0.9018388318009735, + "grad_norm": 1.3089282512664795, + "learning_rate": 1.8954572926216237e-05, + "loss": 2.0487, + "mean_token_accuracy": 0.535641610622406, + "num_tokens": 1705092184.0, + "step": 3335 + }, + { + "epoch": 0.9021092482422931, + "grad_norm": 1.5847517251968384, + "learning_rate": 1.895383534363931e-05, + "loss": 2.1303, + "mean_token_accuracy": 0.5193087458610535, + "num_tokens": 1705616326.0, + "step": 3336 + }, + { + "epoch": 0.9023796646836127, + "grad_norm": 1.6783937215805054, + "learning_rate": 1.8953097517015323e-05, + "loss": 2.0147, + "mean_token_accuracy": 0.5473117828369141, + "num_tokens": 1706094729.0, + "step": 3337 + }, + { + "epoch": 0.9026500811249324, + "grad_norm": 1.4922457933425903, + "learning_rate": 1.8952359446366918e-05, + "loss": 2.2949, + "mean_token_accuracy": 0.5442283153533936, + "num_tokens": 1706555535.0, + "step": 3338 + }, + { + "epoch": 0.902920497566252, + "grad_norm": 1.508952021598816, + "learning_rate": 1.895162113171674e-05, + "loss": 2.0593, + "mean_token_accuracy": 0.534980833530426, + "num_tokens": 1707079693.0, + "step": 3339 + }, + { + "epoch": 0.9031909140075717, + "grad_norm": 1.8157566785812378, + "learning_rate": 1.8950882573087438e-05, + "loss": 2.1724, + "mean_token_accuracy": 0.5354467630386353, + "num_tokens": 1707603798.0, + "step": 3340 + }, + { + "epoch": 0.9034613304488913, + "grad_norm": 0.8947188258171082, + "learning_rate": 1.895014377050168e-05, + "loss": 1.1822, + "mean_token_accuracy": 0.6965222954750061, + "num_tokens": 1708006629.0, + "step": 3341 + }, + { + "epoch": 0.903731746890211, + "grad_norm": 1.9166814088821411, + "learning_rate": 1.894940472398213e-05, + "loss": 2.1665, + "mean_token_accuracy": 0.527540385723114, + "num_tokens": 1708509688.0, + "step": 3342 + }, + { + "epoch": 0.9040021633315306, + "grad_norm": 1.7644819021224976, + "learning_rate": 1.8948665433551464e-05, + "loss": 2.0508, + "mean_token_accuracy": 0.5398432016372681, + "num_tokens": 1709033885.0, + "step": 3343 + }, + { + "epoch": 0.9042725797728501, + "grad_norm": 1.371738314628601, + "learning_rate": 1.8947925899232366e-05, + "loss": 2.1509, + "mean_token_accuracy": 0.5116658210754395, + "num_tokens": 1709557939.0, + "step": 3344 + }, + { + "epoch": 0.9045429962141698, + "grad_norm": 1.6475275754928589, + "learning_rate": 1.8947186121047533e-05, + "loss": 2.0673, + "mean_token_accuracy": 0.5446523427963257, + "num_tokens": 1710033656.0, + "step": 3345 + }, + { + "epoch": 0.9048134126554894, + "grad_norm": 2.0599899291992188, + "learning_rate": 1.8946446099019654e-05, + "loss": 2.1176, + "mean_token_accuracy": 0.541438102722168, + "num_tokens": 1710518283.0, + "step": 3346 + }, + { + "epoch": 0.9050838290968091, + "grad_norm": 1.590183973312378, + "learning_rate": 1.8945705833171444e-05, + "loss": 2.0223, + "mean_token_accuracy": 0.5465002059936523, + "num_tokens": 1711042519.0, + "step": 3347 + }, + { + "epoch": 0.9053542455381287, + "grad_norm": 1.6482256650924683, + "learning_rate": 1.894496532352561e-05, + "loss": 2.2324, + "mean_token_accuracy": 0.5080844163894653, + "num_tokens": 1711510610.0, + "step": 3348 + }, + { + "epoch": 0.9056246619794484, + "grad_norm": 1.6310299634933472, + "learning_rate": 1.8944224570104873e-05, + "loss": 2.0136, + "mean_token_accuracy": 0.5441582202911377, + "num_tokens": 1712034859.0, + "step": 3349 + }, + { + "epoch": 0.905895078420768, + "grad_norm": 1.2922719717025757, + "learning_rate": 1.8943483572931963e-05, + "loss": 2.0935, + "mean_token_accuracy": 0.5371295213699341, + "num_tokens": 1712559138.0, + "step": 3350 + }, + { + "epoch": 0.9061654948620876, + "grad_norm": 1.5614413022994995, + "learning_rate": 1.894274233202962e-05, + "loss": 2.1809, + "mean_token_accuracy": 0.5265361070632935, + "num_tokens": 1713083331.0, + "step": 3351 + }, + { + "epoch": 0.9064359113034073, + "grad_norm": 1.4291588068008423, + "learning_rate": 1.8942000847420586e-05, + "loss": 2.0858, + "mean_token_accuracy": 0.521235466003418, + "num_tokens": 1713607390.0, + "step": 3352 + }, + { + "epoch": 0.9067063277447269, + "grad_norm": 1.9643350839614868, + "learning_rate": 1.8941259119127607e-05, + "loss": 2.2322, + "mean_token_accuracy": 0.5183480978012085, + "num_tokens": 1714131542.0, + "step": 3353 + }, + { + "epoch": 0.9069767441860465, + "grad_norm": 1.3096733093261719, + "learning_rate": 1.8940517147173444e-05, + "loss": 2.1729, + "mean_token_accuracy": 0.5330121517181396, + "num_tokens": 1714655636.0, + "step": 3354 + }, + { + "epoch": 0.9072471606273661, + "grad_norm": 1.6096500158309937, + "learning_rate": 1.8939774931580866e-05, + "loss": 2.2426, + "mean_token_accuracy": 0.5131072998046875, + "num_tokens": 1715179916.0, + "step": 3355 + }, + { + "epoch": 0.9075175770686857, + "grad_norm": 1.8327081203460693, + "learning_rate": 1.8939032472372643e-05, + "loss": 2.2612, + "mean_token_accuracy": 0.5181915760040283, + "num_tokens": 1715704011.0, + "step": 3356 + }, + { + "epoch": 0.9077879935100054, + "grad_norm": 1.5710108280181885, + "learning_rate": 1.8938289769571555e-05, + "loss": 2.1713, + "mean_token_accuracy": 0.515715479850769, + "num_tokens": 1716228276.0, + "step": 3357 + }, + { + "epoch": 0.908058409951325, + "grad_norm": 1.6047710180282593, + "learning_rate": 1.8937546823200393e-05, + "loss": 2.1954, + "mean_token_accuracy": 0.5203109979629517, + "num_tokens": 1716752387.0, + "step": 3358 + }, + { + "epoch": 0.9083288263926447, + "grad_norm": 1.4279247522354126, + "learning_rate": 1.893680363328195e-05, + "loss": 2.1193, + "mean_token_accuracy": 0.5203720927238464, + "num_tokens": 1717276454.0, + "step": 3359 + }, + { + "epoch": 0.9085992428339643, + "grad_norm": 1.523319125175476, + "learning_rate": 1.8936060199839034e-05, + "loss": 2.1654, + "mean_token_accuracy": 0.5176452398300171, + "num_tokens": 1717800730.0, + "step": 3360 + }, + { + "epoch": 0.908869659275284, + "grad_norm": 0.6618362665176392, + "learning_rate": 1.893531652289445e-05, + "loss": 1.1584, + "mean_token_accuracy": 0.6764997839927673, + "num_tokens": 1718324830.0, + "step": 3361 + }, + { + "epoch": 0.9091400757166036, + "grad_norm": 3.1924002170562744, + "learning_rate": 1.893457260247102e-05, + "loss": 2.1892, + "mean_token_accuracy": 0.5357109308242798, + "num_tokens": 1718849071.0, + "step": 3362 + }, + { + "epoch": 0.9094104921579232, + "grad_norm": 2.844637632369995, + "learning_rate": 1.8933828438591568e-05, + "loss": 2.1616, + "mean_token_accuracy": 0.5258766412734985, + "num_tokens": 1719373356.0, + "step": 3363 + }, + { + "epoch": 0.9096809085992429, + "grad_norm": 1.4217349290847778, + "learning_rate": 1.893308403127893e-05, + "loss": 2.1223, + "mean_token_accuracy": 0.5135676860809326, + "num_tokens": 1719897615.0, + "step": 3364 + }, + { + "epoch": 0.9099513250405624, + "grad_norm": 2.334735155105591, + "learning_rate": 1.8932339380555943e-05, + "loss": 2.1025, + "mean_token_accuracy": 0.545377254486084, + "num_tokens": 1720393324.0, + "step": 3365 + }, + { + "epoch": 0.9102217414818821, + "grad_norm": 2.3067634105682373, + "learning_rate": 1.8931594486445452e-05, + "loss": 2.1527, + "mean_token_accuracy": 0.5433840751647949, + "num_tokens": 1720917522.0, + "step": 3366 + }, + { + "epoch": 0.9104921579232017, + "grad_norm": 1.8390483856201172, + "learning_rate": 1.8930849348970323e-05, + "loss": 2.1791, + "mean_token_accuracy": 0.5139201879501343, + "num_tokens": 1721406395.0, + "step": 3367 + }, + { + "epoch": 0.9107625743645213, + "grad_norm": 2.500349283218384, + "learning_rate": 1.893010396815341e-05, + "loss": 2.1324, + "mean_token_accuracy": 0.5399112701416016, + "num_tokens": 1721930572.0, + "step": 3368 + }, + { + "epoch": 0.911032990805841, + "grad_norm": 1.9789490699768066, + "learning_rate": 1.8929358344017587e-05, + "loss": 2.1095, + "mean_token_accuracy": 0.5478094816207886, + "num_tokens": 1722454851.0, + "step": 3369 + }, + { + "epoch": 0.9113034072471606, + "grad_norm": 1.405185341835022, + "learning_rate": 1.892861247658573e-05, + "loss": 1.8597, + "mean_token_accuracy": 0.5609136819839478, + "num_tokens": 1722979073.0, + "step": 3370 + }, + { + "epoch": 0.9115738236884803, + "grad_norm": 1.9301072359085083, + "learning_rate": 1.8927866365880726e-05, + "loss": 2.0409, + "mean_token_accuracy": 0.5445624589920044, + "num_tokens": 1723473409.0, + "step": 3371 + }, + { + "epoch": 0.9118442401297999, + "grad_norm": 2.1946918964385986, + "learning_rate": 1.8927120011925467e-05, + "loss": 2.093, + "mean_token_accuracy": 0.5310556292533875, + "num_tokens": 1723997685.0, + "step": 3372 + }, + { + "epoch": 0.9121146565711196, + "grad_norm": 1.939782738685608, + "learning_rate": 1.8926373414742857e-05, + "loss": 2.2415, + "mean_token_accuracy": 0.5103715658187866, + "num_tokens": 1724521924.0, + "step": 3373 + }, + { + "epoch": 0.9123850730124392, + "grad_norm": 1.8131040334701538, + "learning_rate": 1.8925626574355798e-05, + "loss": 2.1127, + "mean_token_accuracy": 0.5404344797134399, + "num_tokens": 1725046107.0, + "step": 3374 + }, + { + "epoch": 0.9126554894537587, + "grad_norm": 1.7443310022354126, + "learning_rate": 1.892487949078721e-05, + "loss": 2.0286, + "mean_token_accuracy": 0.5351306200027466, + "num_tokens": 1725570238.0, + "step": 3375 + }, + { + "epoch": 0.9129259058950784, + "grad_norm": 1.729333519935608, + "learning_rate": 1.892413216406001e-05, + "loss": 2.184, + "mean_token_accuracy": 0.5022600293159485, + "num_tokens": 1726094472.0, + "step": 3376 + }, + { + "epoch": 0.913196322336398, + "grad_norm": 1.5499756336212158, + "learning_rate": 1.8923384594197135e-05, + "loss": 2.1053, + "mean_token_accuracy": 0.5307824015617371, + "num_tokens": 1726602224.0, + "step": 3377 + }, + { + "epoch": 0.9134667387777177, + "grad_norm": 1.7895840406417847, + "learning_rate": 1.892263678122152e-05, + "loss": 2.1403, + "mean_token_accuracy": 0.5126461982727051, + "num_tokens": 1727126499.0, + "step": 3378 + }, + { + "epoch": 0.9137371552190373, + "grad_norm": 1.5767687559127808, + "learning_rate": 1.8921888725156107e-05, + "loss": 2.0696, + "mean_token_accuracy": 0.5316135883331299, + "num_tokens": 1727650622.0, + "step": 3379 + }, + { + "epoch": 0.914007571660357, + "grad_norm": 1.1928385496139526, + "learning_rate": 1.892114042602385e-05, + "loss": 2.1391, + "mean_token_accuracy": 0.5317100286483765, + "num_tokens": 1728169611.0, + "step": 3380 + }, + { + "epoch": 0.9142779881016766, + "grad_norm": 0.724144458770752, + "learning_rate": 1.8920391883847714e-05, + "loss": 1.0224, + "mean_token_accuracy": 0.7401614785194397, + "num_tokens": 1728635601.0, + "step": 3381 + }, + { + "epoch": 0.9145484045429962, + "grad_norm": 2.8393642902374268, + "learning_rate": 1.8919643098650663e-05, + "loss": 2.1652, + "mean_token_accuracy": 0.5276550054550171, + "num_tokens": 1729159747.0, + "step": 3382 + }, + { + "epoch": 0.9148188209843159, + "grad_norm": 2.679452657699585, + "learning_rate": 1.8918894070455668e-05, + "loss": 2.1737, + "mean_token_accuracy": 0.5351351499557495, + "num_tokens": 1729683938.0, + "step": 3383 + }, + { + "epoch": 0.9150892374256355, + "grad_norm": 1.6268693208694458, + "learning_rate": 1.891814479928572e-05, + "loss": 2.2013, + "mean_token_accuracy": 0.5298318862915039, + "num_tokens": 1730171904.0, + "step": 3384 + }, + { + "epoch": 0.9153596538669551, + "grad_norm": 2.516277551651001, + "learning_rate": 1.89173952851638e-05, + "loss": 2.1863, + "mean_token_accuracy": 0.5120482444763184, + "num_tokens": 1730696165.0, + "step": 3385 + }, + { + "epoch": 0.9156300703082747, + "grad_norm": 2.13698673248291, + "learning_rate": 1.8916645528112907e-05, + "loss": 2.1602, + "mean_token_accuracy": 0.5286815166473389, + "num_tokens": 1731220404.0, + "step": 3386 + }, + { + "epoch": 0.9159004867495943, + "grad_norm": 1.6095637083053589, + "learning_rate": 1.891589552815605e-05, + "loss": 2.0976, + "mean_token_accuracy": 0.5153366923332214, + "num_tokens": 1731744481.0, + "step": 3387 + }, + { + "epoch": 0.916170903190914, + "grad_norm": 3.1928584575653076, + "learning_rate": 1.8915145285316243e-05, + "loss": 1.9831, + "mean_token_accuracy": 0.5568597316741943, + "num_tokens": 1732240432.0, + "step": 3388 + }, + { + "epoch": 0.9164413196322336, + "grad_norm": 2.037386894226074, + "learning_rate": 1.8914394799616497e-05, + "loss": 2.136, + "mean_token_accuracy": 0.53738933801651, + "num_tokens": 1732714041.0, + "step": 3389 + }, + { + "epoch": 0.9167117360735533, + "grad_norm": 1.7403017282485962, + "learning_rate": 1.8913644071079846e-05, + "loss": 2.1129, + "mean_token_accuracy": 0.5162608623504639, + "num_tokens": 1733238285.0, + "step": 3390 + }, + { + "epoch": 0.9169821525148729, + "grad_norm": 1.4995367527008057, + "learning_rate": 1.8912893099729318e-05, + "loss": 2.0726, + "mean_token_accuracy": 0.5239932537078857, + "num_tokens": 1733762480.0, + "step": 3391 + }, + { + "epoch": 0.9172525689561926, + "grad_norm": 1.6200141906738281, + "learning_rate": 1.8912141885587963e-05, + "loss": 2.1468, + "mean_token_accuracy": 0.5373568534851074, + "num_tokens": 1734186945.0, + "step": 3392 + }, + { + "epoch": 0.9175229853975122, + "grad_norm": 1.8011219501495361, + "learning_rate": 1.8911390428678827e-05, + "loss": 2.2069, + "mean_token_accuracy": 0.5171757936477661, + "num_tokens": 1734711214.0, + "step": 3393 + }, + { + "epoch": 0.9177934018388318, + "grad_norm": 1.4921040534973145, + "learning_rate": 1.8910638729024966e-05, + "loss": 2.1106, + "mean_token_accuracy": 0.5308188199996948, + "num_tokens": 1735235367.0, + "step": 3394 + }, + { + "epoch": 0.9180638182801514, + "grad_norm": 1.8063517808914185, + "learning_rate": 1.890988678664945e-05, + "loss": 2.0813, + "mean_token_accuracy": 0.5448503494262695, + "num_tokens": 1735759461.0, + "step": 3395 + }, + { + "epoch": 0.918334234721471, + "grad_norm": 1.4657877683639526, + "learning_rate": 1.890913460157534e-05, + "loss": 2.1556, + "mean_token_accuracy": 0.5187714695930481, + "num_tokens": 1736283717.0, + "step": 3396 + }, + { + "epoch": 0.9186046511627907, + "grad_norm": 1.663254976272583, + "learning_rate": 1.890838217382572e-05, + "loss": 2.2842, + "mean_token_accuracy": 0.5216892957687378, + "num_tokens": 1736746511.0, + "step": 3397 + }, + { + "epoch": 0.9188750676041103, + "grad_norm": 1.3185032606124878, + "learning_rate": 1.8907629503423683e-05, + "loss": 2.0848, + "mean_token_accuracy": 0.5282914638519287, + "num_tokens": 1737270754.0, + "step": 3398 + }, + { + "epoch": 0.91914548404543, + "grad_norm": 1.3593159914016724, + "learning_rate": 1.8906876590392313e-05, + "loss": 2.1218, + "mean_token_accuracy": 0.5243632197380066, + "num_tokens": 1737794923.0, + "step": 3399 + }, + { + "epoch": 0.9194159004867496, + "grad_norm": 1.6991103887557983, + "learning_rate": 1.8906123434754722e-05, + "loss": 2.1418, + "mean_token_accuracy": 0.544174075126648, + "num_tokens": 1738319186.0, + "step": 3400 + }, + { + "epoch": 0.9196863169280692, + "grad_norm": 0.8596840500831604, + "learning_rate": 1.890537003653401e-05, + "loss": 1.2004, + "mean_token_accuracy": 0.6850268840789795, + "num_tokens": 1738818523.0, + "step": 3401 + }, + { + "epoch": 0.9199567333693889, + "grad_norm": 2.2199623584747314, + "learning_rate": 1.8904616395753297e-05, + "loss": 2.2236, + "mean_token_accuracy": 0.5024537444114685, + "num_tokens": 1739342605.0, + "step": 3402 + }, + { + "epoch": 0.9202271498107085, + "grad_norm": 1.8272221088409424, + "learning_rate": 1.8903862512435712e-05, + "loss": 2.299, + "mean_token_accuracy": 0.5338205099105835, + "num_tokens": 1739802349.0, + "step": 3403 + }, + { + "epoch": 0.9204975662520282, + "grad_norm": 1.1473197937011719, + "learning_rate": 1.8903108386604374e-05, + "loss": 2.1235, + "mean_token_accuracy": 0.5302413105964661, + "num_tokens": 1740326243.0, + "step": 3404 + }, + { + "epoch": 0.9207679826933478, + "grad_norm": 1.5020464658737183, + "learning_rate": 1.890235401828243e-05, + "loss": 1.9715, + "mean_token_accuracy": 0.5439541339874268, + "num_tokens": 1740850383.0, + "step": 3405 + }, + { + "epoch": 0.9210383991346673, + "grad_norm": 1.7585391998291016, + "learning_rate": 1.8901599407493032e-05, + "loss": 1.9956, + "mean_token_accuracy": 0.5490155220031738, + "num_tokens": 1741374648.0, + "step": 3406 + }, + { + "epoch": 0.921308815575987, + "grad_norm": 1.7217247486114502, + "learning_rate": 1.890084455425932e-05, + "loss": 2.0919, + "mean_token_accuracy": 0.5402685403823853, + "num_tokens": 1741898821.0, + "step": 3407 + }, + { + "epoch": 0.9215792320173066, + "grad_norm": 1.5066046714782715, + "learning_rate": 1.8900089458604467e-05, + "loss": 2.1157, + "mean_token_accuracy": 0.5364087820053101, + "num_tokens": 1742413109.0, + "step": 3408 + }, + { + "epoch": 0.9218496484586263, + "grad_norm": 1.6785815954208374, + "learning_rate": 1.8899334120551637e-05, + "loss": 2.0775, + "mean_token_accuracy": 0.5384359955787659, + "num_tokens": 1742935164.0, + "step": 3409 + }, + { + "epoch": 0.9221200648999459, + "grad_norm": 1.4996426105499268, + "learning_rate": 1.8898578540124e-05, + "loss": 2.1178, + "mean_token_accuracy": 0.5506617426872253, + "num_tokens": 1743459422.0, + "step": 3410 + }, + { + "epoch": 0.9223904813412656, + "grad_norm": 1.435150384902954, + "learning_rate": 1.889782271734475e-05, + "loss": 2.0482, + "mean_token_accuracy": 0.530677080154419, + "num_tokens": 1743983591.0, + "step": 3411 + }, + { + "epoch": 0.9226608977825852, + "grad_norm": 1.9247416257858276, + "learning_rate": 1.8897066652237073e-05, + "loss": 2.1913, + "mean_token_accuracy": 0.5211578607559204, + "num_tokens": 1744507812.0, + "step": 3412 + }, + { + "epoch": 0.9229313142239048, + "grad_norm": 1.5193532705307007, + "learning_rate": 1.8896310344824167e-05, + "loss": 2.0854, + "mean_token_accuracy": 0.5348819494247437, + "num_tokens": 1745031995.0, + "step": 3413 + }, + { + "epoch": 0.9232017306652245, + "grad_norm": 2.1351332664489746, + "learning_rate": 1.8895553795129235e-05, + "loss": 2.1694, + "mean_token_accuracy": 0.5118023157119751, + "num_tokens": 1745556028.0, + "step": 3414 + }, + { + "epoch": 0.9234721471065441, + "grad_norm": 2.2089226245880127, + "learning_rate": 1.88947970031755e-05, + "loss": 2.107, + "mean_token_accuracy": 0.5298211574554443, + "num_tokens": 1746080274.0, + "step": 3415 + }, + { + "epoch": 0.9237425635478637, + "grad_norm": 1.5656630992889404, + "learning_rate": 1.8894039968986168e-05, + "loss": 2.2294, + "mean_token_accuracy": 0.5035794973373413, + "num_tokens": 1746604551.0, + "step": 3416 + }, + { + "epoch": 0.9240129799891833, + "grad_norm": 1.8545840978622437, + "learning_rate": 1.8893282692584477e-05, + "loss": 2.1452, + "mean_token_accuracy": 0.521888017654419, + "num_tokens": 1747046520.0, + "step": 3417 + }, + { + "epoch": 0.924283396430503, + "grad_norm": 2.7070741653442383, + "learning_rate": 1.8892525173993663e-05, + "loss": 2.168, + "mean_token_accuracy": 0.536302387714386, + "num_tokens": 1747512962.0, + "step": 3418 + }, + { + "epoch": 0.9245538128718226, + "grad_norm": 1.6103988885879517, + "learning_rate": 1.8891767413236967e-05, + "loss": 2.1191, + "mean_token_accuracy": 0.5108230710029602, + "num_tokens": 1748037235.0, + "step": 3419 + }, + { + "epoch": 0.9248242293131422, + "grad_norm": 21.031476974487305, + "learning_rate": 1.8891009410337634e-05, + "loss": 1.9716, + "mean_token_accuracy": 0.5700078010559082, + "num_tokens": 1748561519.0, + "step": 3420 + }, + { + "epoch": 0.9250946457544619, + "grad_norm": 0.846709668636322, + "learning_rate": 1.889025116531893e-05, + "loss": 1.1542, + "mean_token_accuracy": 0.6802806258201599, + "num_tokens": 1749085715.0, + "step": 3421 + }, + { + "epoch": 0.9253650621957815, + "grad_norm": 4.268240928649902, + "learning_rate": 1.8889492678204116e-05, + "loss": 2.083, + "mean_token_accuracy": 0.5634564161300659, + "num_tokens": 1749603838.0, + "step": 3422 + }, + { + "epoch": 0.9256354786371012, + "grad_norm": 3.5562665462493896, + "learning_rate": 1.8888733949016464e-05, + "loss": 2.2112, + "mean_token_accuracy": 0.5222878456115723, + "num_tokens": 1750127989.0, + "step": 3423 + }, + { + "epoch": 0.9259058950784208, + "grad_norm": 1.895595908164978, + "learning_rate": 1.8887974977779257e-05, + "loss": 2.2008, + "mean_token_accuracy": 0.5022509098052979, + "num_tokens": 1750652100.0, + "step": 3424 + }, + { + "epoch": 0.9261763115197404, + "grad_norm": 2.5242507457733154, + "learning_rate": 1.888721576451578e-05, + "loss": 2.1425, + "mean_token_accuracy": 0.5148037672042847, + "num_tokens": 1751176247.0, + "step": 3425 + }, + { + "epoch": 0.92644672796106, + "grad_norm": 3.172987699508667, + "learning_rate": 1.888645630924933e-05, + "loss": 2.1982, + "mean_token_accuracy": 0.5205351114273071, + "num_tokens": 1751639870.0, + "step": 3426 + }, + { + "epoch": 0.9267171444023796, + "grad_norm": 2.5137033462524414, + "learning_rate": 1.8885696612003202e-05, + "loss": 2.1788, + "mean_token_accuracy": 0.5188839435577393, + "num_tokens": 1752164073.0, + "step": 3427 + }, + { + "epoch": 0.9269875608436993, + "grad_norm": 1.951432228088379, + "learning_rate": 1.8884936672800714e-05, + "loss": 1.8395, + "mean_token_accuracy": 0.5676600933074951, + "num_tokens": 1752688245.0, + "step": 3428 + }, + { + "epoch": 0.9272579772850189, + "grad_norm": 1.9772770404815674, + "learning_rate": 1.8884176491665186e-05, + "loss": 2.1399, + "mean_token_accuracy": 0.5310311317443848, + "num_tokens": 1753173039.0, + "step": 3429 + }, + { + "epoch": 0.9275283937263386, + "grad_norm": 2.352935791015625, + "learning_rate": 1.888341606861993e-05, + "loss": 2.0326, + "mean_token_accuracy": 0.5315271615982056, + "num_tokens": 1753697219.0, + "step": 3430 + }, + { + "epoch": 0.9277988101676582, + "grad_norm": 1.6037944555282593, + "learning_rate": 1.888265540368829e-05, + "loss": 1.8512, + "mean_token_accuracy": 0.5585599541664124, + "num_tokens": 1754190216.0, + "step": 3431 + }, + { + "epoch": 0.9280692266089778, + "grad_norm": 1.8046374320983887, + "learning_rate": 1.8881894496893603e-05, + "loss": 2.0899, + "mean_token_accuracy": 0.5365995168685913, + "num_tokens": 1754714313.0, + "step": 3432 + }, + { + "epoch": 0.9283396430502975, + "grad_norm": 1.711767554283142, + "learning_rate": 1.8881133348259207e-05, + "loss": 2.1594, + "mean_token_accuracy": 0.5164257287979126, + "num_tokens": 1755238591.0, + "step": 3433 + }, + { + "epoch": 0.9286100594916171, + "grad_norm": 1.3254492282867432, + "learning_rate": 1.888037195780847e-05, + "loss": 1.9978, + "mean_token_accuracy": 0.5578765869140625, + "num_tokens": 1755762826.0, + "step": 3434 + }, + { + "epoch": 0.9288804759329368, + "grad_norm": 1.5171992778778076, + "learning_rate": 1.8879610325564742e-05, + "loss": 2.0901, + "mean_token_accuracy": 0.533119261264801, + "num_tokens": 1756286966.0, + "step": 3435 + }, + { + "epoch": 0.9291508923742564, + "grad_norm": 1.416154146194458, + "learning_rate": 1.8878848451551398e-05, + "loss": 2.216, + "mean_token_accuracy": 0.5117480754852295, + "num_tokens": 1756811067.0, + "step": 3436 + }, + { + "epoch": 0.9294213088155759, + "grad_norm": 1.1469085216522217, + "learning_rate": 1.8878086335791812e-05, + "loss": 2.1515, + "mean_token_accuracy": 0.5208228230476379, + "num_tokens": 1757287015.0, + "step": 3437 + }, + { + "epoch": 0.9296917252568956, + "grad_norm": 1.4036297798156738, + "learning_rate": 1.8877323978309372e-05, + "loss": 2.2291, + "mean_token_accuracy": 0.5205721259117126, + "num_tokens": 1757717339.0, + "step": 3438 + }, + { + "epoch": 0.9299621416982152, + "grad_norm": 1.2479115724563599, + "learning_rate": 1.8876561379127466e-05, + "loss": 2.0954, + "mean_token_accuracy": 0.5277395248413086, + "num_tokens": 1758202328.0, + "step": 3439 + }, + { + "epoch": 0.9302325581395349, + "grad_norm": 1.2665055990219116, + "learning_rate": 1.8875798538269493e-05, + "loss": 2.1689, + "mean_token_accuracy": 0.5347498655319214, + "num_tokens": 1758716213.0, + "step": 3440 + }, + { + "epoch": 0.9305029745808545, + "grad_norm": 0.6462133526802063, + "learning_rate": 1.8875035455758858e-05, + "loss": 1.1505, + "mean_token_accuracy": 0.6922694444656372, + "num_tokens": 1759240451.0, + "step": 3441 + }, + { + "epoch": 0.9307733910221742, + "grad_norm": 2.0446982383728027, + "learning_rate": 1.887427213161898e-05, + "loss": 2.1483, + "mean_token_accuracy": 0.5261306166648865, + "num_tokens": 1759710047.0, + "step": 3442 + }, + { + "epoch": 0.9310438074634938, + "grad_norm": 1.5742188692092896, + "learning_rate": 1.8873508565873276e-05, + "loss": 2.1431, + "mean_token_accuracy": 0.5359541177749634, + "num_tokens": 1760234288.0, + "step": 3443 + }, + { + "epoch": 0.9313142239048134, + "grad_norm": 1.3822966814041138, + "learning_rate": 1.887274475854517e-05, + "loss": 2.1714, + "mean_token_accuracy": 0.5221141576766968, + "num_tokens": 1760758494.0, + "step": 3444 + }, + { + "epoch": 0.9315846403461331, + "grad_norm": 1.5755269527435303, + "learning_rate": 1.8871980709658105e-05, + "loss": 1.9888, + "mean_token_accuracy": 0.5515842437744141, + "num_tokens": 1761223123.0, + "step": 3445 + }, + { + "epoch": 0.9318550567874527, + "grad_norm": 1.6744005680084229, + "learning_rate": 1.8871216419235522e-05, + "loss": 2.0731, + "mean_token_accuracy": 0.5174629092216492, + "num_tokens": 1761747304.0, + "step": 3446 + }, + { + "epoch": 0.9321254732287723, + "grad_norm": 1.6476517915725708, + "learning_rate": 1.887045188730087e-05, + "loss": 2.2207, + "mean_token_accuracy": 0.5202873945236206, + "num_tokens": 1762271492.0, + "step": 3447 + }, + { + "epoch": 0.9323958896700919, + "grad_norm": 1.566838026046753, + "learning_rate": 1.886968711387761e-05, + "loss": 2.1449, + "mean_token_accuracy": 0.532696545124054, + "num_tokens": 1762795729.0, + "step": 3448 + }, + { + "epoch": 0.9326663061114115, + "grad_norm": 1.4401029348373413, + "learning_rate": 1.886892209898921e-05, + "loss": 2.0441, + "mean_token_accuracy": 0.5466119050979614, + "num_tokens": 1763319962.0, + "step": 3449 + }, + { + "epoch": 0.9329367225527312, + "grad_norm": 1.6266086101531982, + "learning_rate": 1.886815684265913e-05, + "loss": 2.165, + "mean_token_accuracy": 0.5396434664726257, + "num_tokens": 1763789564.0, + "step": 3450 + }, + { + "epoch": 0.9332071389940508, + "grad_norm": 1.5701972246170044, + "learning_rate": 1.8867391344910868e-05, + "loss": 2.2758, + "mean_token_accuracy": 0.5368064641952515, + "num_tokens": 1764313728.0, + "step": 3451 + }, + { + "epoch": 0.9334775554353705, + "grad_norm": 1.2079118490219116, + "learning_rate": 1.88666256057679e-05, + "loss": 2.1381, + "mean_token_accuracy": 0.5237846374511719, + "num_tokens": 1764816608.0, + "step": 3452 + }, + { + "epoch": 0.9337479718766901, + "grad_norm": 1.2690008878707886, + "learning_rate": 1.8865859625253724e-05, + "loss": 2.1448, + "mean_token_accuracy": 0.5070186257362366, + "num_tokens": 1765340695.0, + "step": 3453 + }, + { + "epoch": 0.9340183883180098, + "grad_norm": 1.4271219968795776, + "learning_rate": 1.8865093403391842e-05, + "loss": 2.1529, + "mean_token_accuracy": 0.546830952167511, + "num_tokens": 1765864858.0, + "step": 3454 + }, + { + "epoch": 0.9342888047593294, + "grad_norm": 1.3138779401779175, + "learning_rate": 1.8864326940205764e-05, + "loss": 2.0875, + "mean_token_accuracy": 0.5465079545974731, + "num_tokens": 1766387853.0, + "step": 3455 + }, + { + "epoch": 0.934559221200649, + "grad_norm": 1.5221550464630127, + "learning_rate": 1.886356023571901e-05, + "loss": 2.1808, + "mean_token_accuracy": 0.5183927416801453, + "num_tokens": 1766891906.0, + "step": 3456 + }, + { + "epoch": 0.9348296376419686, + "grad_norm": 1.5027828216552734, + "learning_rate": 1.88627932899551e-05, + "loss": 2.1259, + "mean_token_accuracy": 0.5337295532226562, + "num_tokens": 1767393127.0, + "step": 3457 + }, + { + "epoch": 0.9351000540832882, + "grad_norm": 1.4635425806045532, + "learning_rate": 1.886202610293757e-05, + "loss": 2.2048, + "mean_token_accuracy": 0.5138725638389587, + "num_tokens": 1767917323.0, + "step": 3458 + }, + { + "epoch": 0.9353704705246079, + "grad_norm": 1.5680407285690308, + "learning_rate": 1.8861258674689963e-05, + "loss": 2.0349, + "mean_token_accuracy": 0.5446391105651855, + "num_tokens": 1768404863.0, + "step": 3459 + }, + { + "epoch": 0.9356408869659275, + "grad_norm": 1.1898759603500366, + "learning_rate": 1.8860491005235818e-05, + "loss": 1.9815, + "mean_token_accuracy": 0.5501660108566284, + "num_tokens": 1768893762.0, + "step": 3460 + }, + { + "epoch": 0.9359113034072472, + "grad_norm": 0.8080999851226807, + "learning_rate": 1.885972309459869e-05, + "loss": 1.1789, + "mean_token_accuracy": 0.6854989528656006, + "num_tokens": 1769417889.0, + "step": 3461 + }, + { + "epoch": 0.9361817198485668, + "grad_norm": 2.748542308807373, + "learning_rate": 1.885895494280215e-05, + "loss": 1.984, + "mean_token_accuracy": 0.5782679915428162, + "num_tokens": 1769931388.0, + "step": 3462 + }, + { + "epoch": 0.9364521362898864, + "grad_norm": 2.243429183959961, + "learning_rate": 1.8858186549869754e-05, + "loss": 2.0764, + "mean_token_accuracy": 0.5453599691390991, + "num_tokens": 1770455639.0, + "step": 3463 + }, + { + "epoch": 0.9367225527312061, + "grad_norm": 1.5615872144699097, + "learning_rate": 1.8857417915825087e-05, + "loss": 2.1401, + "mean_token_accuracy": 0.5459312796592712, + "num_tokens": 1770920626.0, + "step": 3464 + }, + { + "epoch": 0.9369929691725257, + "grad_norm": 1.9058620929718018, + "learning_rate": 1.8856649040691733e-05, + "loss": 2.1096, + "mean_token_accuracy": 0.5245440006256104, + "num_tokens": 1771444891.0, + "step": 3465 + }, + { + "epoch": 0.9372633856138454, + "grad_norm": 1.860337495803833, + "learning_rate": 1.885587992449328e-05, + "loss": 2.2476, + "mean_token_accuracy": 0.5158230066299438, + "num_tokens": 1771969138.0, + "step": 3466 + }, + { + "epoch": 0.9375338020551649, + "grad_norm": 2.055481195449829, + "learning_rate": 1.885511056725333e-05, + "loss": 2.1436, + "mean_token_accuracy": 0.5228817462921143, + "num_tokens": 1772493293.0, + "step": 3467 + }, + { + "epoch": 0.9378042184964845, + "grad_norm": 1.9298299551010132, + "learning_rate": 1.8854340968995486e-05, + "loss": 2.1188, + "mean_token_accuracy": 0.5495762825012207, + "num_tokens": 1773017527.0, + "step": 3468 + }, + { + "epoch": 0.9380746349378042, + "grad_norm": 1.9676682949066162, + "learning_rate": 1.8853571129743363e-05, + "loss": 2.1986, + "mean_token_accuracy": 0.5211607217788696, + "num_tokens": 1773496308.0, + "step": 3469 + }, + { + "epoch": 0.9383450513791238, + "grad_norm": 1.7396278381347656, + "learning_rate": 1.885280104952058e-05, + "loss": 2.0823, + "mean_token_accuracy": 0.5397167205810547, + "num_tokens": 1774020561.0, + "step": 3470 + }, + { + "epoch": 0.9386154678204435, + "grad_norm": 1.6695048809051514, + "learning_rate": 1.885203072835077e-05, + "loss": 2.2155, + "mean_token_accuracy": 0.5178723931312561, + "num_tokens": 1774544530.0, + "step": 3471 + }, + { + "epoch": 0.9388858842617631, + "grad_norm": 1.5131149291992188, + "learning_rate": 1.885126016625757e-05, + "loss": 2.1925, + "mean_token_accuracy": 0.5212008953094482, + "num_tokens": 1775068761.0, + "step": 3472 + }, + { + "epoch": 0.9391563007030828, + "grad_norm": 1.4210585355758667, + "learning_rate": 1.885048936326461e-05, + "loss": 2.0855, + "mean_token_accuracy": 0.5297439098358154, + "num_tokens": 1775592989.0, + "step": 3473 + }, + { + "epoch": 0.9394267171444024, + "grad_norm": 1.2903090715408325, + "learning_rate": 1.8849718319395554e-05, + "loss": 2.0249, + "mean_token_accuracy": 0.5323798656463623, + "num_tokens": 1776117234.0, + "step": 3474 + }, + { + "epoch": 0.939697133585722, + "grad_norm": 1.4229016304016113, + "learning_rate": 1.884894703467406e-05, + "loss": 2.1945, + "mean_token_accuracy": 0.5264455080032349, + "num_tokens": 1776641440.0, + "step": 3475 + }, + { + "epoch": 0.9399675500270417, + "grad_norm": 1.3621355295181274, + "learning_rate": 1.884817550912378e-05, + "loss": 2.132, + "mean_token_accuracy": 0.5378479957580566, + "num_tokens": 1777072917.0, + "step": 3476 + }, + { + "epoch": 0.9402379664683613, + "grad_norm": 1.4099740982055664, + "learning_rate": 1.88474037427684e-05, + "loss": 2.1266, + "mean_token_accuracy": 0.5272213220596313, + "num_tokens": 1777597186.0, + "step": 3477 + }, + { + "epoch": 0.9405083829096809, + "grad_norm": 1.4357932806015015, + "learning_rate": 1.8846631735631592e-05, + "loss": 2.1343, + "mean_token_accuracy": 0.5305334329605103, + "num_tokens": 1778105395.0, + "step": 3478 + }, + { + "epoch": 0.9407787993510005, + "grad_norm": 1.308824896812439, + "learning_rate": 1.884585948773705e-05, + "loss": 2.1519, + "mean_token_accuracy": 0.532663106918335, + "num_tokens": 1778629587.0, + "step": 3479 + }, + { + "epoch": 0.9410492157923201, + "grad_norm": 1.4618563652038574, + "learning_rate": 1.884508699910847e-05, + "loss": 2.1487, + "mean_token_accuracy": 0.5081139802932739, + "num_tokens": 1779153805.0, + "step": 3480 + }, + { + "epoch": 0.9413196322336398, + "grad_norm": 0.6304420232772827, + "learning_rate": 1.884431426976954e-05, + "loss": 1.1953, + "mean_token_accuracy": 0.694123387336731, + "num_tokens": 1779678033.0, + "step": 3481 + }, + { + "epoch": 0.9415900486749594, + "grad_norm": 2.0328876972198486, + "learning_rate": 1.884354129974399e-05, + "loss": 2.2274, + "mean_token_accuracy": 0.5119131803512573, + "num_tokens": 1780154827.0, + "step": 3482 + }, + { + "epoch": 0.9418604651162791, + "grad_norm": 1.525461196899414, + "learning_rate": 1.8842768089055517e-05, + "loss": 2.2054, + "mean_token_accuracy": 0.5313040018081665, + "num_tokens": 1780679110.0, + "step": 3483 + }, + { + "epoch": 0.9421308815575987, + "grad_norm": 1.4721590280532837, + "learning_rate": 1.8841994637727864e-05, + "loss": 2.1948, + "mean_token_accuracy": 0.5109500288963318, + "num_tokens": 1781203317.0, + "step": 3484 + }, + { + "epoch": 0.9424012979989184, + "grad_norm": 1.5208481550216675, + "learning_rate": 1.8841220945784747e-05, + "loss": 2.0516, + "mean_token_accuracy": 0.5488273501396179, + "num_tokens": 1781702304.0, + "step": 3485 + }, + { + "epoch": 0.942671714440238, + "grad_norm": 1.6272976398468018, + "learning_rate": 1.8840447013249915e-05, + "loss": 1.9465, + "mean_token_accuracy": 0.555056095123291, + "num_tokens": 1782226370.0, + "step": 3486 + }, + { + "epoch": 0.9429421308815576, + "grad_norm": 2.0682380199432373, + "learning_rate": 1.8839672840147113e-05, + "loss": 2.1595, + "mean_token_accuracy": 0.5052184462547302, + "num_tokens": 1782750600.0, + "step": 3487 + }, + { + "epoch": 0.9432125473228772, + "grad_norm": 2.110133647918701, + "learning_rate": 1.883889842650009e-05, + "loss": 2.0817, + "mean_token_accuracy": 0.5348770618438721, + "num_tokens": 1783274703.0, + "step": 3488 + }, + { + "epoch": 0.9434829637641968, + "grad_norm": 1.3765039443969727, + "learning_rate": 1.883812377233261e-05, + "loss": 2.0293, + "mean_token_accuracy": 0.5419301390647888, + "num_tokens": 1783798854.0, + "step": 3489 + }, + { + "epoch": 0.9437533802055165, + "grad_norm": 2.038572311401367, + "learning_rate": 1.8837348877668443e-05, + "loss": 2.2823, + "mean_token_accuracy": 0.5009360313415527, + "num_tokens": 1784323125.0, + "step": 3490 + }, + { + "epoch": 0.9440237966468361, + "grad_norm": 1.8283612728118896, + "learning_rate": 1.8836573742531367e-05, + "loss": 1.9799, + "mean_token_accuracy": 0.5319276452064514, + "num_tokens": 1784847195.0, + "step": 3491 + }, + { + "epoch": 0.9442942130881558, + "grad_norm": 1.5663224458694458, + "learning_rate": 1.883579836694516e-05, + "loss": 2.1012, + "mean_token_accuracy": 0.5720857381820679, + "num_tokens": 1785273674.0, + "step": 3492 + }, + { + "epoch": 0.9445646295294754, + "grad_norm": 1.7128385305404663, + "learning_rate": 1.8835022750933616e-05, + "loss": 2.1491, + "mean_token_accuracy": 0.5285917520523071, + "num_tokens": 1785797939.0, + "step": 3493 + }, + { + "epoch": 0.944835045970795, + "grad_norm": 1.6198478937149048, + "learning_rate": 1.8834246894520534e-05, + "loss": 2.0116, + "mean_token_accuracy": 0.5357540249824524, + "num_tokens": 1786322022.0, + "step": 3494 + }, + { + "epoch": 0.9451054624121147, + "grad_norm": 1.6196688413619995, + "learning_rate": 1.8833470797729717e-05, + "loss": 2.1468, + "mean_token_accuracy": 0.5289987325668335, + "num_tokens": 1786809571.0, + "step": 3495 + }, + { + "epoch": 0.9453758788534343, + "grad_norm": 1.4343976974487305, + "learning_rate": 1.8832694460584977e-05, + "loss": 2.0587, + "mean_token_accuracy": 0.5471656322479248, + "num_tokens": 1787333850.0, + "step": 3496 + }, + { + "epoch": 0.945646295294754, + "grad_norm": 1.7938255071640015, + "learning_rate": 1.883191788311014e-05, + "loss": 2.062, + "mean_token_accuracy": 0.5322791337966919, + "num_tokens": 1787858126.0, + "step": 3497 + }, + { + "epoch": 0.9459167117360735, + "grad_norm": 1.3963462114334106, + "learning_rate": 1.883114106532903e-05, + "loss": 2.0397, + "mean_token_accuracy": 0.5547926425933838, + "num_tokens": 1788382294.0, + "step": 3498 + }, + { + "epoch": 0.9461871281773931, + "grad_norm": 1.188460350036621, + "learning_rate": 1.883036400726548e-05, + "loss": 2.1125, + "mean_token_accuracy": 0.5245319604873657, + "num_tokens": 1788906475.0, + "step": 3499 + }, + { + "epoch": 0.9464575446187128, + "grad_norm": 1.3493540287017822, + "learning_rate": 1.8829586708943336e-05, + "loss": 1.9726, + "mean_token_accuracy": 0.5361555814743042, + "num_tokens": 1789422679.0, + "step": 3500 + }, + { + "epoch": 0.9467279610600324, + "grad_norm": 0.6215768456459045, + "learning_rate": 1.8828809170386447e-05, + "loss": 1.2322, + "mean_token_accuracy": 0.6719073057174683, + "num_tokens": 1789946791.0, + "step": 3501 + }, + { + "epoch": 0.9469983775013521, + "grad_norm": 2.2979516983032227, + "learning_rate": 1.882803139161867e-05, + "loss": 2.1096, + "mean_token_accuracy": 0.5230611562728882, + "num_tokens": 1790443399.0, + "step": 3502 + }, + { + "epoch": 0.9472687939426717, + "grad_norm": 1.7982535362243652, + "learning_rate": 1.8827253372663868e-05, + "loss": 2.1089, + "mean_token_accuracy": 0.5310706496238708, + "num_tokens": 1790949170.0, + "step": 3503 + }, + { + "epoch": 0.9475392103839914, + "grad_norm": 1.7006183862686157, + "learning_rate": 1.8826475113545913e-05, + "loss": 2.2105, + "mean_token_accuracy": 0.5185816884040833, + "num_tokens": 1791473271.0, + "step": 3504 + }, + { + "epoch": 0.947809626825311, + "grad_norm": 2.151195526123047, + "learning_rate": 1.8825696614288687e-05, + "loss": 2.0657, + "mean_token_accuracy": 0.5438166856765747, + "num_tokens": 1791997400.0, + "step": 3505 + }, + { + "epoch": 0.9480800432666306, + "grad_norm": 1.5608513355255127, + "learning_rate": 1.8824917874916076e-05, + "loss": 2.0765, + "mean_token_accuracy": 0.5511687397956848, + "num_tokens": 1792521604.0, + "step": 3506 + }, + { + "epoch": 0.9483504597079503, + "grad_norm": 1.5432674884796143, + "learning_rate": 1.8824138895451972e-05, + "loss": 2.0965, + "mean_token_accuracy": 0.5560111999511719, + "num_tokens": 1792995898.0, + "step": 3507 + }, + { + "epoch": 0.9486208761492698, + "grad_norm": 1.4128756523132324, + "learning_rate": 1.8823359675920275e-05, + "loss": 2.133, + "mean_token_accuracy": 0.5119946002960205, + "num_tokens": 1793520055.0, + "step": 3508 + }, + { + "epoch": 0.9488912925905895, + "grad_norm": 17.690885543823242, + "learning_rate": 1.88225802163449e-05, + "loss": 2.0514, + "mean_token_accuracy": 0.5539701581001282, + "num_tokens": 1794008315.0, + "step": 3509 + }, + { + "epoch": 0.9491617090319091, + "grad_norm": 3.1865851879119873, + "learning_rate": 1.8821800516749756e-05, + "loss": 2.1217, + "mean_token_accuracy": 0.5288760662078857, + "num_tokens": 1794532582.0, + "step": 3510 + }, + { + "epoch": 0.9494321254732287, + "grad_norm": 2.0766355991363525, + "learning_rate": 1.8821020577158773e-05, + "loss": 2.1701, + "mean_token_accuracy": 0.5154522657394409, + "num_tokens": 1794995539.0, + "step": 3511 + }, + { + "epoch": 0.9497025419145484, + "grad_norm": 2.3360111713409424, + "learning_rate": 1.8820240397595876e-05, + "loss": 1.8321, + "mean_token_accuracy": 0.5786721706390381, + "num_tokens": 1795519736.0, + "step": 3512 + }, + { + "epoch": 0.949972958355868, + "grad_norm": 2.8869879245758057, + "learning_rate": 1.8819459978085007e-05, + "loss": 2.0704, + "mean_token_accuracy": 0.5569394826889038, + "num_tokens": 1795981428.0, + "step": 3513 + }, + { + "epoch": 0.9502433747971877, + "grad_norm": 2.4745280742645264, + "learning_rate": 1.8818679318650108e-05, + "loss": 2.2145, + "mean_token_accuracy": 0.5307705402374268, + "num_tokens": 1796407699.0, + "step": 3514 + }, + { + "epoch": 0.9505137912385073, + "grad_norm": 1.638360619544983, + "learning_rate": 1.8817898419315135e-05, + "loss": 2.0719, + "mean_token_accuracy": 0.527336597442627, + "num_tokens": 1796931879.0, + "step": 3515 + }, + { + "epoch": 0.950784207679827, + "grad_norm": 2.283379077911377, + "learning_rate": 1.8817117280104048e-05, + "loss": 2.1527, + "mean_token_accuracy": 0.5405756235122681, + "num_tokens": 1797392470.0, + "step": 3516 + }, + { + "epoch": 0.9510546241211466, + "grad_norm": 1.8440163135528564, + "learning_rate": 1.8816335901040815e-05, + "loss": 2.3019, + "mean_token_accuracy": 0.505606472492218, + "num_tokens": 1797869297.0, + "step": 3517 + }, + { + "epoch": 0.9513250405624663, + "grad_norm": 2.0861380100250244, + "learning_rate": 1.8815554282149413e-05, + "loss": 2.279, + "mean_token_accuracy": 0.5061063170433044, + "num_tokens": 1798393539.0, + "step": 3518 + }, + { + "epoch": 0.9515954570037858, + "grad_norm": 1.944105863571167, + "learning_rate": 1.8814772423453815e-05, + "loss": 2.1142, + "mean_token_accuracy": 0.5308791399002075, + "num_tokens": 1798870733.0, + "step": 3519 + }, + { + "epoch": 0.9518658734451054, + "grad_norm": 1.2898863554000854, + "learning_rate": 1.8813990324978023e-05, + "loss": 1.9777, + "mean_token_accuracy": 0.5486266613006592, + "num_tokens": 1799395018.0, + "step": 3520 + }, + { + "epoch": 0.9521362898864251, + "grad_norm": 0.8849107027053833, + "learning_rate": 1.8813207986746028e-05, + "loss": 1.1459, + "mean_token_accuracy": 0.7065853476524353, + "num_tokens": 1799919190.0, + "step": 3521 + }, + { + "epoch": 0.9524067063277447, + "grad_norm": 3.393096685409546, + "learning_rate": 1.8812425408781832e-05, + "loss": 2.018, + "mean_token_accuracy": 0.5396087169647217, + "num_tokens": 1800443472.0, + "step": 3522 + }, + { + "epoch": 0.9526771227690644, + "grad_norm": 3.3642258644104004, + "learning_rate": 1.881164259110945e-05, + "loss": 2.1045, + "mean_token_accuracy": 0.5236406326293945, + "num_tokens": 1800967697.0, + "step": 3523 + }, + { + "epoch": 0.952947539210384, + "grad_norm": 2.041534423828125, + "learning_rate": 1.88108595337529e-05, + "loss": 2.0841, + "mean_token_accuracy": 0.5615192651748657, + "num_tokens": 1801491769.0, + "step": 3524 + }, + { + "epoch": 0.9532179556517036, + "grad_norm": 2.5576395988464355, + "learning_rate": 1.881007623673621e-05, + "loss": 2.0436, + "mean_token_accuracy": 0.5253344774246216, + "num_tokens": 1802015966.0, + "step": 3525 + }, + { + "epoch": 0.9534883720930233, + "grad_norm": 3.254812479019165, + "learning_rate": 1.8809292700083418e-05, + "loss": 2.0535, + "mean_token_accuracy": 0.5480344891548157, + "num_tokens": 1802518203.0, + "step": 3526 + }, + { + "epoch": 0.9537587885343429, + "grad_norm": 2.4480435848236084, + "learning_rate": 1.8808508923818557e-05, + "loss": 2.0673, + "mean_token_accuracy": 0.5355861186981201, + "num_tokens": 1803042465.0, + "step": 3527 + }, + { + "epoch": 0.9540292049756626, + "grad_norm": 1.657179594039917, + "learning_rate": 1.8807724907965677e-05, + "loss": 2.0858, + "mean_token_accuracy": 0.5224875211715698, + "num_tokens": 1803566695.0, + "step": 3528 + }, + { + "epoch": 0.9542996214169821, + "grad_norm": 2.709923267364502, + "learning_rate": 1.8806940652548837e-05, + "loss": 2.1329, + "mean_token_accuracy": 0.5396084785461426, + "num_tokens": 1804090763.0, + "step": 3529 + }, + { + "epoch": 0.9545700378583017, + "grad_norm": 2.3835084438323975, + "learning_rate": 1.88061561575921e-05, + "loss": 2.0093, + "mean_token_accuracy": 0.5441617965698242, + "num_tokens": 1804615012.0, + "step": 3530 + }, + { + "epoch": 0.9548404542996214, + "grad_norm": 1.4534869194030762, + "learning_rate": 1.8805371423119536e-05, + "loss": 1.9895, + "mean_token_accuracy": 0.5329951643943787, + "num_tokens": 1805139229.0, + "step": 3531 + }, + { + "epoch": 0.955110870740941, + "grad_norm": 2.1351845264434814, + "learning_rate": 1.8804586449155217e-05, + "loss": 2.1615, + "mean_token_accuracy": 0.5113214254379272, + "num_tokens": 1805663501.0, + "step": 3532 + }, + { + "epoch": 0.9553812871822607, + "grad_norm": 1.9051839113235474, + "learning_rate": 1.880380123572324e-05, + "loss": 1.6941, + "mean_token_accuracy": 0.595771312713623, + "num_tokens": 1806187781.0, + "step": 3533 + }, + { + "epoch": 0.9556517036235803, + "grad_norm": 1.430993676185608, + "learning_rate": 1.880301578284769e-05, + "loss": 2.0947, + "mean_token_accuracy": 0.5449954271316528, + "num_tokens": 1806704534.0, + "step": 3534 + }, + { + "epoch": 0.9559221200649, + "grad_norm": 2.1323177814483643, + "learning_rate": 1.8802230090552667e-05, + "loss": 2.1662, + "mean_token_accuracy": 0.5338917970657349, + "num_tokens": 1807228768.0, + "step": 3535 + }, + { + "epoch": 0.9561925365062196, + "grad_norm": 2.0897459983825684, + "learning_rate": 1.8801444158862282e-05, + "loss": 2.2509, + "mean_token_accuracy": 0.5081745386123657, + "num_tokens": 1807718315.0, + "step": 3536 + }, + { + "epoch": 0.9564629529475392, + "grad_norm": 1.1640704870224, + "learning_rate": 1.8800657987800648e-05, + "loss": 2.0303, + "mean_token_accuracy": 0.5367549657821655, + "num_tokens": 1808188294.0, + "step": 3537 + }, + { + "epoch": 0.9567333693888589, + "grad_norm": 1.5505200624465942, + "learning_rate": 1.8799871577391887e-05, + "loss": 2.2605, + "mean_token_accuracy": 0.5096353888511658, + "num_tokens": 1808656973.0, + "step": 3538 + }, + { + "epoch": 0.9570037858301784, + "grad_norm": 1.4638527631759644, + "learning_rate": 1.8799084927660123e-05, + "loss": 2.1443, + "mean_token_accuracy": 0.5138593912124634, + "num_tokens": 1809181171.0, + "step": 3539 + }, + { + "epoch": 0.9572742022714981, + "grad_norm": 1.4787416458129883, + "learning_rate": 1.8798298038629507e-05, + "loss": 2.2056, + "mean_token_accuracy": 0.5223342180252075, + "num_tokens": 1809705440.0, + "step": 3540 + }, + { + "epoch": 0.9575446187128177, + "grad_norm": 0.9793581962585449, + "learning_rate": 1.8797510910324165e-05, + "loss": 1.209, + "mean_token_accuracy": 0.6783362030982971, + "num_tokens": 1810229667.0, + "step": 3541 + }, + { + "epoch": 0.9578150351541374, + "grad_norm": 2.9576520919799805, + "learning_rate": 1.879672354276826e-05, + "loss": 2.1524, + "mean_token_accuracy": 0.5277377367019653, + "num_tokens": 1810753879.0, + "step": 3542 + }, + { + "epoch": 0.958085451595457, + "grad_norm": 2.2816193103790283, + "learning_rate": 1.879593593598595e-05, + "loss": 2.0039, + "mean_token_accuracy": 0.5416915416717529, + "num_tokens": 1811278104.0, + "step": 3543 + }, + { + "epoch": 0.9583558680367766, + "grad_norm": 1.4756754636764526, + "learning_rate": 1.8795148090001398e-05, + "loss": 2.1258, + "mean_token_accuracy": 0.5249212384223938, + "num_tokens": 1811802338.0, + "step": 3544 + }, + { + "epoch": 0.9586262844780963, + "grad_norm": 2.3025405406951904, + "learning_rate": 1.879436000483878e-05, + "loss": 2.1488, + "mean_token_accuracy": 0.5403984785079956, + "num_tokens": 1812326624.0, + "step": 3545 + }, + { + "epoch": 0.9588967009194159, + "grad_norm": 2.3206048011779785, + "learning_rate": 1.8793571680522272e-05, + "loss": 2.1549, + "mean_token_accuracy": 0.5429762601852417, + "num_tokens": 1812822234.0, + "step": 3546 + }, + { + "epoch": 0.9591671173607356, + "grad_norm": 1.8176835775375366, + "learning_rate": 1.8792783117076068e-05, + "loss": 2.0953, + "mean_token_accuracy": 0.5272072553634644, + "num_tokens": 1813346433.0, + "step": 3547 + }, + { + "epoch": 0.9594375338020552, + "grad_norm": 2.1207473278045654, + "learning_rate": 1.879199431452436e-05, + "loss": 2.0382, + "mean_token_accuracy": 0.5660465955734253, + "num_tokens": 1813870614.0, + "step": 3548 + }, + { + "epoch": 0.9597079502433749, + "grad_norm": 1.8018465042114258, + "learning_rate": 1.879120527289135e-05, + "loss": 2.0642, + "mean_token_accuracy": 0.538935661315918, + "num_tokens": 1814394825.0, + "step": 3549 + }, + { + "epoch": 0.9599783666846944, + "grad_norm": 1.7482706308364868, + "learning_rate": 1.8790415992201255e-05, + "loss": 2.1087, + "mean_token_accuracy": 0.519233763217926, + "num_tokens": 1814918936.0, + "step": 3550 + }, + { + "epoch": 0.960248783126014, + "grad_norm": 1.9669231176376343, + "learning_rate": 1.8789626472478283e-05, + "loss": 2.0846, + "mean_token_accuracy": 0.5481295585632324, + "num_tokens": 1815412870.0, + "step": 3551 + }, + { + "epoch": 0.9605191995673337, + "grad_norm": 1.827328085899353, + "learning_rate": 1.8788836713746662e-05, + "loss": 2.1275, + "mean_token_accuracy": 0.5248656272888184, + "num_tokens": 1815920950.0, + "step": 3552 + }, + { + "epoch": 0.9607896160086533, + "grad_norm": 1.7289353609085083, + "learning_rate": 1.8788046716030628e-05, + "loss": 2.1036, + "mean_token_accuracy": 0.536017894744873, + "num_tokens": 1816445172.0, + "step": 3553 + }, + { + "epoch": 0.961060032449973, + "grad_norm": 1.8909913301467896, + "learning_rate": 1.8787256479354415e-05, + "loss": 2.1409, + "mean_token_accuracy": 0.5214934945106506, + "num_tokens": 1816969308.0, + "step": 3554 + }, + { + "epoch": 0.9613304488912926, + "grad_norm": 1.689712643623352, + "learning_rate": 1.8786466003742277e-05, + "loss": 2.15, + "mean_token_accuracy": 0.537784218788147, + "num_tokens": 1817493584.0, + "step": 3555 + }, + { + "epoch": 0.9616008653326122, + "grad_norm": 1.726517677307129, + "learning_rate": 1.8785675289218458e-05, + "loss": 2.1158, + "mean_token_accuracy": 0.5191826224327087, + "num_tokens": 1817980137.0, + "step": 3556 + }, + { + "epoch": 0.9618712817739319, + "grad_norm": 1.7454718351364136, + "learning_rate": 1.8784884335807228e-05, + "loss": 2.0779, + "mean_token_accuracy": 0.5358519554138184, + "num_tokens": 1818504217.0, + "step": 3557 + }, + { + "epoch": 0.9621416982152515, + "grad_norm": 1.7137317657470703, + "learning_rate": 1.878409314353285e-05, + "loss": 2.0272, + "mean_token_accuracy": 0.5469305515289307, + "num_tokens": 1819028475.0, + "step": 3558 + }, + { + "epoch": 0.9624121146565712, + "grad_norm": 1.4009841680526733, + "learning_rate": 1.8783301712419605e-05, + "loss": 2.0511, + "mean_token_accuracy": 0.5179696083068848, + "num_tokens": 1819552704.0, + "step": 3559 + }, + { + "epoch": 0.9626825310978907, + "grad_norm": 4.079151153564453, + "learning_rate": 1.8782510042491776e-05, + "loss": 1.9813, + "mean_token_accuracy": 0.5604435801506042, + "num_tokens": 1820076778.0, + "step": 3560 + }, + { + "epoch": 0.9629529475392103, + "grad_norm": 0.7140775918960571, + "learning_rate": 1.878171813377365e-05, + "loss": 1.0854, + "mean_token_accuracy": 0.7140241861343384, + "num_tokens": 1820579670.0, + "step": 3561 + }, + { + "epoch": 0.96322336398053, + "grad_norm": 3.2806594371795654, + "learning_rate": 1.8780925986289528e-05, + "loss": 2.1844, + "mean_token_accuracy": 0.5164963006973267, + "num_tokens": 1821103905.0, + "step": 3562 + }, + { + "epoch": 0.9634937804218496, + "grad_norm": 2.8190605640411377, + "learning_rate": 1.8780133600063714e-05, + "loss": 2.0443, + "mean_token_accuracy": 0.537772536277771, + "num_tokens": 1821628133.0, + "step": 3563 + }, + { + "epoch": 0.9637641968631693, + "grad_norm": 1.3385730981826782, + "learning_rate": 1.8779340975120525e-05, + "loss": 1.956, + "mean_token_accuracy": 0.5522007942199707, + "num_tokens": 1822152356.0, + "step": 3564 + }, + { + "epoch": 0.9640346133044889, + "grad_norm": 1.840915322303772, + "learning_rate": 1.877854811148427e-05, + "loss": 1.86, + "mean_token_accuracy": 0.5813716650009155, + "num_tokens": 1822641629.0, + "step": 3565 + }, + { + "epoch": 0.9643050297458086, + "grad_norm": 2.9004671573638916, + "learning_rate": 1.8777755009179288e-05, + "loss": 2.1425, + "mean_token_accuracy": 0.535056471824646, + "num_tokens": 1823165694.0, + "step": 3566 + }, + { + "epoch": 0.9645754461871282, + "grad_norm": 2.3063278198242188, + "learning_rate": 1.877696166822991e-05, + "loss": 2.0091, + "mean_token_accuracy": 0.5490157008171082, + "num_tokens": 1823689884.0, + "step": 3567 + }, + { + "epoch": 0.9648458626284478, + "grad_norm": 2.1401214599609375, + "learning_rate": 1.8776168088660474e-05, + "loss": 2.0283, + "mean_token_accuracy": 0.5562629103660583, + "num_tokens": 1824214163.0, + "step": 3568 + }, + { + "epoch": 0.9651162790697675, + "grad_norm": 1.9365571737289429, + "learning_rate": 1.8775374270495335e-05, + "loss": 2.1922, + "mean_token_accuracy": 0.5227328538894653, + "num_tokens": 1824738413.0, + "step": 3569 + }, + { + "epoch": 0.965386695511087, + "grad_norm": 2.0118460655212402, + "learning_rate": 1.8774580213758847e-05, + "loss": 2.2206, + "mean_token_accuracy": 0.5180988311767578, + "num_tokens": 1825262688.0, + "step": 3570 + }, + { + "epoch": 0.9656571119524067, + "grad_norm": 1.4114800691604614, + "learning_rate": 1.8773785918475374e-05, + "loss": 2.001, + "mean_token_accuracy": 0.5490869283676147, + "num_tokens": 1825769950.0, + "step": 3571 + }, + { + "epoch": 0.9659275283937263, + "grad_norm": 1.5495119094848633, + "learning_rate": 1.8772991384669293e-05, + "loss": 2.116, + "mean_token_accuracy": 0.5205119848251343, + "num_tokens": 1826294090.0, + "step": 3572 + }, + { + "epoch": 0.966197944835046, + "grad_norm": 1.6955372095108032, + "learning_rate": 1.877219661236497e-05, + "loss": 2.2023, + "mean_token_accuracy": 0.5219583511352539, + "num_tokens": 1826818353.0, + "step": 3573 + }, + { + "epoch": 0.9664683612763656, + "grad_norm": 2.0230295658111572, + "learning_rate": 1.87714016015868e-05, + "loss": 2.1458, + "mean_token_accuracy": 0.5527807474136353, + "num_tokens": 1827240180.0, + "step": 3574 + }, + { + "epoch": 0.9667387777176852, + "grad_norm": 1.7966009378433228, + "learning_rate": 1.877060635235918e-05, + "loss": 2.1074, + "mean_token_accuracy": 0.5404601693153381, + "num_tokens": 1827704230.0, + "step": 3575 + }, + { + "epoch": 0.9670091941590049, + "grad_norm": 1.584836483001709, + "learning_rate": 1.8769810864706498e-05, + "loss": 2.0798, + "mean_token_accuracy": 0.5412486791610718, + "num_tokens": 1828228490.0, + "step": 3576 + }, + { + "epoch": 0.9672796106003245, + "grad_norm": 1.8079543113708496, + "learning_rate": 1.8769015138653175e-05, + "loss": 2.0919, + "mean_token_accuracy": 0.5248920321464539, + "num_tokens": 1828752765.0, + "step": 3577 + }, + { + "epoch": 0.9675500270416442, + "grad_norm": 1.8024182319641113, + "learning_rate": 1.8768219174223618e-05, + "loss": 2.1179, + "mean_token_accuracy": 0.5439010858535767, + "num_tokens": 1829254106.0, + "step": 3578 + }, + { + "epoch": 0.9678204434829638, + "grad_norm": 1.6795843839645386, + "learning_rate": 1.8767422971442253e-05, + "loss": 2.2095, + "mean_token_accuracy": 0.5137876272201538, + "num_tokens": 1829729704.0, + "step": 3579 + }, + { + "epoch": 0.9680908599242833, + "grad_norm": 1.645134687423706, + "learning_rate": 1.8766626530333507e-05, + "loss": 2.0463, + "mean_token_accuracy": 0.5371054410934448, + "num_tokens": 1830242295.0, + "step": 3580 + }, + { + "epoch": 0.968361276365603, + "grad_norm": 0.6277889013290405, + "learning_rate": 1.876582985092182e-05, + "loss": 1.158, + "mean_token_accuracy": 0.6882504224777222, + "num_tokens": 1830766344.0, + "step": 3581 + }, + { + "epoch": 0.9686316928069226, + "grad_norm": 2.1258938312530518, + "learning_rate": 1.8765032933231636e-05, + "loss": 2.0665, + "mean_token_accuracy": 0.5289781093597412, + "num_tokens": 1831290549.0, + "step": 3582 + }, + { + "epoch": 0.9689021092482423, + "grad_norm": 1.7883895635604858, + "learning_rate": 1.8764235777287402e-05, + "loss": 1.9607, + "mean_token_accuracy": 0.5684113502502441, + "num_tokens": 1831814769.0, + "step": 3583 + }, + { + "epoch": 0.9691725256895619, + "grad_norm": 1.2339015007019043, + "learning_rate": 1.8763438383113586e-05, + "loss": 2.1236, + "mean_token_accuracy": 0.5158587098121643, + "num_tokens": 1832339044.0, + "step": 3584 + }, + { + "epoch": 0.9694429421308816, + "grad_norm": 1.4593172073364258, + "learning_rate": 1.8762640750734647e-05, + "loss": 2.0438, + "mean_token_accuracy": 0.5486724972724915, + "num_tokens": 1832863298.0, + "step": 3585 + }, + { + "epoch": 0.9697133585722012, + "grad_norm": 1.4266109466552734, + "learning_rate": 1.876184288017506e-05, + "loss": 2.1113, + "mean_token_accuracy": 0.5254105925559998, + "num_tokens": 1833354672.0, + "step": 3586 + }, + { + "epoch": 0.9699837750135208, + "grad_norm": 1.2174817323684692, + "learning_rate": 1.876104477145931e-05, + "loss": 1.8881, + "mean_token_accuracy": 0.5795369744300842, + "num_tokens": 1833836610.0, + "step": 3587 + }, + { + "epoch": 0.9702541914548405, + "grad_norm": 1.5188039541244507, + "learning_rate": 1.8760246424611883e-05, + "loss": 2.2253, + "mean_token_accuracy": 0.5157367587089539, + "num_tokens": 1834360769.0, + "step": 3588 + }, + { + "epoch": 0.9705246078961601, + "grad_norm": 1.3077826499938965, + "learning_rate": 1.8759447839657273e-05, + "loss": 2.2364, + "mean_token_accuracy": 0.5072181224822998, + "num_tokens": 1834884951.0, + "step": 3589 + }, + { + "epoch": 0.9707950243374798, + "grad_norm": 1.38231360912323, + "learning_rate": 1.8758649016619987e-05, + "loss": 2.0895, + "mean_token_accuracy": 0.5212154984474182, + "num_tokens": 1835408962.0, + "step": 3590 + }, + { + "epoch": 0.9710654407787993, + "grad_norm": 1.4664583206176758, + "learning_rate": 1.875784995552453e-05, + "loss": 2.0658, + "mean_token_accuracy": 0.534752607345581, + "num_tokens": 1835904003.0, + "step": 3591 + }, + { + "epoch": 0.971335857220119, + "grad_norm": 1.4236799478530884, + "learning_rate": 1.875705065639542e-05, + "loss": 2.1914, + "mean_token_accuracy": 0.50108802318573, + "num_tokens": 1836372076.0, + "step": 3592 + }, + { + "epoch": 0.9716062736614386, + "grad_norm": 1.316184639930725, + "learning_rate": 1.875625111925719e-05, + "loss": 2.0922, + "mean_token_accuracy": 0.5408489108085632, + "num_tokens": 1836896266.0, + "step": 3593 + }, + { + "epoch": 0.9718766901027582, + "grad_norm": 1.3593028783798218, + "learning_rate": 1.875545134413436e-05, + "loss": 2.1711, + "mean_token_accuracy": 0.5102579593658447, + "num_tokens": 1837420430.0, + "step": 3594 + }, + { + "epoch": 0.9721471065440779, + "grad_norm": 1.3652311563491821, + "learning_rate": 1.8754651331051475e-05, + "loss": 1.9289, + "mean_token_accuracy": 0.5541203618049622, + "num_tokens": 1837944611.0, + "step": 3595 + }, + { + "epoch": 0.9724175229853975, + "grad_norm": 1.3001337051391602, + "learning_rate": 1.8753851080033087e-05, + "loss": 2.042, + "mean_token_accuracy": 0.5373517274856567, + "num_tokens": 1838468756.0, + "step": 3596 + }, + { + "epoch": 0.9726879394267172, + "grad_norm": 1.546716332435608, + "learning_rate": 1.8753050591103743e-05, + "loss": 2.1157, + "mean_token_accuracy": 0.5417243242263794, + "num_tokens": 1838938775.0, + "step": 3597 + }, + { + "epoch": 0.9729583558680368, + "grad_norm": 1.3563934564590454, + "learning_rate": 1.875224986428801e-05, + "loss": 2.026, + "mean_token_accuracy": 0.5571235418319702, + "num_tokens": 1839462984.0, + "step": 3598 + }, + { + "epoch": 0.9732287723093564, + "grad_norm": 1.3694595098495483, + "learning_rate": 1.8751448899610447e-05, + "loss": 2.1458, + "mean_token_accuracy": 0.5385140180587769, + "num_tokens": 1839987229.0, + "step": 3599 + }, + { + "epoch": 0.9734991887506761, + "grad_norm": 1.2424052953720093, + "learning_rate": 1.875064769709564e-05, + "loss": 2.2008, + "mean_token_accuracy": 0.5059524774551392, + "num_tokens": 1840511507.0, + "step": 3600 + }, + { + "epoch": 0.9737696051919956, + "grad_norm": 0.8841679692268372, + "learning_rate": 1.874984625676817e-05, + "loss": 1.2325, + "mean_token_accuracy": 0.6625326871871948, + "num_tokens": 1841035741.0, + "step": 3601 + }, + { + "epoch": 0.9740400216333153, + "grad_norm": 1.8644511699676514, + "learning_rate": 1.874904457865262e-05, + "loss": 2.1295, + "mean_token_accuracy": 0.5245357751846313, + "num_tokens": 1841559966.0, + "step": 3602 + }, + { + "epoch": 0.9743104380746349, + "grad_norm": 1.4505914449691772, + "learning_rate": 1.87482426627736e-05, + "loss": 2.066, + "mean_token_accuracy": 0.531521201133728, + "num_tokens": 1842084162.0, + "step": 3603 + }, + { + "epoch": 0.9745808545159546, + "grad_norm": 1.5267356634140015, + "learning_rate": 1.8747440509155705e-05, + "loss": 2.1122, + "mean_token_accuracy": 0.5416538119316101, + "num_tokens": 1842583456.0, + "step": 3604 + }, + { + "epoch": 0.9748512709572742, + "grad_norm": 1.6637321710586548, + "learning_rate": 1.8746638117823552e-05, + "loss": 2.0707, + "mean_token_accuracy": 0.5425288677215576, + "num_tokens": 1843107652.0, + "step": 3605 + }, + { + "epoch": 0.9751216873985938, + "grad_norm": 1.6061277389526367, + "learning_rate": 1.8745835488801763e-05, + "loss": 2.1842, + "mean_token_accuracy": 0.5083199739456177, + "num_tokens": 1843631895.0, + "step": 3606 + }, + { + "epoch": 0.9753921038399135, + "grad_norm": 1.7205920219421387, + "learning_rate": 1.8745032622114958e-05, + "loss": 2.1083, + "mean_token_accuracy": 0.5182367563247681, + "num_tokens": 1844156049.0, + "step": 3607 + }, + { + "epoch": 0.9756625202812331, + "grad_norm": 1.6084810495376587, + "learning_rate": 1.874422951778778e-05, + "loss": 2.1074, + "mean_token_accuracy": 0.525713324546814, + "num_tokens": 1844680307.0, + "step": 3608 + }, + { + "epoch": 0.9759329367225528, + "grad_norm": 1.3926429748535156, + "learning_rate": 1.8743426175844862e-05, + "loss": 2.0612, + "mean_token_accuracy": 0.544356107711792, + "num_tokens": 1845150626.0, + "step": 3609 + }, + { + "epoch": 0.9762033531638724, + "grad_norm": 1.289021611213684, + "learning_rate": 1.874262259631086e-05, + "loss": 2.0058, + "mean_token_accuracy": 0.5308140516281128, + "num_tokens": 1845674889.0, + "step": 3610 + }, + { + "epoch": 0.9764737696051919, + "grad_norm": 1.2999029159545898, + "learning_rate": 1.8741818779210426e-05, + "loss": 2.2689, + "mean_token_accuracy": 0.4990358054637909, + "num_tokens": 1846198959.0, + "step": 3611 + }, + { + "epoch": 0.9767441860465116, + "grad_norm": 1.290183663368225, + "learning_rate": 1.8741014724568226e-05, + "loss": 2.0809, + "mean_token_accuracy": 0.5332265496253967, + "num_tokens": 1846723227.0, + "step": 3612 + }, + { + "epoch": 0.9770146024878312, + "grad_norm": 1.5700632333755493, + "learning_rate": 1.874021043240893e-05, + "loss": 2.1649, + "mean_token_accuracy": 0.53211909532547, + "num_tokens": 1847222379.0, + "step": 3613 + }, + { + "epoch": 0.9772850189291509, + "grad_norm": 1.436323642730713, + "learning_rate": 1.873940590275721e-05, + "loss": 2.0872, + "mean_token_accuracy": 0.5519385933876038, + "num_tokens": 1847746550.0, + "step": 3614 + }, + { + "epoch": 0.9775554353704705, + "grad_norm": 1.501802682876587, + "learning_rate": 1.8738601135637764e-05, + "loss": 2.0114, + "mean_token_accuracy": 0.5419832468032837, + "num_tokens": 1848270744.0, + "step": 3615 + }, + { + "epoch": 0.9778258518117902, + "grad_norm": 1.5952650308609009, + "learning_rate": 1.8737796131075273e-05, + "loss": 2.195, + "mean_token_accuracy": 0.5295903086662292, + "num_tokens": 1848795020.0, + "step": 3616 + }, + { + "epoch": 0.9780962682531098, + "grad_norm": 1.4078415632247925, + "learning_rate": 1.8736990889094438e-05, + "loss": 2.1268, + "mean_token_accuracy": 0.5223028063774109, + "num_tokens": 1849319282.0, + "step": 3617 + }, + { + "epoch": 0.9783666846944294, + "grad_norm": 1.3349417448043823, + "learning_rate": 1.8736185409719975e-05, + "loss": 2.1723, + "mean_token_accuracy": 0.5121864080429077, + "num_tokens": 1849840651.0, + "step": 3618 + }, + { + "epoch": 0.9786371011357491, + "grad_norm": 1.7072269916534424, + "learning_rate": 1.8735379692976593e-05, + "loss": 2.1572, + "mean_token_accuracy": 0.5189391374588013, + "num_tokens": 1850364931.0, + "step": 3619 + }, + { + "epoch": 0.9789075175770687, + "grad_norm": 1.5614315271377563, + "learning_rate": 1.8734573738889013e-05, + "loss": 2.0743, + "mean_token_accuracy": 0.544964075088501, + "num_tokens": 1850889203.0, + "step": 3620 + }, + { + "epoch": 0.9791779340183883, + "grad_norm": 0.5597161054611206, + "learning_rate": 1.8733767547481965e-05, + "loss": 1.0895, + "mean_token_accuracy": 0.7155909538269043, + "num_tokens": 1851413242.0, + "step": 3621 + }, + { + "epoch": 0.9794483504597079, + "grad_norm": 2.451908588409424, + "learning_rate": 1.873296111878018e-05, + "loss": 2.1977, + "mean_token_accuracy": 0.5360158085823059, + "num_tokens": 1851874216.0, + "step": 3622 + }, + { + "epoch": 0.9797187669010275, + "grad_norm": 1.5156205892562866, + "learning_rate": 1.8732154452808415e-05, + "loss": 2.0409, + "mean_token_accuracy": 0.5444839000701904, + "num_tokens": 1852398413.0, + "step": 3623 + }, + { + "epoch": 0.9799891833423472, + "grad_norm": 1.5182673931121826, + "learning_rate": 1.873134754959141e-05, + "loss": 2.1426, + "mean_token_accuracy": 0.5225485563278198, + "num_tokens": 1852881133.0, + "step": 3624 + }, + { + "epoch": 0.9802595997836668, + "grad_norm": 1.4538015127182007, + "learning_rate": 1.8730540409153925e-05, + "loss": 2.1315, + "mean_token_accuracy": 0.5241137742996216, + "num_tokens": 1853405400.0, + "step": 3625 + }, + { + "epoch": 0.9805300162249865, + "grad_norm": 1.3772289752960205, + "learning_rate": 1.8729733031520723e-05, + "loss": 2.0703, + "mean_token_accuracy": 0.5440021753311157, + "num_tokens": 1853929551.0, + "step": 3626 + }, + { + "epoch": 0.9808004326663061, + "grad_norm": 1.3193696737289429, + "learning_rate": 1.8728925416716582e-05, + "loss": 2.017, + "mean_token_accuracy": 0.5449279546737671, + "num_tokens": 1854453815.0, + "step": 3627 + }, + { + "epoch": 0.9810708491076258, + "grad_norm": 1.3501806259155273, + "learning_rate": 1.8728117564766283e-05, + "loss": 1.9736, + "mean_token_accuracy": 0.5566884279251099, + "num_tokens": 1854942206.0, + "step": 3628 + }, + { + "epoch": 0.9813412655489454, + "grad_norm": 1.2841447591781616, + "learning_rate": 1.8727309475694605e-05, + "loss": 1.9904, + "mean_token_accuracy": 0.5750815868377686, + "num_tokens": 1855466360.0, + "step": 3629 + }, + { + "epoch": 0.981611681990265, + "grad_norm": 1.5009419918060303, + "learning_rate": 1.8726501149526354e-05, + "loss": 2.2239, + "mean_token_accuracy": 0.5132421255111694, + "num_tokens": 1855990611.0, + "step": 3630 + }, + { + "epoch": 0.9818820984315847, + "grad_norm": 1.1754193305969238, + "learning_rate": 1.872569258628632e-05, + "loss": 2.0225, + "mean_token_accuracy": 0.5478553771972656, + "num_tokens": 1856466340.0, + "step": 3631 + }, + { + "epoch": 0.9821525148729042, + "grad_norm": 2.044769763946533, + "learning_rate": 1.872488378599932e-05, + "loss": 2.0549, + "mean_token_accuracy": 0.5399582386016846, + "num_tokens": 1856990546.0, + "step": 3632 + }, + { + "epoch": 0.9824229313142239, + "grad_norm": 1.5483736991882324, + "learning_rate": 1.8724074748690167e-05, + "loss": 1.9765, + "mean_token_accuracy": 0.582927405834198, + "num_tokens": 1857514806.0, + "step": 3633 + }, + { + "epoch": 0.9826933477555435, + "grad_norm": 1.6950560808181763, + "learning_rate": 1.8723265474383684e-05, + "loss": 2.0455, + "mean_token_accuracy": 0.5578628182411194, + "num_tokens": 1858007661.0, + "step": 3634 + }, + { + "epoch": 0.9829637641968632, + "grad_norm": 1.436941385269165, + "learning_rate": 1.8722455963104707e-05, + "loss": 2.1783, + "mean_token_accuracy": 0.5319501161575317, + "num_tokens": 1858458062.0, + "step": 3635 + }, + { + "epoch": 0.9832341806381828, + "grad_norm": 1.2119295597076416, + "learning_rate": 1.8721646214878072e-05, + "loss": 2.0101, + "mean_token_accuracy": 0.5461077690124512, + "num_tokens": 1858982329.0, + "step": 3636 + }, + { + "epoch": 0.9835045970795024, + "grad_norm": 1.3109047412872314, + "learning_rate": 1.872083622972862e-05, + "loss": 1.9907, + "mean_token_accuracy": 0.5560054779052734, + "num_tokens": 1859506608.0, + "step": 3637 + }, + { + "epoch": 0.9837750135208221, + "grad_norm": 1.3939175605773926, + "learning_rate": 1.8720026007681208e-05, + "loss": 2.159, + "mean_token_accuracy": 0.5277074575424194, + "num_tokens": 1860030824.0, + "step": 3638 + }, + { + "epoch": 0.9840454299621417, + "grad_norm": 1.3909841775894165, + "learning_rate": 1.8719215548760693e-05, + "loss": 2.1166, + "mean_token_accuracy": 0.546204686164856, + "num_tokens": 1860473933.0, + "step": 3639 + }, + { + "epoch": 0.9843158464034614, + "grad_norm": 1.3174997568130493, + "learning_rate": 1.871840485299195e-05, + "loss": 2.1331, + "mean_token_accuracy": 0.5109991431236267, + "num_tokens": 1860998180.0, + "step": 3640 + }, + { + "epoch": 0.984586262844781, + "grad_norm": 0.8974982500076294, + "learning_rate": 1.871759392039984e-05, + "loss": 1.2022, + "mean_token_accuracy": 0.689599335193634, + "num_tokens": 1861482472.0, + "step": 3641 + }, + { + "epoch": 0.9848566792861005, + "grad_norm": 2.432298421859741, + "learning_rate": 1.871678275100926e-05, + "loss": 2.0804, + "mean_token_accuracy": 0.5386736392974854, + "num_tokens": 1862006679.0, + "step": 3642 + }, + { + "epoch": 0.9851270957274202, + "grad_norm": 2.114161491394043, + "learning_rate": 1.8715971344845086e-05, + "loss": 2.0732, + "mean_token_accuracy": 0.532296895980835, + "num_tokens": 1862530887.0, + "step": 3643 + }, + { + "epoch": 0.9853975121687398, + "grad_norm": 1.645024061203003, + "learning_rate": 1.8715159701932224e-05, + "loss": 2.1893, + "mean_token_accuracy": 0.522928774356842, + "num_tokens": 1863055084.0, + "step": 3644 + }, + { + "epoch": 0.9856679286100595, + "grad_norm": 1.9425361156463623, + "learning_rate": 1.871434782229557e-05, + "loss": 1.9243, + "mean_token_accuracy": 0.5847312211990356, + "num_tokens": 1863514410.0, + "step": 3645 + }, + { + "epoch": 0.9859383450513791, + "grad_norm": 1.8082088232040405, + "learning_rate": 1.8713535705960042e-05, + "loss": 2.0818, + "mean_token_accuracy": 0.525177001953125, + "num_tokens": 1864005388.0, + "step": 3646 + }, + { + "epoch": 0.9862087614926988, + "grad_norm": 1.770944356918335, + "learning_rate": 1.871272335295055e-05, + "loss": 2.0526, + "mean_token_accuracy": 0.5326244235038757, + "num_tokens": 1864477515.0, + "step": 3647 + }, + { + "epoch": 0.9864791779340184, + "grad_norm": 2.088620185852051, + "learning_rate": 1.8711910763292028e-05, + "loss": 1.9377, + "mean_token_accuracy": 0.5573447942733765, + "num_tokens": 1864994028.0, + "step": 3648 + }, + { + "epoch": 0.986749594375338, + "grad_norm": 2.213958263397217, + "learning_rate": 1.8711097937009405e-05, + "loss": 2.1415, + "mean_token_accuracy": 0.5202970504760742, + "num_tokens": 1865518305.0, + "step": 3649 + }, + { + "epoch": 0.9870200108166577, + "grad_norm": 1.6068485975265503, + "learning_rate": 1.871028487412762e-05, + "loss": 2.1155, + "mean_token_accuracy": 0.5490260124206543, + "num_tokens": 1866042424.0, + "step": 3650 + }, + { + "epoch": 0.9872904272579773, + "grad_norm": 1.5849436521530151, + "learning_rate": 1.870947157467162e-05, + "loss": 1.968, + "mean_token_accuracy": 0.5321304798126221, + "num_tokens": 1866535521.0, + "step": 3651 + }, + { + "epoch": 0.9875608436992969, + "grad_norm": 1.5742342472076416, + "learning_rate": 1.8708658038666357e-05, + "loss": 1.9559, + "mean_token_accuracy": 0.5473365783691406, + "num_tokens": 1867037304.0, + "step": 3652 + }, + { + "epoch": 0.9878312601406165, + "grad_norm": 1.3386672735214233, + "learning_rate": 1.8707844266136794e-05, + "loss": 2.0777, + "mean_token_accuracy": 0.5302547216415405, + "num_tokens": 1867561406.0, + "step": 3653 + }, + { + "epoch": 0.9881016765819362, + "grad_norm": 1.1847807168960571, + "learning_rate": 1.8707030257107908e-05, + "loss": 2.0461, + "mean_token_accuracy": 0.5419583320617676, + "num_tokens": 1868085618.0, + "step": 3654 + }, + { + "epoch": 0.9883720930232558, + "grad_norm": 1.5600498914718628, + "learning_rate": 1.8706216011604664e-05, + "loss": 1.984, + "mean_token_accuracy": 0.5354404449462891, + "num_tokens": 1868609652.0, + "step": 3655 + }, + { + "epoch": 0.9886425094645754, + "grad_norm": 1.5709983110427856, + "learning_rate": 1.8705401529652055e-05, + "loss": 2.2268, + "mean_token_accuracy": 0.5156470537185669, + "num_tokens": 1869077297.0, + "step": 3656 + }, + { + "epoch": 0.9889129259058951, + "grad_norm": 1.3722124099731445, + "learning_rate": 1.8704586811275064e-05, + "loss": 2.2022, + "mean_token_accuracy": 0.509826123714447, + "num_tokens": 1869601551.0, + "step": 3657 + }, + { + "epoch": 0.9891833423472147, + "grad_norm": 1.708245873451233, + "learning_rate": 1.870377185649869e-05, + "loss": 2.1504, + "mean_token_accuracy": 0.525040864944458, + "num_tokens": 1870125768.0, + "step": 3658 + }, + { + "epoch": 0.9894537587885344, + "grad_norm": 1.5504734516143799, + "learning_rate": 1.870295666534794e-05, + "loss": 2.1509, + "mean_token_accuracy": 0.5193731784820557, + "num_tokens": 1870614292.0, + "step": 3659 + }, + { + "epoch": 0.989724175229854, + "grad_norm": 1.3306001424789429, + "learning_rate": 1.870214123784783e-05, + "loss": 1.9756, + "mean_token_accuracy": 0.5320568680763245, + "num_tokens": 1871138391.0, + "step": 3660 + }, + { + "epoch": 0.9899945916711737, + "grad_norm": 0.7419589757919312, + "learning_rate": 1.8701325574023378e-05, + "loss": 1.1972, + "mean_token_accuracy": 0.6884564161300659, + "num_tokens": 1871662647.0, + "step": 3661 + }, + { + "epoch": 0.9902650081124933, + "grad_norm": 2.453000068664551, + "learning_rate": 1.870050967389961e-05, + "loss": 2.0969, + "mean_token_accuracy": 0.5397405624389648, + "num_tokens": 1872186846.0, + "step": 3662 + }, + { + "epoch": 0.9905354245538128, + "grad_norm": 1.868578314781189, + "learning_rate": 1.8699693537501552e-05, + "loss": 2.1018, + "mean_token_accuracy": 0.5432661771774292, + "num_tokens": 1872664401.0, + "step": 3663 + }, + { + "epoch": 0.9908058409951325, + "grad_norm": 1.3821182250976562, + "learning_rate": 1.8698877164854257e-05, + "loss": 2.0295, + "mean_token_accuracy": 0.5814230442047119, + "num_tokens": 1873123583.0, + "step": 3664 + }, + { + "epoch": 0.9910762574364521, + "grad_norm": 1.7375165224075317, + "learning_rate": 1.8698060555982767e-05, + "loss": 2.0868, + "mean_token_accuracy": 0.5421896576881409, + "num_tokens": 1873617469.0, + "step": 3665 + }, + { + "epoch": 0.9913466738777718, + "grad_norm": 1.4175403118133545, + "learning_rate": 1.8697243710912146e-05, + "loss": 2.1853, + "mean_token_accuracy": 0.5273336172103882, + "num_tokens": 1874141716.0, + "step": 3666 + }, + { + "epoch": 0.9916170903190914, + "grad_norm": 1.3285259008407593, + "learning_rate": 1.869642662966745e-05, + "loss": 2.0505, + "mean_token_accuracy": 0.5370315909385681, + "num_tokens": 1874665919.0, + "step": 3667 + }, + { + "epoch": 0.991887506760411, + "grad_norm": 1.2562522888183594, + "learning_rate": 1.8695609312273752e-05, + "loss": 2.019, + "mean_token_accuracy": 0.5528623461723328, + "num_tokens": 1875165189.0, + "step": 3668 + }, + { + "epoch": 0.9921579232017307, + "grad_norm": 1.1292027235031128, + "learning_rate": 1.8694791758756127e-05, + "loss": 1.9442, + "mean_token_accuracy": 0.5576960444450378, + "num_tokens": 1875676549.0, + "step": 3669 + }, + { + "epoch": 0.9924283396430503, + "grad_norm": 1.5145808458328247, + "learning_rate": 1.869397396913966e-05, + "loss": 2.1476, + "mean_token_accuracy": 0.5255885720252991, + "num_tokens": 1876160038.0, + "step": 3670 + }, + { + "epoch": 0.99269875608437, + "grad_norm": 1.3772870302200317, + "learning_rate": 1.8693155943449447e-05, + "loss": 2.1568, + "mean_token_accuracy": 0.5353766679763794, + "num_tokens": 1876684207.0, + "step": 3671 + }, + { + "epoch": 0.9929691725256896, + "grad_norm": 1.357801079750061, + "learning_rate": 1.869233768171059e-05, + "loss": 2.1805, + "mean_token_accuracy": 0.5369908213615417, + "num_tokens": 1877208383.0, + "step": 3672 + }, + { + "epoch": 0.9932395889670091, + "grad_norm": 1.7219196557998657, + "learning_rate": 1.8691519183948184e-05, + "loss": 2.0991, + "mean_token_accuracy": 0.5431872010231018, + "num_tokens": 1877732658.0, + "step": 3673 + }, + { + "epoch": 0.9935100054083288, + "grad_norm": 1.241278886795044, + "learning_rate": 1.8690700450187353e-05, + "loss": 2.0971, + "mean_token_accuracy": 0.521263837814331, + "num_tokens": 1878256918.0, + "step": 3674 + }, + { + "epoch": 0.9937804218496484, + "grad_norm": 1.4468785524368286, + "learning_rate": 1.8689881480453215e-05, + "loss": 2.1866, + "mean_token_accuracy": 0.5175731778144836, + "num_tokens": 1878780905.0, + "step": 3675 + }, + { + "epoch": 0.9940508382909681, + "grad_norm": 1.5758466720581055, + "learning_rate": 1.8689062274770898e-05, + "loss": 2.1241, + "mean_token_accuracy": 0.5271672010421753, + "num_tokens": 1879305086.0, + "step": 3676 + }, + { + "epoch": 0.9943212547322877, + "grad_norm": 1.4293595552444458, + "learning_rate": 1.8688242833165544e-05, + "loss": 2.0137, + "mean_token_accuracy": 0.549246072769165, + "num_tokens": 1879791733.0, + "step": 3677 + }, + { + "epoch": 0.9945916711736074, + "grad_norm": 1.7617759704589844, + "learning_rate": 1.8687423155662283e-05, + "loss": 2.1166, + "mean_token_accuracy": 0.5245968103408813, + "num_tokens": 1880265014.0, + "step": 3678 + }, + { + "epoch": 0.994862087614927, + "grad_norm": 1.5013583898544312, + "learning_rate": 1.868660324228628e-05, + "loss": 2.2144, + "mean_token_accuracy": 0.5344693660736084, + "num_tokens": 1880778919.0, + "step": 3679 + }, + { + "epoch": 0.9951325040562466, + "grad_norm": 1.6896586418151855, + "learning_rate": 1.8685783093062676e-05, + "loss": 2.1575, + "mean_token_accuracy": 0.5259087085723877, + "num_tokens": 1881303107.0, + "step": 3680 + }, + { + "epoch": 0.9954029204975663, + "grad_norm": 0.7583405375480652, + "learning_rate": 1.868496270801665e-05, + "loss": 1.1684, + "mean_token_accuracy": 0.6837255954742432, + "num_tokens": 1881827282.0, + "step": 3681 + }, + { + "epoch": 0.9956733369388859, + "grad_norm": 2.064842939376831, + "learning_rate": 1.8684142087173366e-05, + "loss": 2.0447, + "mean_token_accuracy": 0.5205317139625549, + "num_tokens": 1882351499.0, + "step": 3682 + }, + { + "epoch": 0.9959437533802055, + "grad_norm": 1.6572457551956177, + "learning_rate": 1.8683321230558006e-05, + "loss": 2.046, + "mean_token_accuracy": 0.5482823252677917, + "num_tokens": 1882875754.0, + "step": 3683 + }, + { + "epoch": 0.9962141698215251, + "grad_norm": 1.431368112564087, + "learning_rate": 1.8682500138195752e-05, + "loss": 2.1541, + "mean_token_accuracy": 0.516918420791626, + "num_tokens": 1883399966.0, + "step": 3684 + }, + { + "epoch": 0.9964845862628448, + "grad_norm": 1.583105206489563, + "learning_rate": 1.8681678810111805e-05, + "loss": 2.0136, + "mean_token_accuracy": 0.5129697918891907, + "num_tokens": 1883924079.0, + "step": 3685 + }, + { + "epoch": 0.9967550027041644, + "grad_norm": 1.5963313579559326, + "learning_rate": 1.8680857246331362e-05, + "loss": 2.1373, + "mean_token_accuracy": 0.5305863618850708, + "num_tokens": 1884448238.0, + "step": 3686 + }, + { + "epoch": 0.997025419145484, + "grad_norm": 1.4570378065109253, + "learning_rate": 1.868003544687963e-05, + "loss": 2.1001, + "mean_token_accuracy": 0.5345909595489502, + "num_tokens": 1884972508.0, + "step": 3687 + }, + { + "epoch": 0.9972958355868037, + "grad_norm": 1.534919261932373, + "learning_rate": 1.8679213411781823e-05, + "loss": 2.1142, + "mean_token_accuracy": 0.543402910232544, + "num_tokens": 1885496775.0, + "step": 3688 + }, + { + "epoch": 0.9975662520281233, + "grad_norm": 1.4361426830291748, + "learning_rate": 1.867839114106317e-05, + "loss": 2.1237, + "mean_token_accuracy": 0.5293265581130981, + "num_tokens": 1886003934.0, + "step": 3689 + }, + { + "epoch": 0.997836668469443, + "grad_norm": 1.505721092224121, + "learning_rate": 1.867756863474889e-05, + "loss": 1.9884, + "mean_token_accuracy": 0.53346848487854, + "num_tokens": 1886528197.0, + "step": 3690 + }, + { + "epoch": 0.9981070849107626, + "grad_norm": 1.3893754482269287, + "learning_rate": 1.867674589286423e-05, + "loss": 1.9514, + "mean_token_accuracy": 0.5562463402748108, + "num_tokens": 1887052448.0, + "step": 3691 + }, + { + "epoch": 0.9983775013520823, + "grad_norm": 1.1698765754699707, + "learning_rate": 1.867592291543443e-05, + "loss": 1.958, + "mean_token_accuracy": 0.5417982935905457, + "num_tokens": 1887576709.0, + "step": 3692 + }, + { + "epoch": 0.9986479177934018, + "grad_norm": 1.6550142765045166, + "learning_rate": 1.867509970248474e-05, + "loss": 2.1986, + "mean_token_accuracy": 0.5177998542785645, + "num_tokens": 1888100991.0, + "step": 3693 + }, + { + "epoch": 0.9989183342347214, + "grad_norm": 1.2540888786315918, + "learning_rate": 1.8674276254040424e-05, + "loss": 2.0811, + "mean_token_accuracy": 0.5351106524467468, + "num_tokens": 1888625266.0, + "step": 3694 + }, + { + "epoch": 0.9991887506760411, + "grad_norm": 1.716618537902832, + "learning_rate": 1.867345257012674e-05, + "loss": 2.1933, + "mean_token_accuracy": 0.5101892948150635, + "num_tokens": 1889149502.0, + "step": 3695 + }, + { + "epoch": 0.9994591671173607, + "grad_norm": 1.4392321109771729, + "learning_rate": 1.8672628650768964e-05, + "loss": 2.2069, + "mean_token_accuracy": 0.5426144003868103, + "num_tokens": 1889538206.0, + "step": 3696 + }, + { + "epoch": 0.9997295835586804, + "grad_norm": 1.482047200202942, + "learning_rate": 1.867180449599238e-05, + "loss": 1.9652, + "mean_token_accuracy": 0.5616731643676758, + "num_tokens": 1890062338.0, + "step": 3697 + }, + { + "epoch": 1.0, + "grad_norm": 1.5538361072540283, + "learning_rate": 1.8670980105822272e-05, + "loss": 2.171, + "mean_token_accuracy": 0.5366696119308472, + "num_tokens": 1890324479.0, + "step": 3698 + }, + { + "epoch": 1.0002704164413196, + "grad_norm": 1.3925338983535767, + "learning_rate": 1.8670155480283937e-05, + "loss": 2.0583, + "mean_token_accuracy": 0.5436079502105713, + "num_tokens": 1890848514.0, + "step": 3699 + }, + { + "epoch": 1.0005408328826393, + "grad_norm": 1.659204363822937, + "learning_rate": 1.866933061940267e-05, + "loss": 2.2253, + "mean_token_accuracy": 0.52586829662323, + "num_tokens": 1891372686.0, + "step": 3700 + }, + { + "epoch": 1.000811249323959, + "grad_norm": 0.7238206267356873, + "learning_rate": 1.866850552320379e-05, + "loss": 1.0987, + "mean_token_accuracy": 0.7123465538024902, + "num_tokens": 1891863149.0, + "step": 3701 + }, + { + "epoch": 1.0010816657652786, + "grad_norm": 2.8373448848724365, + "learning_rate": 1.866768019171261e-05, + "loss": 2.0919, + "mean_token_accuracy": 0.5499080419540405, + "num_tokens": 1892352735.0, + "step": 3702 + }, + { + "epoch": 1.0013520822065982, + "grad_norm": 2.145596981048584, + "learning_rate": 1.866685462495445e-05, + "loss": 2.078, + "mean_token_accuracy": 0.5375428795814514, + "num_tokens": 1892876872.0, + "step": 3703 + }, + { + "epoch": 1.0016224986479179, + "grad_norm": 1.3026772737503052, + "learning_rate": 1.8666028822954644e-05, + "loss": 1.9792, + "mean_token_accuracy": 0.5741406679153442, + "num_tokens": 1893401106.0, + "step": 3704 + }, + { + "epoch": 1.0018929150892375, + "grad_norm": 2.2258365154266357, + "learning_rate": 1.866520278573853e-05, + "loss": 2.1578, + "mean_token_accuracy": 0.5317258834838867, + "num_tokens": 1893925293.0, + "step": 3705 + }, + { + "epoch": 1.0021633315305571, + "grad_norm": 2.2578623294830322, + "learning_rate": 1.866437651333145e-05, + "loss": 2.1232, + "mean_token_accuracy": 0.5309945344924927, + "num_tokens": 1894441583.0, + "step": 3706 + }, + { + "epoch": 1.0024337479718768, + "grad_norm": 1.5822633504867554, + "learning_rate": 1.8663550005758762e-05, + "loss": 1.991, + "mean_token_accuracy": 0.5578569173812866, + "num_tokens": 1894965206.0, + "step": 3707 + }, + { + "epoch": 1.0027041644131964, + "grad_norm": 1.8178749084472656, + "learning_rate": 1.8662723263045824e-05, + "loss": 2.1892, + "mean_token_accuracy": 0.5116317272186279, + "num_tokens": 1895489403.0, + "step": 3708 + }, + { + "epoch": 1.0029745808545159, + "grad_norm": 1.7891765832901, + "learning_rate": 1.8661896285217997e-05, + "loss": 2.0975, + "mean_token_accuracy": 0.5401831865310669, + "num_tokens": 1896013674.0, + "step": 3709 + }, + { + "epoch": 1.0032449972958355, + "grad_norm": 1.5102721452713013, + "learning_rate": 1.8661069072300667e-05, + "loss": 2.0792, + "mean_token_accuracy": 0.5313277244567871, + "num_tokens": 1896511556.0, + "step": 3710 + }, + { + "epoch": 1.0035154137371551, + "grad_norm": 1.4245857000350952, + "learning_rate": 1.86602416243192e-05, + "loss": 2.1504, + "mean_token_accuracy": 0.5311034917831421, + "num_tokens": 1897019577.0, + "step": 3711 + }, + { + "epoch": 1.0037858301784748, + "grad_norm": 1.4878708124160767, + "learning_rate": 1.8659413941299e-05, + "loss": 2.1081, + "mean_token_accuracy": 0.5253729820251465, + "num_tokens": 1897492233.0, + "step": 3712 + }, + { + "epoch": 1.0040562466197944, + "grad_norm": 1.4056999683380127, + "learning_rate": 1.865858602326545e-05, + "loss": 2.1696, + "mean_token_accuracy": 0.532268226146698, + "num_tokens": 1898016404.0, + "step": 3713 + }, + { + "epoch": 1.004326663061114, + "grad_norm": 1.4685275554656982, + "learning_rate": 1.8657757870243963e-05, + "loss": 2.0364, + "mean_token_accuracy": 0.5338144898414612, + "num_tokens": 1898535422.0, + "step": 3714 + }, + { + "epoch": 1.0045970795024337, + "grad_norm": 1.6554590463638306, + "learning_rate": 1.8656929482259937e-05, + "loss": 1.9955, + "mean_token_accuracy": 0.5487486124038696, + "num_tokens": 1899059560.0, + "step": 3715 + }, + { + "epoch": 1.0048674959437534, + "grad_norm": 1.4619358777999878, + "learning_rate": 1.86561008593388e-05, + "loss": 2.2145, + "mean_token_accuracy": 0.5077492594718933, + "num_tokens": 1899583827.0, + "step": 3716 + }, + { + "epoch": 1.005137912385073, + "grad_norm": 1.4924185276031494, + "learning_rate": 1.8655272001505978e-05, + "loss": 2.1295, + "mean_token_accuracy": 0.5330953598022461, + "num_tokens": 1900023029.0, + "step": 3717 + }, + { + "epoch": 1.0054083288263926, + "grad_norm": 1.2656630277633667, + "learning_rate": 1.8654442908786898e-05, + "loss": 2.065, + "mean_token_accuracy": 0.5395591259002686, + "num_tokens": 1900547313.0, + "step": 3718 + }, + { + "epoch": 1.0056787452677123, + "grad_norm": 1.660566806793213, + "learning_rate": 1.8653613581206994e-05, + "loss": 2.0452, + "mean_token_accuracy": 0.5420196056365967, + "num_tokens": 1901019097.0, + "step": 3719 + }, + { + "epoch": 1.005949161709032, + "grad_norm": 1.7201350927352905, + "learning_rate": 1.8652784018791722e-05, + "loss": 2.131, + "mean_token_accuracy": 0.5426150560379028, + "num_tokens": 1901492270.0, + "step": 3720 + }, + { + "epoch": 1.0062195781503516, + "grad_norm": 0.7983320951461792, + "learning_rate": 1.8651954221566527e-05, + "loss": 1.2032, + "mean_token_accuracy": 0.685767412185669, + "num_tokens": 1902016535.0, + "step": 3721 + }, + { + "epoch": 1.0064899945916712, + "grad_norm": 2.833818197250366, + "learning_rate": 1.865112418955688e-05, + "loss": 2.0439, + "mean_token_accuracy": 0.5340520739555359, + "num_tokens": 1902540817.0, + "step": 3722 + }, + { + "epoch": 1.0067604110329909, + "grad_norm": 2.5102956295013428, + "learning_rate": 1.8650293922788238e-05, + "loss": 2.1755, + "mean_token_accuracy": 0.5071306228637695, + "num_tokens": 1903065076.0, + "step": 3723 + }, + { + "epoch": 1.0070308274743105, + "grad_norm": 1.7454530000686646, + "learning_rate": 1.8649463421286077e-05, + "loss": 2.071, + "mean_token_accuracy": 0.5458676815032959, + "num_tokens": 1903579433.0, + "step": 3724 + }, + { + "epoch": 1.0073012439156301, + "grad_norm": 2.21905779838562, + "learning_rate": 1.8648632685075888e-05, + "loss": 2.1665, + "mean_token_accuracy": 0.5267132520675659, + "num_tokens": 1904103598.0, + "step": 3725 + }, + { + "epoch": 1.0075716603569498, + "grad_norm": 1.9348238706588745, + "learning_rate": 1.8647801714183152e-05, + "loss": 1.9766, + "mean_token_accuracy": 0.5504130721092224, + "num_tokens": 1904627743.0, + "step": 3726 + }, + { + "epoch": 1.0078420767982694, + "grad_norm": 2.32373309135437, + "learning_rate": 1.8646970508633372e-05, + "loss": 2.0759, + "mean_token_accuracy": 0.5109173059463501, + "num_tokens": 1905151985.0, + "step": 3727 + }, + { + "epoch": 1.008112493239589, + "grad_norm": 2.754122734069824, + "learning_rate": 1.8646139068452047e-05, + "loss": 2.0918, + "mean_token_accuracy": 0.5489393472671509, + "num_tokens": 1905630300.0, + "step": 3728 + }, + { + "epoch": 1.0083829096809085, + "grad_norm": 2.097630023956299, + "learning_rate": 1.864530739366469e-05, + "loss": 1.9962, + "mean_token_accuracy": 0.5712932348251343, + "num_tokens": 1906154463.0, + "step": 3729 + }, + { + "epoch": 1.0086533261222281, + "grad_norm": 2.0946242809295654, + "learning_rate": 1.8644475484296816e-05, + "loss": 2.0086, + "mean_token_accuracy": 0.5581051111221313, + "num_tokens": 1906656754.0, + "step": 3730 + }, + { + "epoch": 1.0089237425635478, + "grad_norm": 2.3835809230804443, + "learning_rate": 1.8643643340373952e-05, + "loss": 2.1811, + "mean_token_accuracy": 0.5212585926055908, + "num_tokens": 1907181009.0, + "step": 3731 + }, + { + "epoch": 1.0091941590048674, + "grad_norm": 1.4639934301376343, + "learning_rate": 1.8642810961921638e-05, + "loss": 2.1902, + "mean_token_accuracy": 0.5377751588821411, + "num_tokens": 1907642860.0, + "step": 3732 + }, + { + "epoch": 1.009464575446187, + "grad_norm": 1.65915846824646, + "learning_rate": 1.8641978348965403e-05, + "loss": 2.1578, + "mean_token_accuracy": 0.5194535255432129, + "num_tokens": 1908167042.0, + "step": 3733 + }, + { + "epoch": 1.0097349918875067, + "grad_norm": 1.9796375036239624, + "learning_rate": 1.86411455015308e-05, + "loss": 2.0689, + "mean_token_accuracy": 0.5340542793273926, + "num_tokens": 1908691299.0, + "step": 3734 + }, + { + "epoch": 1.0100054083288263, + "grad_norm": 1.969061017036438, + "learning_rate": 1.8640312419643383e-05, + "loss": 1.9946, + "mean_token_accuracy": 0.5485696196556091, + "num_tokens": 1909215481.0, + "step": 3735 + }, + { + "epoch": 1.010275824770146, + "grad_norm": 1.7690231800079346, + "learning_rate": 1.863947910332871e-05, + "loss": 2.0646, + "mean_token_accuracy": 0.5649882555007935, + "num_tokens": 1909704435.0, + "step": 3736 + }, + { + "epoch": 1.0105462412114656, + "grad_norm": 2.095127582550049, + "learning_rate": 1.8638645552612353e-05, + "loss": 2.1899, + "mean_token_accuracy": 0.5392510294914246, + "num_tokens": 1910171103.0, + "step": 3737 + }, + { + "epoch": 1.0108166576527853, + "grad_norm": 1.6442577838897705, + "learning_rate": 1.8637811767519887e-05, + "loss": 2.071, + "mean_token_accuracy": 0.5250784754753113, + "num_tokens": 1910695370.0, + "step": 3738 + }, + { + "epoch": 1.011087074094105, + "grad_norm": 2.086820125579834, + "learning_rate": 1.8636977748076895e-05, + "loss": 2.228, + "mean_token_accuracy": 0.5048056840896606, + "num_tokens": 1911219601.0, + "step": 3739 + }, + { + "epoch": 1.0113574905354246, + "grad_norm": 1.6282659769058228, + "learning_rate": 1.8636143494308964e-05, + "loss": 2.032, + "mean_token_accuracy": 0.5444517135620117, + "num_tokens": 1911743583.0, + "step": 3740 + }, + { + "epoch": 1.0116279069767442, + "grad_norm": 1.0623513460159302, + "learning_rate": 1.8635309006241697e-05, + "loss": 1.2159, + "mean_token_accuracy": 0.6807768940925598, + "num_tokens": 1912267676.0, + "step": 3741 + }, + { + "epoch": 1.0118983234180638, + "grad_norm": 2.471268653869629, + "learning_rate": 1.8634474283900695e-05, + "loss": 2.0494, + "mean_token_accuracy": 0.5358389616012573, + "num_tokens": 1912762993.0, + "step": 3742 + }, + { + "epoch": 1.0121687398593835, + "grad_norm": 2.0433132648468018, + "learning_rate": 1.8633639327311573e-05, + "loss": 2.0795, + "mean_token_accuracy": 0.5276553630828857, + "num_tokens": 1913287131.0, + "step": 3743 + }, + { + "epoch": 1.0124391563007031, + "grad_norm": 1.5869139432907104, + "learning_rate": 1.8632804136499945e-05, + "loss": 2.0302, + "mean_token_accuracy": 0.5609850883483887, + "num_tokens": 1913764543.0, + "step": 3744 + }, + { + "epoch": 1.0127095727420228, + "grad_norm": 2.0044636726379395, + "learning_rate": 1.863196871149144e-05, + "loss": 2.1383, + "mean_token_accuracy": 0.5368988513946533, + "num_tokens": 1914231648.0, + "step": 3745 + }, + { + "epoch": 1.0129799891833424, + "grad_norm": 1.8433834314346313, + "learning_rate": 1.863113305231169e-05, + "loss": 1.998, + "mean_token_accuracy": 0.5470367670059204, + "num_tokens": 1914755926.0, + "step": 3746 + }, + { + "epoch": 1.013250405624662, + "grad_norm": 1.7327966690063477, + "learning_rate": 1.8630297158986335e-05, + "loss": 2.0691, + "mean_token_accuracy": 0.5334510803222656, + "num_tokens": 1915280120.0, + "step": 3747 + }, + { + "epoch": 1.0135208220659817, + "grad_norm": 2.042924642562866, + "learning_rate": 1.8629461031541025e-05, + "loss": 2.0968, + "mean_token_accuracy": 0.5380619764328003, + "num_tokens": 1915804281.0, + "step": 3748 + }, + { + "epoch": 1.0137912385073014, + "grad_norm": 1.5731133222579956, + "learning_rate": 1.8628624670001417e-05, + "loss": 2.0631, + "mean_token_accuracy": 0.5557007789611816, + "num_tokens": 1916280472.0, + "step": 3749 + }, + { + "epoch": 1.0140616549486208, + "grad_norm": 1.621637225151062, + "learning_rate": 1.8627788074393164e-05, + "loss": 2.0451, + "mean_token_accuracy": 0.5455504655838013, + "num_tokens": 1916804710.0, + "step": 3750 + }, + { + "epoch": 1.0143320713899404, + "grad_norm": 2.401313066482544, + "learning_rate": 1.8626951244741943e-05, + "loss": 2.0827, + "mean_token_accuracy": 0.5394260883331299, + "num_tokens": 1917328976.0, + "step": 3751 + }, + { + "epoch": 1.01460248783126, + "grad_norm": 1.4745680093765259, + "learning_rate": 1.862611418107343e-05, + "loss": 2.1383, + "mean_token_accuracy": 0.5279103517532349, + "num_tokens": 1917770998.0, + "step": 3752 + }, + { + "epoch": 1.0148729042725797, + "grad_norm": 1.8350430727005005, + "learning_rate": 1.8625276883413308e-05, + "loss": 2.0893, + "mean_token_accuracy": 0.5327574610710144, + "num_tokens": 1918295158.0, + "step": 3753 + }, + { + "epoch": 1.0151433207138993, + "grad_norm": 1.8817675113677979, + "learning_rate": 1.862443935178726e-05, + "loss": 2.0394, + "mean_token_accuracy": 0.552963137626648, + "num_tokens": 1918819275.0, + "step": 3754 + }, + { + "epoch": 1.015413737155219, + "grad_norm": 1.4429975748062134, + "learning_rate": 1.8623601586221e-05, + "loss": 2.1458, + "mean_token_accuracy": 0.5372348427772522, + "num_tokens": 1919343449.0, + "step": 3755 + }, + { + "epoch": 1.0156841535965386, + "grad_norm": 1.524972915649414, + "learning_rate": 1.862276358674022e-05, + "loss": 2.0914, + "mean_token_accuracy": 0.5313022136688232, + "num_tokens": 1919867606.0, + "step": 3756 + }, + { + "epoch": 1.0159545700378583, + "grad_norm": 1.6840806007385254, + "learning_rate": 1.8621925353370637e-05, + "loss": 1.9105, + "mean_token_accuracy": 0.5891019105911255, + "num_tokens": 1920391781.0, + "step": 3757 + }, + { + "epoch": 1.016224986479178, + "grad_norm": 1.5454721450805664, + "learning_rate": 1.8621086886137967e-05, + "loss": 2.0453, + "mean_token_accuracy": 0.5392612814903259, + "num_tokens": 1920915961.0, + "step": 3758 + }, + { + "epoch": 1.0164954029204976, + "grad_norm": 1.4045137166976929, + "learning_rate": 1.862024818506794e-05, + "loss": 2.1879, + "mean_token_accuracy": 0.533239483833313, + "num_tokens": 1921440231.0, + "step": 3759 + }, + { + "epoch": 1.0167658193618172, + "grad_norm": 1.316797137260437, + "learning_rate": 1.861940925018629e-05, + "loss": 2.1368, + "mean_token_accuracy": 0.5198217034339905, + "num_tokens": 1921964379.0, + "step": 3760 + }, + { + "epoch": 1.0170362358031368, + "grad_norm": 0.7478447556495667, + "learning_rate": 1.8618570081518755e-05, + "loss": 1.1339, + "mean_token_accuracy": 0.6970477104187012, + "num_tokens": 1922488592.0, + "step": 3761 + }, + { + "epoch": 1.0173066522444565, + "grad_norm": 2.855562210083008, + "learning_rate": 1.8617730679091086e-05, + "loss": 2.1279, + "mean_token_accuracy": 0.5411010980606079, + "num_tokens": 1923006901.0, + "step": 3762 + }, + { + "epoch": 1.0175770686857761, + "grad_norm": 2.1232426166534424, + "learning_rate": 1.8616891042929036e-05, + "loss": 1.9858, + "mean_token_accuracy": 0.5369304418563843, + "num_tokens": 1923531158.0, + "step": 3763 + }, + { + "epoch": 1.0178474851270958, + "grad_norm": 1.3745092153549194, + "learning_rate": 1.8616051173058374e-05, + "loss": 1.9805, + "mean_token_accuracy": 0.5548393726348877, + "num_tokens": 1924055335.0, + "step": 3764 + }, + { + "epoch": 1.0181179015684154, + "grad_norm": 2.072387456893921, + "learning_rate": 1.861521106950486e-05, + "loss": 2.1676, + "mean_token_accuracy": 0.5141689777374268, + "num_tokens": 1924579459.0, + "step": 3765 + }, + { + "epoch": 1.018388318009735, + "grad_norm": 1.979711651802063, + "learning_rate": 1.8614370732294278e-05, + "loss": 2.1595, + "mean_token_accuracy": 0.5282601118087769, + "num_tokens": 1925103629.0, + "step": 3766 + }, + { + "epoch": 1.0186587344510547, + "grad_norm": 1.4967840909957886, + "learning_rate": 1.8613530161452407e-05, + "loss": 2.1949, + "mean_token_accuracy": 0.5115534067153931, + "num_tokens": 1925627873.0, + "step": 3767 + }, + { + "epoch": 1.0189291508923743, + "grad_norm": 2.110017776489258, + "learning_rate": 1.8612689357005043e-05, + "loss": 2.0803, + "mean_token_accuracy": 0.5481574535369873, + "num_tokens": 1926130018.0, + "step": 3768 + }, + { + "epoch": 1.019199567333694, + "grad_norm": 2.0906827449798584, + "learning_rate": 1.8611848318977983e-05, + "loss": 2.132, + "mean_token_accuracy": 0.5487714409828186, + "num_tokens": 1926609931.0, + "step": 3769 + }, + { + "epoch": 1.0194699837750134, + "grad_norm": 1.8020799160003662, + "learning_rate": 1.861100704739703e-05, + "loss": 2.2149, + "mean_token_accuracy": 0.5151528120040894, + "num_tokens": 1927110597.0, + "step": 3770 + }, + { + "epoch": 1.019740400216333, + "grad_norm": 1.7441643476486206, + "learning_rate": 1.8610165542287997e-05, + "loss": 2.0515, + "mean_token_accuracy": 0.5443954467773438, + "num_tokens": 1927634879.0, + "step": 3771 + }, + { + "epoch": 1.0200108166576527, + "grad_norm": 1.5304628610610962, + "learning_rate": 1.8609323803676703e-05, + "loss": 1.8703, + "mean_token_accuracy": 0.5767700672149658, + "num_tokens": 1928159126.0, + "step": 3772 + }, + { + "epoch": 1.0202812330989723, + "grad_norm": 1.446612000465393, + "learning_rate": 1.8608481831588983e-05, + "loss": 2.0786, + "mean_token_accuracy": 0.5138448476791382, + "num_tokens": 1928683279.0, + "step": 3773 + }, + { + "epoch": 1.020551649540292, + "grad_norm": 1.4429010152816772, + "learning_rate": 1.8607639626050662e-05, + "loss": 2.1159, + "mean_token_accuracy": 0.5226731896400452, + "num_tokens": 1929207483.0, + "step": 3774 + }, + { + "epoch": 1.0208220659816116, + "grad_norm": 1.7034907341003418, + "learning_rate": 1.8606797187087587e-05, + "loss": 2.1807, + "mean_token_accuracy": 0.5241184234619141, + "num_tokens": 1929731731.0, + "step": 3775 + }, + { + "epoch": 1.0210924824229313, + "grad_norm": 1.3487179279327393, + "learning_rate": 1.8605954514725603e-05, + "loss": 2.1255, + "mean_token_accuracy": 0.5436452627182007, + "num_tokens": 1930256016.0, + "step": 3776 + }, + { + "epoch": 1.021362898864251, + "grad_norm": 1.3206145763397217, + "learning_rate": 1.8605111608990566e-05, + "loss": 2.1301, + "mean_token_accuracy": 0.5477367639541626, + "num_tokens": 1930721057.0, + "step": 3777 + }, + { + "epoch": 1.0216333153055706, + "grad_norm": 1.1447285413742065, + "learning_rate": 1.8604268469908343e-05, + "loss": 2.0084, + "mean_token_accuracy": 0.5521541833877563, + "num_tokens": 1931182030.0, + "step": 3778 + }, + { + "epoch": 1.0219037317468902, + "grad_norm": 1.406466007232666, + "learning_rate": 1.8603425097504796e-05, + "loss": 2.1392, + "mean_token_accuracy": 0.5328783988952637, + "num_tokens": 1931706261.0, + "step": 3779 + }, + { + "epoch": 1.0221741481882098, + "grad_norm": 1.1420400142669678, + "learning_rate": 1.8602581491805806e-05, + "loss": 2.0872, + "mean_token_accuracy": 0.5400331020355225, + "num_tokens": 1932180011.0, + "step": 3780 + }, + { + "epoch": 1.0224445646295295, + "grad_norm": 0.7381772398948669, + "learning_rate": 1.860173765283726e-05, + "loss": 1.2042, + "mean_token_accuracy": 0.6869558691978455, + "num_tokens": 1932692739.0, + "step": 3781 + }, + { + "epoch": 1.0227149810708491, + "grad_norm": 1.8587446212768555, + "learning_rate": 1.860089358062505e-05, + "loss": 2.0267, + "mean_token_accuracy": 0.5804258584976196, + "num_tokens": 1933151668.0, + "step": 3782 + }, + { + "epoch": 1.0229853975121688, + "grad_norm": 1.7931017875671387, + "learning_rate": 1.860004927519507e-05, + "loss": 2.04, + "mean_token_accuracy": 0.5511513948440552, + "num_tokens": 1933675874.0, + "step": 3783 + }, + { + "epoch": 1.0232558139534884, + "grad_norm": 1.634425401687622, + "learning_rate": 1.859920473657323e-05, + "loss": 2.1753, + "mean_token_accuracy": 0.5312925577163696, + "num_tokens": 1934200118.0, + "step": 3784 + }, + { + "epoch": 1.023526230394808, + "grad_norm": 1.6809910535812378, + "learning_rate": 1.8598359964785437e-05, + "loss": 2.1574, + "mean_token_accuracy": 0.5255944728851318, + "num_tokens": 1934724293.0, + "step": 3785 + }, + { + "epoch": 1.0237966468361277, + "grad_norm": 1.803151249885559, + "learning_rate": 1.8597514959857618e-05, + "loss": 2.1332, + "mean_token_accuracy": 0.5375911593437195, + "num_tokens": 1935213230.0, + "step": 3786 + }, + { + "epoch": 1.0240670632774473, + "grad_norm": 1.423020601272583, + "learning_rate": 1.8596669721815697e-05, + "loss": 2.1845, + "mean_token_accuracy": 0.5366488695144653, + "num_tokens": 1935737489.0, + "step": 3787 + }, + { + "epoch": 1.024337479718767, + "grad_norm": 1.5263899564743042, + "learning_rate": 1.8595824250685606e-05, + "loss": 1.9446, + "mean_token_accuracy": 0.5348371863365173, + "num_tokens": 1936261620.0, + "step": 3788 + }, + { + "epoch": 1.0246078961600866, + "grad_norm": 2.0407073497772217, + "learning_rate": 1.859497854649329e-05, + "loss": 2.0793, + "mean_token_accuracy": 0.546489953994751, + "num_tokens": 1936667211.0, + "step": 3789 + }, + { + "epoch": 1.0248783126014063, + "grad_norm": 1.5622830390930176, + "learning_rate": 1.85941326092647e-05, + "loss": 2.0891, + "mean_token_accuracy": 0.5282431840896606, + "num_tokens": 1937191426.0, + "step": 3790 + }, + { + "epoch": 1.0251487290427257, + "grad_norm": 1.5430713891983032, + "learning_rate": 1.8593286439025784e-05, + "loss": 2.1163, + "mean_token_accuracy": 0.5446996688842773, + "num_tokens": 1937715646.0, + "step": 3791 + }, + { + "epoch": 1.0254191454840453, + "grad_norm": 1.8737462759017944, + "learning_rate": 1.8592440035802515e-05, + "loss": 2.1713, + "mean_token_accuracy": 0.5509107708930969, + "num_tokens": 1938148518.0, + "step": 3792 + }, + { + "epoch": 1.025689561925365, + "grad_norm": 1.617722988128662, + "learning_rate": 1.8591593399620855e-05, + "loss": 2.0944, + "mean_token_accuracy": 0.5253862738609314, + "num_tokens": 1938672699.0, + "step": 3793 + }, + { + "epoch": 1.0259599783666846, + "grad_norm": 1.7336862087249756, + "learning_rate": 1.8590746530506785e-05, + "loss": 2.1742, + "mean_token_accuracy": 0.5245962738990784, + "num_tokens": 1939196744.0, + "step": 3794 + }, + { + "epoch": 1.0262303948080043, + "grad_norm": 1.587717890739441, + "learning_rate": 1.8589899428486287e-05, + "loss": 2.2255, + "mean_token_accuracy": 0.5184333324432373, + "num_tokens": 1939721024.0, + "step": 3795 + }, + { + "epoch": 1.026500811249324, + "grad_norm": 1.3063386678695679, + "learning_rate": 1.8589052093585357e-05, + "loss": 2.0062, + "mean_token_accuracy": 0.5445489287376404, + "num_tokens": 1940245221.0, + "step": 3796 + }, + { + "epoch": 1.0267712276906436, + "grad_norm": 1.5802271366119385, + "learning_rate": 1.8588204525829987e-05, + "loss": 2.3028, + "mean_token_accuracy": 0.49898335337638855, + "num_tokens": 1940769403.0, + "step": 3797 + }, + { + "epoch": 1.0270416441319632, + "grad_norm": 1.2623189687728882, + "learning_rate": 1.8587356725246188e-05, + "loss": 2.1386, + "mean_token_accuracy": 0.5304889678955078, + "num_tokens": 1941293514.0, + "step": 3798 + }, + { + "epoch": 1.0273120605732828, + "grad_norm": 1.6316280364990234, + "learning_rate": 1.858650869185997e-05, + "loss": 2.1243, + "mean_token_accuracy": 0.5440071821212769, + "num_tokens": 1941817769.0, + "step": 3799 + }, + { + "epoch": 1.0275824770146025, + "grad_norm": 1.3822987079620361, + "learning_rate": 1.8585660425697358e-05, + "loss": 2.1386, + "mean_token_accuracy": 0.5287947654724121, + "num_tokens": 1942342006.0, + "step": 3800 + }, + { + "epoch": 1.0278528934559221, + "grad_norm": 0.7164426445960999, + "learning_rate": 1.8584811926784374e-05, + "loss": 1.0798, + "mean_token_accuracy": 0.7119520306587219, + "num_tokens": 1942834853.0, + "step": 3801 + }, + { + "epoch": 1.0281233098972418, + "grad_norm": 2.099813461303711, + "learning_rate": 1.8583963195147056e-05, + "loss": 1.926, + "mean_token_accuracy": 0.5417894124984741, + "num_tokens": 1943359091.0, + "step": 3802 + }, + { + "epoch": 1.0283937263385614, + "grad_norm": 2.1363625526428223, + "learning_rate": 1.8583114230811447e-05, + "loss": 2.0833, + "mean_token_accuracy": 0.519741415977478, + "num_tokens": 1943883215.0, + "step": 3803 + }, + { + "epoch": 1.028664142779881, + "grad_norm": 1.5383093357086182, + "learning_rate": 1.8582265033803588e-05, + "loss": 2.1274, + "mean_token_accuracy": 0.5302051305770874, + "num_tokens": 1944407408.0, + "step": 3804 + }, + { + "epoch": 1.0289345592212007, + "grad_norm": 2.2723960876464844, + "learning_rate": 1.8581415604149542e-05, + "loss": 2.1773, + "mean_token_accuracy": 0.516339123249054, + "num_tokens": 1944931677.0, + "step": 3805 + }, + { + "epoch": 1.0292049756625203, + "grad_norm": 2.100994110107422, + "learning_rate": 1.8580565941875366e-05, + "loss": 1.9741, + "mean_token_accuracy": 0.5478913187980652, + "num_tokens": 1945455770.0, + "step": 3806 + }, + { + "epoch": 1.02947539210384, + "grad_norm": 2.0566089153289795, + "learning_rate": 1.857971604700714e-05, + "loss": 2.1264, + "mean_token_accuracy": 0.5365780591964722, + "num_tokens": 1945979938.0, + "step": 3807 + }, + { + "epoch": 1.0297458085451596, + "grad_norm": 1.8973848819732666, + "learning_rate": 1.857886591957093e-05, + "loss": 1.9052, + "mean_token_accuracy": 0.5506665706634521, + "num_tokens": 1946504213.0, + "step": 3808 + }, + { + "epoch": 1.0300162249864793, + "grad_norm": 1.8809378147125244, + "learning_rate": 1.8578015559592827e-05, + "loss": 1.9773, + "mean_token_accuracy": 0.5692790150642395, + "num_tokens": 1946989285.0, + "step": 3809 + }, + { + "epoch": 1.030286641427799, + "grad_norm": 1.5897704362869263, + "learning_rate": 1.8577164967098923e-05, + "loss": 2.0654, + "mean_token_accuracy": 0.5268670916557312, + "num_tokens": 1947471231.0, + "step": 3810 + }, + { + "epoch": 1.0305570578691183, + "grad_norm": 1.4911311864852905, + "learning_rate": 1.8576314142115315e-05, + "loss": 2.1773, + "mean_token_accuracy": 0.5406098365783691, + "num_tokens": 1947976838.0, + "step": 3811 + }, + { + "epoch": 1.030827474310438, + "grad_norm": 1.5987876653671265, + "learning_rate": 1.8575463084668106e-05, + "loss": 1.9563, + "mean_token_accuracy": 0.5451842546463013, + "num_tokens": 1948501108.0, + "step": 3812 + }, + { + "epoch": 1.0310978907517576, + "grad_norm": 1.7853114604949951, + "learning_rate": 1.8574611794783415e-05, + "loss": 2.2408, + "mean_token_accuracy": 0.5133589506149292, + "num_tokens": 1949025382.0, + "step": 3813 + }, + { + "epoch": 1.0313683071930773, + "grad_norm": 1.3009036779403687, + "learning_rate": 1.8573760272487357e-05, + "loss": 2.1041, + "mean_token_accuracy": 0.5186773538589478, + "num_tokens": 1949538380.0, + "step": 3814 + }, + { + "epoch": 1.031638723634397, + "grad_norm": 1.698317527770996, + "learning_rate": 1.857290851780606e-05, + "loss": 2.0975, + "mean_token_accuracy": 0.5508068203926086, + "num_tokens": 1950062632.0, + "step": 3815 + }, + { + "epoch": 1.0319091400757165, + "grad_norm": 1.5905399322509766, + "learning_rate": 1.857205653076566e-05, + "loss": 2.0808, + "mean_token_accuracy": 0.5524230599403381, + "num_tokens": 1950563407.0, + "step": 3816 + }, + { + "epoch": 1.0321795565170362, + "grad_norm": 1.7888474464416504, + "learning_rate": 1.8571204311392297e-05, + "loss": 2.214, + "mean_token_accuracy": 0.5280987620353699, + "num_tokens": 1951087620.0, + "step": 3817 + }, + { + "epoch": 1.0324499729583558, + "grad_norm": 1.765399694442749, + "learning_rate": 1.8570351859712123e-05, + "loss": 2.1881, + "mean_token_accuracy": 0.5354629755020142, + "num_tokens": 1951611883.0, + "step": 3818 + }, + { + "epoch": 1.0327203893996755, + "grad_norm": 1.362446904182434, + "learning_rate": 1.8569499175751286e-05, + "loss": 2.0932, + "mean_token_accuracy": 0.5239291787147522, + "num_tokens": 1952130003.0, + "step": 3819 + }, + { + "epoch": 1.0329908058409951, + "grad_norm": 1.4804182052612305, + "learning_rate": 1.8568646259535956e-05, + "loss": 2.1179, + "mean_token_accuracy": 0.5197316408157349, + "num_tokens": 1952654268.0, + "step": 3820 + }, + { + "epoch": 1.0332612222823148, + "grad_norm": 0.7444384694099426, + "learning_rate": 1.85677931110923e-05, + "loss": 1.187, + "mean_token_accuracy": 0.6940256357192993, + "num_tokens": 1953178535.0, + "step": 3821 + }, + { + "epoch": 1.0335316387236344, + "grad_norm": 1.7245538234710693, + "learning_rate": 1.85669397304465e-05, + "loss": 2.0882, + "mean_token_accuracy": 0.5493342876434326, + "num_tokens": 1953638506.0, + "step": 3822 + }, + { + "epoch": 1.033802055164954, + "grad_norm": 1.3147733211517334, + "learning_rate": 1.8566086117624735e-05, + "loss": 2.1049, + "mean_token_accuracy": 0.5198502540588379, + "num_tokens": 1954162614.0, + "step": 3823 + }, + { + "epoch": 1.0340724716062737, + "grad_norm": 1.1672083139419556, + "learning_rate": 1.8565232272653194e-05, + "loss": 1.8499, + "mean_token_accuracy": 0.5995321273803711, + "num_tokens": 1954621390.0, + "step": 3824 + }, + { + "epoch": 1.0343428880475933, + "grad_norm": 1.2745544910430908, + "learning_rate": 1.8564378195558082e-05, + "loss": 2.002, + "mean_token_accuracy": 0.5466207265853882, + "num_tokens": 1955085655.0, + "step": 3825 + }, + { + "epoch": 1.034613304488913, + "grad_norm": 1.2464052438735962, + "learning_rate": 1.85635238863656e-05, + "loss": 2.1152, + "mean_token_accuracy": 0.5243855714797974, + "num_tokens": 1955609924.0, + "step": 3826 + }, + { + "epoch": 1.0348837209302326, + "grad_norm": 1.3054287433624268, + "learning_rate": 1.856266934510196e-05, + "loss": 1.9428, + "mean_token_accuracy": 0.5614868402481079, + "num_tokens": 1956095936.0, + "step": 3827 + }, + { + "epoch": 1.0351541373715523, + "grad_norm": 1.3118896484375, + "learning_rate": 1.856181457179339e-05, + "loss": 2.101, + "mean_token_accuracy": 0.5177321434020996, + "num_tokens": 1956620143.0, + "step": 3828 + }, + { + "epoch": 1.035424553812872, + "grad_norm": 1.2607150077819824, + "learning_rate": 1.8560959566466106e-05, + "loss": 1.9204, + "mean_token_accuracy": 0.5536612868309021, + "num_tokens": 1957140747.0, + "step": 3829 + }, + { + "epoch": 1.0356949702541915, + "grad_norm": 1.3806368112564087, + "learning_rate": 1.856010432914635e-05, + "loss": 2.0639, + "mean_token_accuracy": 0.5447561740875244, + "num_tokens": 1957664843.0, + "step": 3830 + }, + { + "epoch": 1.0359653866955112, + "grad_norm": 1.3630203008651733, + "learning_rate": 1.8559248859860358e-05, + "loss": 2.1054, + "mean_token_accuracy": 0.5397785305976868, + "num_tokens": 1958188968.0, + "step": 3831 + }, + { + "epoch": 1.0362358031368306, + "grad_norm": 1.4004578590393066, + "learning_rate": 1.8558393158634386e-05, + "loss": 2.0928, + "mean_token_accuracy": 0.5231291651725769, + "num_tokens": 1958713243.0, + "step": 3832 + }, + { + "epoch": 1.0365062195781503, + "grad_norm": 1.5954504013061523, + "learning_rate": 1.855753722549468e-05, + "loss": 2.1882, + "mean_token_accuracy": 0.5270717144012451, + "num_tokens": 1959197236.0, + "step": 3833 + }, + { + "epoch": 1.03677663601947, + "grad_norm": 1.3156826496124268, + "learning_rate": 1.8556681060467506e-05, + "loss": 2.0582, + "mean_token_accuracy": 0.532873809337616, + "num_tokens": 1959721368.0, + "step": 3834 + }, + { + "epoch": 1.0370470524607895, + "grad_norm": 1.636793851852417, + "learning_rate": 1.8555824663579135e-05, + "loss": 2.1558, + "mean_token_accuracy": 0.5378515720367432, + "num_tokens": 1960245620.0, + "step": 3835 + }, + { + "epoch": 1.0373174689021092, + "grad_norm": 1.6081368923187256, + "learning_rate": 1.8554968034855843e-05, + "loss": 2.1197, + "mean_token_accuracy": 0.530788004398346, + "num_tokens": 1960769837.0, + "step": 3836 + }, + { + "epoch": 1.0375878853434288, + "grad_norm": 1.3579654693603516, + "learning_rate": 1.8554111174323913e-05, + "loss": 2.1819, + "mean_token_accuracy": 0.5257223844528198, + "num_tokens": 1961287700.0, + "step": 3837 + }, + { + "epoch": 1.0378583017847485, + "grad_norm": 1.6551029682159424, + "learning_rate": 1.8553254082009637e-05, + "loss": 2.1537, + "mean_token_accuracy": 0.5140513181686401, + "num_tokens": 1961811824.0, + "step": 3838 + }, + { + "epoch": 1.0381287182260681, + "grad_norm": 1.4502136707305908, + "learning_rate": 1.8552396757939316e-05, + "loss": 2.1551, + "mean_token_accuracy": 0.5093422532081604, + "num_tokens": 1962335932.0, + "step": 3839 + }, + { + "epoch": 1.0383991346673878, + "grad_norm": 1.606992483139038, + "learning_rate": 1.855153920213925e-05, + "loss": 2.1959, + "mean_token_accuracy": 0.523120641708374, + "num_tokens": 1962803307.0, + "step": 3840 + }, + { + "epoch": 1.0386695511087074, + "grad_norm": 0.9382590055465698, + "learning_rate": 1.8550681414635756e-05, + "loss": 1.1748, + "mean_token_accuracy": 0.6774864196777344, + "num_tokens": 1963327557.0, + "step": 3841 + }, + { + "epoch": 1.038939967550027, + "grad_norm": 1.972917914390564, + "learning_rate": 1.854982339545515e-05, + "loss": 2.0171, + "mean_token_accuracy": 0.5389298796653748, + "num_tokens": 1963813645.0, + "step": 3842 + }, + { + "epoch": 1.0392103839913467, + "grad_norm": 2.041486978530884, + "learning_rate": 1.854896514462376e-05, + "loss": 2.0324, + "mean_token_accuracy": 0.5443378686904907, + "num_tokens": 1964330513.0, + "step": 3843 + }, + { + "epoch": 1.0394808004326663, + "grad_norm": 1.2048583030700684, + "learning_rate": 1.8548106662167914e-05, + "loss": 2.1922, + "mean_token_accuracy": 0.5192128419876099, + "num_tokens": 1964834174.0, + "step": 3844 + }, + { + "epoch": 1.039751216873986, + "grad_norm": 1.5927433967590332, + "learning_rate": 1.8547247948113966e-05, + "loss": 2.0426, + "mean_token_accuracy": 0.542858362197876, + "num_tokens": 1965358352.0, + "step": 3845 + }, + { + "epoch": 1.0400216333153056, + "grad_norm": 1.6268919706344604, + "learning_rate": 1.854638900248825e-05, + "loss": 2.1689, + "mean_token_accuracy": 0.535344123840332, + "num_tokens": 1965863905.0, + "step": 3846 + }, + { + "epoch": 1.0402920497566253, + "grad_norm": 1.4229261875152588, + "learning_rate": 1.854552982531713e-05, + "loss": 1.9998, + "mean_token_accuracy": 0.5395522117614746, + "num_tokens": 1966388158.0, + "step": 3847 + }, + { + "epoch": 1.040562466197945, + "grad_norm": 1.1427239179611206, + "learning_rate": 1.8544670416626965e-05, + "loss": 2.1211, + "mean_token_accuracy": 0.521443247795105, + "num_tokens": 1966912431.0, + "step": 3848 + }, + { + "epoch": 1.0408328826392645, + "grad_norm": 1.5855211019515991, + "learning_rate": 1.8543810776444123e-05, + "loss": 1.9617, + "mean_token_accuracy": 0.5463341474533081, + "num_tokens": 1967436656.0, + "step": 3849 + }, + { + "epoch": 1.0411032990805842, + "grad_norm": 1.563759684562683, + "learning_rate": 1.8542950904794982e-05, + "loss": 2.135, + "mean_token_accuracy": 0.533219039440155, + "num_tokens": 1967960837.0, + "step": 3850 + }, + { + "epoch": 1.0413737155219038, + "grad_norm": 1.511131763458252, + "learning_rate": 1.8542090801705926e-05, + "loss": 2.0738, + "mean_token_accuracy": 0.5369688272476196, + "num_tokens": 1968485010.0, + "step": 3851 + }, + { + "epoch": 1.0416441319632233, + "grad_norm": 1.8504674434661865, + "learning_rate": 1.8541230467203343e-05, + "loss": 2.0374, + "mean_token_accuracy": 0.5553141832351685, + "num_tokens": 1969009209.0, + "step": 3852 + }, + { + "epoch": 1.041914548404543, + "grad_norm": 1.6348533630371094, + "learning_rate": 1.8540369901313632e-05, + "loss": 2.0464, + "mean_token_accuracy": 0.536922812461853, + "num_tokens": 1969533484.0, + "step": 3853 + }, + { + "epoch": 1.0421849648458625, + "grad_norm": 1.9044266939163208, + "learning_rate": 1.8539509104063197e-05, + "loss": 2.1858, + "mean_token_accuracy": 0.5214473605155945, + "num_tokens": 1970029406.0, + "step": 3854 + }, + { + "epoch": 1.0424553812871822, + "grad_norm": 2.0458731651306152, + "learning_rate": 1.853864807547845e-05, + "loss": 2.1822, + "mean_token_accuracy": 0.5235764980316162, + "num_tokens": 1970553670.0, + "step": 3855 + }, + { + "epoch": 1.0427257977285018, + "grad_norm": 1.9083266258239746, + "learning_rate": 1.8537786815585815e-05, + "loss": 1.8528, + "mean_token_accuracy": 0.5889562368392944, + "num_tokens": 1971017370.0, + "step": 3856 + }, + { + "epoch": 1.0429962141698215, + "grad_norm": 1.8322556018829346, + "learning_rate": 1.8536925324411708e-05, + "loss": 1.9567, + "mean_token_accuracy": 0.5548378825187683, + "num_tokens": 1971487881.0, + "step": 3857 + }, + { + "epoch": 1.043266630611141, + "grad_norm": 3.811253786087036, + "learning_rate": 1.853606360198257e-05, + "loss": 1.9429, + "mean_token_accuracy": 0.5638374090194702, + "num_tokens": 1971974800.0, + "step": 3858 + }, + { + "epoch": 1.0435370470524608, + "grad_norm": 2.0729212760925293, + "learning_rate": 1.853520164832484e-05, + "loss": 2.0362, + "mean_token_accuracy": 0.555515468120575, + "num_tokens": 1972499048.0, + "step": 3859 + }, + { + "epoch": 1.0438074634937804, + "grad_norm": 1.703709602355957, + "learning_rate": 1.853433946346496e-05, + "loss": 2.1428, + "mean_token_accuracy": 0.5314841270446777, + "num_tokens": 1973023219.0, + "step": 3860 + }, + { + "epoch": 1.0440778799351, + "grad_norm": 0.5711586475372314, + "learning_rate": 1.8533477047429388e-05, + "loss": 1.2041, + "mean_token_accuracy": 0.6685306429862976, + "num_tokens": 1973547397.0, + "step": 3861 + }, + { + "epoch": 1.0443482963764197, + "grad_norm": 2.5494699478149414, + "learning_rate": 1.853261440024459e-05, + "loss": 2.0109, + "mean_token_accuracy": 0.5302548408508301, + "num_tokens": 1974071541.0, + "step": 3862 + }, + { + "epoch": 1.0446187128177393, + "grad_norm": 2.307377576828003, + "learning_rate": 1.8531751521937023e-05, + "loss": 2.1333, + "mean_token_accuracy": 0.5219815373420715, + "num_tokens": 1974568369.0, + "step": 3863 + }, + { + "epoch": 1.044889129259059, + "grad_norm": 1.7868235111236572, + "learning_rate": 1.8530888412533177e-05, + "loss": 2.1315, + "mean_token_accuracy": 0.5342113375663757, + "num_tokens": 1975092601.0, + "step": 3864 + }, + { + "epoch": 1.0451595457003786, + "grad_norm": 1.9486886262893677, + "learning_rate": 1.8530025072059524e-05, + "loss": 2.1343, + "mean_token_accuracy": 0.5300981402397156, + "num_tokens": 1975606459.0, + "step": 3865 + }, + { + "epoch": 1.0454299621416983, + "grad_norm": 1.8206663131713867, + "learning_rate": 1.8529161500542562e-05, + "loss": 1.9887, + "mean_token_accuracy": 0.5522982478141785, + "num_tokens": 1976130704.0, + "step": 3866 + }, + { + "epoch": 1.045700378583018, + "grad_norm": 1.7479143142700195, + "learning_rate": 1.8528297698008778e-05, + "loss": 2.1659, + "mean_token_accuracy": 0.5225772261619568, + "num_tokens": 1976654985.0, + "step": 3867 + }, + { + "epoch": 1.0459707950243375, + "grad_norm": 1.6936209201812744, + "learning_rate": 1.8527433664484684e-05, + "loss": 2.0359, + "mean_token_accuracy": 0.5381797552108765, + "num_tokens": 1977179038.0, + "step": 3868 + }, + { + "epoch": 1.0462412114656572, + "grad_norm": 1.8802125453948975, + "learning_rate": 1.8526569399996786e-05, + "loss": 2.0518, + "mean_token_accuracy": 0.551552414894104, + "num_tokens": 1977671984.0, + "step": 3869 + }, + { + "epoch": 1.0465116279069768, + "grad_norm": 1.536742091178894, + "learning_rate": 1.852570490457161e-05, + "loss": 1.9013, + "mean_token_accuracy": 0.5669217109680176, + "num_tokens": 1978139429.0, + "step": 3870 + }, + { + "epoch": 1.0467820443482965, + "grad_norm": 1.661218523979187, + "learning_rate": 1.8524840178235673e-05, + "loss": 2.1511, + "mean_token_accuracy": 0.5407922267913818, + "num_tokens": 1978657877.0, + "step": 3871 + }, + { + "epoch": 1.0470524607896161, + "grad_norm": 22.325483322143555, + "learning_rate": 1.852397522101551e-05, + "loss": 1.9094, + "mean_token_accuracy": 0.5691879987716675, + "num_tokens": 1979182053.0, + "step": 3872 + }, + { + "epoch": 1.0473228772309355, + "grad_norm": 2.524433135986328, + "learning_rate": 1.852311003293766e-05, + "loss": 1.9103, + "mean_token_accuracy": 0.5792233347892761, + "num_tokens": 1979706066.0, + "step": 3873 + }, + { + "epoch": 1.0475932936722552, + "grad_norm": 2.14388370513916, + "learning_rate": 1.852224461402867e-05, + "loss": 1.7115, + "mean_token_accuracy": 0.5975438356399536, + "num_tokens": 1980192517.0, + "step": 3874 + }, + { + "epoch": 1.0478637101135748, + "grad_norm": 1.7179341316223145, + "learning_rate": 1.8521378964315093e-05, + "loss": 2.1686, + "mean_token_accuracy": 0.5258005857467651, + "num_tokens": 1980716797.0, + "step": 3875 + }, + { + "epoch": 1.0481341265548945, + "grad_norm": 1.9858949184417725, + "learning_rate": 1.852051308382349e-05, + "loss": 2.0822, + "mean_token_accuracy": 0.5279009938240051, + "num_tokens": 1981240896.0, + "step": 3876 + }, + { + "epoch": 1.048404542996214, + "grad_norm": 1.6876753568649292, + "learning_rate": 1.8519646972580432e-05, + "loss": 1.9949, + "mean_token_accuracy": 0.566460371017456, + "num_tokens": 1981702547.0, + "step": 3877 + }, + { + "epoch": 1.0486749594375337, + "grad_norm": 1.6735280752182007, + "learning_rate": 1.8518780630612487e-05, + "loss": 2.1814, + "mean_token_accuracy": 0.5163631439208984, + "num_tokens": 1982201124.0, + "step": 3878 + }, + { + "epoch": 1.0489453758788534, + "grad_norm": 1.6091524362564087, + "learning_rate": 1.8517914057946248e-05, + "loss": 2.168, + "mean_token_accuracy": 0.5235211849212646, + "num_tokens": 1982725242.0, + "step": 3879 + }, + { + "epoch": 1.049215792320173, + "grad_norm": 1.3230702877044678, + "learning_rate": 1.8517047254608293e-05, + "loss": 2.1085, + "mean_token_accuracy": 0.528907835483551, + "num_tokens": 1983249490.0, + "step": 3880 + }, + { + "epoch": 1.0494862087614927, + "grad_norm": 0.7627261281013489, + "learning_rate": 1.851618022062522e-05, + "loss": 1.1337, + "mean_token_accuracy": 0.7080608010292053, + "num_tokens": 1983773753.0, + "step": 3881 + }, + { + "epoch": 1.0497566252028123, + "grad_norm": 2.6976208686828613, + "learning_rate": 1.8515312956023636e-05, + "loss": 2.119, + "mean_token_accuracy": 0.539010763168335, + "num_tokens": 1984280180.0, + "step": 3882 + }, + { + "epoch": 1.050027041644132, + "grad_norm": 1.7662192583084106, + "learning_rate": 1.8514445460830152e-05, + "loss": 2.0985, + "mean_token_accuracy": 0.5300300717353821, + "num_tokens": 1984804420.0, + "step": 3883 + }, + { + "epoch": 1.0502974580854516, + "grad_norm": 1.5587130784988403, + "learning_rate": 1.8513577735071376e-05, + "loss": 2.0008, + "mean_token_accuracy": 0.5441229343414307, + "num_tokens": 1985328703.0, + "step": 3884 + }, + { + "epoch": 1.0505678745267713, + "grad_norm": 1.9308193922042847, + "learning_rate": 1.8512709778773944e-05, + "loss": 2.0459, + "mean_token_accuracy": 0.5499798655509949, + "num_tokens": 1985852841.0, + "step": 3885 + }, + { + "epoch": 1.050838290968091, + "grad_norm": 1.922317624092102, + "learning_rate": 1.8511841591964483e-05, + "loss": 2.1538, + "mean_token_accuracy": 0.5203615427017212, + "num_tokens": 1986377057.0, + "step": 3886 + }, + { + "epoch": 1.0511087074094105, + "grad_norm": 1.4348732233047485, + "learning_rate": 1.8510973174669628e-05, + "loss": 2.0962, + "mean_token_accuracy": 0.5357657074928284, + "num_tokens": 1986901327.0, + "step": 3887 + }, + { + "epoch": 1.0513791238507302, + "grad_norm": 1.9310517311096191, + "learning_rate": 1.8510104526916027e-05, + "loss": 2.0781, + "mean_token_accuracy": 0.5483993291854858, + "num_tokens": 1987317112.0, + "step": 3888 + }, + { + "epoch": 1.0516495402920498, + "grad_norm": 1.5555098056793213, + "learning_rate": 1.850923564873034e-05, + "loss": 2.0533, + "mean_token_accuracy": 0.5180134177207947, + "num_tokens": 1987841379.0, + "step": 3889 + }, + { + "epoch": 1.0519199567333695, + "grad_norm": 1.315827488899231, + "learning_rate": 1.8508366540139213e-05, + "loss": 2.0484, + "mean_token_accuracy": 0.5371809005737305, + "num_tokens": 1988365504.0, + "step": 3890 + }, + { + "epoch": 1.052190373174689, + "grad_norm": 1.7128263711929321, + "learning_rate": 1.8507497201169326e-05, + "loss": 2.149, + "mean_token_accuracy": 0.5292686223983765, + "num_tokens": 1988859604.0, + "step": 3891 + }, + { + "epoch": 1.0524607896160088, + "grad_norm": 1.593693494796753, + "learning_rate": 1.850662763184734e-05, + "loss": 2.0203, + "mean_token_accuracy": 0.551680862903595, + "num_tokens": 1989326408.0, + "step": 3892 + }, + { + "epoch": 1.0527312060573282, + "grad_norm": 4.67126989364624, + "learning_rate": 1.8505757832199947e-05, + "loss": 1.7784, + "mean_token_accuracy": 0.5926428437232971, + "num_tokens": 1989829055.0, + "step": 3893 + }, + { + "epoch": 1.0530016224986478, + "grad_norm": 2.5741095542907715, + "learning_rate": 1.850488780225383e-05, + "loss": 2.1226, + "mean_token_accuracy": 0.5383256077766418, + "num_tokens": 1990353166.0, + "step": 3894 + }, + { + "epoch": 1.0532720389399675, + "grad_norm": 1.8846485614776611, + "learning_rate": 1.8504017542035686e-05, + "loss": 2.0526, + "mean_token_accuracy": 0.5303981900215149, + "num_tokens": 1990865070.0, + "step": 3895 + }, + { + "epoch": 1.053542455381287, + "grad_norm": 1.433171033859253, + "learning_rate": 1.8503147051572215e-05, + "loss": 2.0726, + "mean_token_accuracy": 0.5419377088546753, + "num_tokens": 1991389258.0, + "step": 3896 + }, + { + "epoch": 1.0538128718226067, + "grad_norm": 1.4687491655349731, + "learning_rate": 1.8502276330890126e-05, + "loss": 2.0315, + "mean_token_accuracy": 0.5336191058158875, + "num_tokens": 1991913478.0, + "step": 3897 + }, + { + "epoch": 1.0540832882639264, + "grad_norm": 1.6540155410766602, + "learning_rate": 1.8501405380016137e-05, + "loss": 2.2007, + "mean_token_accuracy": 0.5280395746231079, + "num_tokens": 1992437734.0, + "step": 3898 + }, + { + "epoch": 1.054353704705246, + "grad_norm": 1.4290586709976196, + "learning_rate": 1.850053419897697e-05, + "loss": 2.0124, + "mean_token_accuracy": 0.5515873432159424, + "num_tokens": 1992961480.0, + "step": 3899 + }, + { + "epoch": 1.0546241211465657, + "grad_norm": 1.513973593711853, + "learning_rate": 1.8499662787799355e-05, + "loss": 2.1266, + "mean_token_accuracy": 0.543893575668335, + "num_tokens": 1993485571.0, + "step": 3900 + }, + { + "epoch": 1.0548945375878853, + "grad_norm": 0.6644322872161865, + "learning_rate": 1.8498791146510034e-05, + "loss": 1.1447, + "mean_token_accuracy": 0.7112492918968201, + "num_tokens": 1994009773.0, + "step": 3901 + }, + { + "epoch": 1.055164954029205, + "grad_norm": 2.3466720581054688, + "learning_rate": 1.849791927513575e-05, + "loss": 2.1222, + "mean_token_accuracy": 0.5363713502883911, + "num_tokens": 1994503993.0, + "step": 3902 + }, + { + "epoch": 1.0554353704705246, + "grad_norm": 2.2204954624176025, + "learning_rate": 1.8497047173703247e-05, + "loss": 2.0042, + "mean_token_accuracy": 0.5457782745361328, + "num_tokens": 1995028259.0, + "step": 3903 + }, + { + "epoch": 1.0557057869118442, + "grad_norm": 1.3075557947158813, + "learning_rate": 1.8496174842239293e-05, + "loss": 1.9912, + "mean_token_accuracy": 0.5444789528846741, + "num_tokens": 1995552455.0, + "step": 3904 + }, + { + "epoch": 1.055976203353164, + "grad_norm": 1.4403730630874634, + "learning_rate": 1.8495302280770647e-05, + "loss": 2.051, + "mean_token_accuracy": 0.541527271270752, + "num_tokens": 1996076576.0, + "step": 3905 + }, + { + "epoch": 1.0562466197944835, + "grad_norm": 1.6071337461471558, + "learning_rate": 1.849442948932409e-05, + "loss": 2.0561, + "mean_token_accuracy": 0.5398750901222229, + "num_tokens": 1996582121.0, + "step": 3906 + }, + { + "epoch": 1.0565170362358032, + "grad_norm": 1.5787544250488281, + "learning_rate": 1.849355646792639e-05, + "loss": 2.1378, + "mean_token_accuracy": 0.5424847602844238, + "num_tokens": 1997082319.0, + "step": 3907 + }, + { + "epoch": 1.0567874526771228, + "grad_norm": 1.716339111328125, + "learning_rate": 1.8492683216604346e-05, + "loss": 2.1124, + "mean_token_accuracy": 0.5344662666320801, + "num_tokens": 1997606601.0, + "step": 3908 + }, + { + "epoch": 1.0570578691184425, + "grad_norm": 1.5124199390411377, + "learning_rate": 1.849180973538474e-05, + "loss": 2.0641, + "mean_token_accuracy": 0.5458939075469971, + "num_tokens": 1998124722.0, + "step": 3909 + }, + { + "epoch": 1.057328285559762, + "grad_norm": 1.47077214717865, + "learning_rate": 1.8490936024294387e-05, + "loss": 2.1617, + "mean_token_accuracy": 0.5248521566390991, + "num_tokens": 1998648866.0, + "step": 3910 + }, + { + "epoch": 1.0575987020010817, + "grad_norm": 1.6913357973098755, + "learning_rate": 1.8490062083360084e-05, + "loss": 2.0277, + "mean_token_accuracy": 0.5521259307861328, + "num_tokens": 1999172929.0, + "step": 3911 + }, + { + "epoch": 1.0578691184424014, + "grad_norm": 1.8330328464508057, + "learning_rate": 1.848918791260865e-05, + "loss": 2.2373, + "mean_token_accuracy": 0.5013054609298706, + "num_tokens": 1999697018.0, + "step": 3912 + }, + { + "epoch": 1.058139534883721, + "grad_norm": 1.8474228382110596, + "learning_rate": 1.8488313512066903e-05, + "loss": 2.1407, + "mean_token_accuracy": 0.5438550710678101, + "num_tokens": 2000189918.0, + "step": 3913 + }, + { + "epoch": 1.0584099513250407, + "grad_norm": 1.5343825817108154, + "learning_rate": 1.848743888176168e-05, + "loss": 2.2065, + "mean_token_accuracy": 0.5378494262695312, + "num_tokens": 2000653553.0, + "step": 3914 + }, + { + "epoch": 1.05868036776636, + "grad_norm": 1.3209118843078613, + "learning_rate": 1.8486564021719814e-05, + "loss": 2.029, + "mean_token_accuracy": 0.5446265935897827, + "num_tokens": 2001143388.0, + "step": 3915 + }, + { + "epoch": 1.0589507842076797, + "grad_norm": 1.638019323348999, + "learning_rate": 1.848568893196814e-05, + "loss": 2.1966, + "mean_token_accuracy": 0.518024206161499, + "num_tokens": 2001667655.0, + "step": 3916 + }, + { + "epoch": 1.0592212006489994, + "grad_norm": 1.5421701669692993, + "learning_rate": 1.848481361253352e-05, + "loss": 2.1039, + "mean_token_accuracy": 0.5605484247207642, + "num_tokens": 2002191935.0, + "step": 3917 + }, + { + "epoch": 1.059491617090319, + "grad_norm": 1.211029291152954, + "learning_rate": 1.8483938063442807e-05, + "loss": 2.0702, + "mean_token_accuracy": 0.5274901390075684, + "num_tokens": 2002716208.0, + "step": 3918 + }, + { + "epoch": 1.0597620335316387, + "grad_norm": 1.5181788206100464, + "learning_rate": 1.8483062284722866e-05, + "loss": 2.0043, + "mean_token_accuracy": 0.550290584564209, + "num_tokens": 2003240474.0, + "step": 3919 + }, + { + "epoch": 1.0600324499729583, + "grad_norm": 1.6558188199996948, + "learning_rate": 1.8482186276400568e-05, + "loss": 2.0877, + "mean_token_accuracy": 0.5391985774040222, + "num_tokens": 2003764726.0, + "step": 3920 + }, + { + "epoch": 1.060302866414278, + "grad_norm": 0.8668359518051147, + "learning_rate": 1.848131003850279e-05, + "loss": 1.2767, + "mean_token_accuracy": 0.6567175984382629, + "num_tokens": 2004288918.0, + "step": 3921 + }, + { + "epoch": 1.0605732828555976, + "grad_norm": 2.149980306625366, + "learning_rate": 1.8480433571056423e-05, + "loss": 2.1692, + "mean_token_accuracy": 0.521973729133606, + "num_tokens": 2004813130.0, + "step": 3922 + }, + { + "epoch": 1.0608436992969172, + "grad_norm": 1.9767472743988037, + "learning_rate": 1.847955687408835e-05, + "loss": 1.9885, + "mean_token_accuracy": 0.5502211451530457, + "num_tokens": 2005337236.0, + "step": 3923 + }, + { + "epoch": 1.0611141157382369, + "grad_norm": 1.1750410795211792, + "learning_rate": 1.8478679947625478e-05, + "loss": 2.1232, + "mean_token_accuracy": 0.5315382480621338, + "num_tokens": 2005847809.0, + "step": 3924 + }, + { + "epoch": 1.0613845321795565, + "grad_norm": 1.5220218896865845, + "learning_rate": 1.8477802791694713e-05, + "loss": 1.9451, + "mean_token_accuracy": 0.552559494972229, + "num_tokens": 2006372085.0, + "step": 3925 + }, + { + "epoch": 1.0616549486208762, + "grad_norm": 1.6766036748886108, + "learning_rate": 1.847692540632297e-05, + "loss": 1.9827, + "mean_token_accuracy": 0.5463714599609375, + "num_tokens": 2006896328.0, + "step": 3926 + }, + { + "epoch": 1.0619253650621958, + "grad_norm": 1.7658723592758179, + "learning_rate": 1.847604779153716e-05, + "loss": 2.1707, + "mean_token_accuracy": 0.5271289944648743, + "num_tokens": 2007420549.0, + "step": 3927 + }, + { + "epoch": 1.0621957815035155, + "grad_norm": 1.6440123319625854, + "learning_rate": 1.8475169947364224e-05, + "loss": 1.9398, + "mean_token_accuracy": 0.5574942231178284, + "num_tokens": 2007944726.0, + "step": 3928 + }, + { + "epoch": 1.062466197944835, + "grad_norm": 1.6810427904129028, + "learning_rate": 1.847429187383109e-05, + "loss": 2.0918, + "mean_token_accuracy": 0.5173981785774231, + "num_tokens": 2008468857.0, + "step": 3929 + }, + { + "epoch": 1.0627366143861547, + "grad_norm": 1.5830179452896118, + "learning_rate": 1.84734135709647e-05, + "loss": 2.1435, + "mean_token_accuracy": 0.5269029140472412, + "num_tokens": 2008993129.0, + "step": 3930 + }, + { + "epoch": 1.0630070308274744, + "grad_norm": 2.06785249710083, + "learning_rate": 1.8472535038792004e-05, + "loss": 2.1532, + "mean_token_accuracy": 0.5147174596786499, + "num_tokens": 2009517353.0, + "step": 3931 + }, + { + "epoch": 1.063277447268794, + "grad_norm": 1.8896684646606445, + "learning_rate": 1.8471656277339957e-05, + "loss": 2.2129, + "mean_token_accuracy": 0.5317245721817017, + "num_tokens": 2010041630.0, + "step": 3932 + }, + { + "epoch": 1.0635478637101137, + "grad_norm": 1.6083009243011475, + "learning_rate": 1.8470777286635522e-05, + "loss": 2.148, + "mean_token_accuracy": 0.502208411693573, + "num_tokens": 2010565840.0, + "step": 3933 + }, + { + "epoch": 1.063818280151433, + "grad_norm": 1.8855719566345215, + "learning_rate": 1.8469898066705673e-05, + "loss": 2.1224, + "mean_token_accuracy": 0.5253108739852905, + "num_tokens": 2011090094.0, + "step": 3934 + }, + { + "epoch": 1.0640886965927527, + "grad_norm": 1.5406029224395752, + "learning_rate": 1.846901861757738e-05, + "loss": 2.0486, + "mean_token_accuracy": 0.526209831237793, + "num_tokens": 2011614343.0, + "step": 3935 + }, + { + "epoch": 1.0643591130340724, + "grad_norm": 1.6852301359176636, + "learning_rate": 1.8468138939277635e-05, + "loss": 2.1496, + "mean_token_accuracy": 0.5305800437927246, + "num_tokens": 2012138520.0, + "step": 3936 + }, + { + "epoch": 1.064629529475392, + "grad_norm": 2.35032057762146, + "learning_rate": 1.8467259031833422e-05, + "loss": 2.1814, + "mean_token_accuracy": 0.5299127697944641, + "num_tokens": 2012662734.0, + "step": 3937 + }, + { + "epoch": 1.0648999459167117, + "grad_norm": 1.826833963394165, + "learning_rate": 1.846637889527174e-05, + "loss": 2.0239, + "mean_token_accuracy": 0.5325219035148621, + "num_tokens": 2013186938.0, + "step": 3938 + }, + { + "epoch": 1.0651703623580313, + "grad_norm": 1.280405879020691, + "learning_rate": 1.8465498529619604e-05, + "loss": 2.139, + "mean_token_accuracy": 0.5263437032699585, + "num_tokens": 2013698388.0, + "step": 3939 + }, + { + "epoch": 1.065440778799351, + "grad_norm": 1.5288448333740234, + "learning_rate": 1.8464617934904012e-05, + "loss": 2.0532, + "mean_token_accuracy": 0.544633150100708, + "num_tokens": 2014222631.0, + "step": 3940 + }, + { + "epoch": 1.0657111952406706, + "grad_norm": 0.9893345236778259, + "learning_rate": 1.8463737111151992e-05, + "loss": 1.1563, + "mean_token_accuracy": 0.6926689743995667, + "num_tokens": 2014746647.0, + "step": 3941 + }, + { + "epoch": 1.0659816116819902, + "grad_norm": 2.725172996520996, + "learning_rate": 1.8462856058390567e-05, + "loss": 2.1832, + "mean_token_accuracy": 0.5312942862510681, + "num_tokens": 2015214382.0, + "step": 3942 + }, + { + "epoch": 1.0662520281233099, + "grad_norm": 1.9431852102279663, + "learning_rate": 1.8461974776646772e-05, + "loss": 2.2418, + "mean_token_accuracy": 0.5062038898468018, + "num_tokens": 2015738664.0, + "step": 3943 + }, + { + "epoch": 1.0665224445646295, + "grad_norm": 1.7228134870529175, + "learning_rate": 1.8461093265947643e-05, + "loss": 2.0879, + "mean_token_accuracy": 0.538092851638794, + "num_tokens": 2016262645.0, + "step": 3944 + }, + { + "epoch": 1.0667928610059492, + "grad_norm": 2.3176112174987793, + "learning_rate": 1.8460211526320236e-05, + "loss": 2.0536, + "mean_token_accuracy": 0.556132435798645, + "num_tokens": 2016786757.0, + "step": 3945 + }, + { + "epoch": 1.0670632774472688, + "grad_norm": 1.2521963119506836, + "learning_rate": 1.84593295577916e-05, + "loss": 1.8849, + "mean_token_accuracy": 0.588538646697998, + "num_tokens": 2017310940.0, + "step": 3946 + }, + { + "epoch": 1.0673336938885885, + "grad_norm": 1.4634233713150024, + "learning_rate": 1.8458447360388793e-05, + "loss": 1.9136, + "mean_token_accuracy": 0.5624656677246094, + "num_tokens": 2017835096.0, + "step": 3947 + }, + { + "epoch": 1.067604110329908, + "grad_norm": 1.5299023389816284, + "learning_rate": 1.845756493413889e-05, + "loss": 2.0245, + "mean_token_accuracy": 0.5451210141181946, + "num_tokens": 2018359243.0, + "step": 3948 + }, + { + "epoch": 1.0678745267712277, + "grad_norm": 1.6099389791488647, + "learning_rate": 1.8456682279068963e-05, + "loss": 2.0992, + "mean_token_accuracy": 0.5401042699813843, + "num_tokens": 2018883423.0, + "step": 3949 + }, + { + "epoch": 1.0681449432125474, + "grad_norm": 1.3728653192520142, + "learning_rate": 1.8455799395206096e-05, + "loss": 2.0593, + "mean_token_accuracy": 0.5492620468139648, + "num_tokens": 2019400842.0, + "step": 3950 + }, + { + "epoch": 1.068415359653867, + "grad_norm": 1.4162088632583618, + "learning_rate": 1.8454916282577376e-05, + "loss": 2.0141, + "mean_token_accuracy": 0.5569863319396973, + "num_tokens": 2019853933.0, + "step": 3951 + }, + { + "epoch": 1.0686857760951867, + "grad_norm": 1.4815078973770142, + "learning_rate": 1.84540329412099e-05, + "loss": 1.9678, + "mean_token_accuracy": 0.575640082359314, + "num_tokens": 2020378102.0, + "step": 3952 + }, + { + "epoch": 1.0689561925365063, + "grad_norm": 1.6107434034347534, + "learning_rate": 1.8453149371130774e-05, + "loss": 2.2149, + "mean_token_accuracy": 0.5110396146774292, + "num_tokens": 2020902357.0, + "step": 3953 + }, + { + "epoch": 1.069226608977826, + "grad_norm": 1.7613811492919922, + "learning_rate": 1.845226557236711e-05, + "loss": 2.0348, + "mean_token_accuracy": 0.5416510105133057, + "num_tokens": 2021408174.0, + "step": 3954 + }, + { + "epoch": 1.0694970254191456, + "grad_norm": 1.7714439630508423, + "learning_rate": 1.8451381544946018e-05, + "loss": 2.2082, + "mean_token_accuracy": 0.5156338810920715, + "num_tokens": 2021932397.0, + "step": 3955 + }, + { + "epoch": 1.069767441860465, + "grad_norm": 1.4545609951019287, + "learning_rate": 1.845049728889463e-05, + "loss": 2.1201, + "mean_token_accuracy": 0.5123986005783081, + "num_tokens": 2022456613.0, + "step": 3956 + }, + { + "epoch": 1.0700378583017847, + "grad_norm": 1.4542183876037598, + "learning_rate": 1.8449612804240073e-05, + "loss": 2.1592, + "mean_token_accuracy": 0.526408314704895, + "num_tokens": 2022950263.0, + "step": 3957 + }, + { + "epoch": 1.0703082747431043, + "grad_norm": 1.5914828777313232, + "learning_rate": 1.8448728091009487e-05, + "loss": 2.1717, + "mean_token_accuracy": 0.5349000692367554, + "num_tokens": 2023442311.0, + "step": 3958 + }, + { + "epoch": 1.070578691184424, + "grad_norm": 1.3196810483932495, + "learning_rate": 1.8447843149230018e-05, + "loss": 2.016, + "mean_token_accuracy": 0.5465899705886841, + "num_tokens": 2023966550.0, + "step": 3959 + }, + { + "epoch": 1.0708491076257436, + "grad_norm": 1.4982494115829468, + "learning_rate": 1.844695797892882e-05, + "loss": 1.9824, + "mean_token_accuracy": 0.5496031045913696, + "num_tokens": 2024490812.0, + "step": 3960 + }, + { + "epoch": 1.0711195240670632, + "grad_norm": 0.7123029232025146, + "learning_rate": 1.8446072580133048e-05, + "loss": 1.0728, + "mean_token_accuracy": 0.7125418186187744, + "num_tokens": 2025014905.0, + "step": 3961 + }, + { + "epoch": 1.0713899405083829, + "grad_norm": 1.8236567974090576, + "learning_rate": 1.8445186952869876e-05, + "loss": 2.1135, + "mean_token_accuracy": 0.532008171081543, + "num_tokens": 2025539113.0, + "step": 3962 + }, + { + "epoch": 1.0716603569497025, + "grad_norm": 1.317630410194397, + "learning_rate": 1.844430109716647e-05, + "loss": 2.0238, + "mean_token_accuracy": 0.5435583591461182, + "num_tokens": 2026063367.0, + "step": 3963 + }, + { + "epoch": 1.0719307733910222, + "grad_norm": 1.643742561340332, + "learning_rate": 1.8443415013050014e-05, + "loss": 1.9847, + "mean_token_accuracy": 0.5412863492965698, + "num_tokens": 2026548671.0, + "step": 3964 + }, + { + "epoch": 1.0722011898323418, + "grad_norm": 1.8593482971191406, + "learning_rate": 1.8442528700547698e-05, + "loss": 2.1636, + "mean_token_accuracy": 0.5266402959823608, + "num_tokens": 2027072864.0, + "step": 3965 + }, + { + "epoch": 1.0724716062736614, + "grad_norm": 1.3261266946792603, + "learning_rate": 1.8441642159686715e-05, + "loss": 2.1259, + "mean_token_accuracy": 0.5504059791564941, + "num_tokens": 2027569439.0, + "step": 3966 + }, + { + "epoch": 1.072742022714981, + "grad_norm": 1.6115703582763672, + "learning_rate": 1.8440755390494262e-05, + "loss": 2.0347, + "mean_token_accuracy": 0.5475673675537109, + "num_tokens": 2028033526.0, + "step": 3967 + }, + { + "epoch": 1.0730124391563007, + "grad_norm": 1.49750554561615, + "learning_rate": 1.8439868392997557e-05, + "loss": 2.1521, + "mean_token_accuracy": 0.5262544751167297, + "num_tokens": 2028557696.0, + "step": 3968 + }, + { + "epoch": 1.0732828555976204, + "grad_norm": 1.2208179235458374, + "learning_rate": 1.8438981167223805e-05, + "loss": 2.0757, + "mean_token_accuracy": 0.532802164554596, + "num_tokens": 2029081736.0, + "step": 3969 + }, + { + "epoch": 1.07355327203894, + "grad_norm": 1.828279972076416, + "learning_rate": 1.8438093713200237e-05, + "loss": 2.1534, + "mean_token_accuracy": 0.528269350528717, + "num_tokens": 2029605907.0, + "step": 3970 + }, + { + "epoch": 1.0738236884802597, + "grad_norm": 1.5404558181762695, + "learning_rate": 1.843720603095408e-05, + "loss": 2.1134, + "mean_token_accuracy": 0.530798077583313, + "num_tokens": 2030129917.0, + "step": 3971 + }, + { + "epoch": 1.0740941049215793, + "grad_norm": 1.30927574634552, + "learning_rate": 1.8436318120512572e-05, + "loss": 2.0736, + "mean_token_accuracy": 0.5206292271614075, + "num_tokens": 2030642211.0, + "step": 3972 + }, + { + "epoch": 1.074364521362899, + "grad_norm": 1.5773255825042725, + "learning_rate": 1.8435429981902953e-05, + "loss": 2.0697, + "mean_token_accuracy": 0.5406229496002197, + "num_tokens": 2031163602.0, + "step": 3973 + }, + { + "epoch": 1.0746349378042186, + "grad_norm": 1.6375486850738525, + "learning_rate": 1.8434541615152477e-05, + "loss": 1.9995, + "mean_token_accuracy": 0.5513132810592651, + "num_tokens": 2031687875.0, + "step": 3974 + }, + { + "epoch": 1.074905354245538, + "grad_norm": 1.2131637334823608, + "learning_rate": 1.84336530202884e-05, + "loss": 2.0725, + "mean_token_accuracy": 0.5352702140808105, + "num_tokens": 2032212135.0, + "step": 3975 + }, + { + "epoch": 1.0751757706868577, + "grad_norm": 1.4755761623382568, + "learning_rate": 1.8432764197337988e-05, + "loss": 2.0877, + "mean_token_accuracy": 0.5335612297058105, + "num_tokens": 2032736289.0, + "step": 3976 + }, + { + "epoch": 1.0754461871281773, + "grad_norm": 1.7433944940567017, + "learning_rate": 1.843187514632851e-05, + "loss": 2.1696, + "mean_token_accuracy": 0.5213116407394409, + "num_tokens": 2033260454.0, + "step": 3977 + }, + { + "epoch": 1.075716603569497, + "grad_norm": 1.0300540924072266, + "learning_rate": 1.843098586728725e-05, + "loss": 2.0278, + "mean_token_accuracy": 0.5420596599578857, + "num_tokens": 2033784517.0, + "step": 3978 + }, + { + "epoch": 1.0759870200108166, + "grad_norm": 1.3423365354537964, + "learning_rate": 1.843009636024149e-05, + "loss": 2.0871, + "mean_token_accuracy": 0.5337424278259277, + "num_tokens": 2034274541.0, + "step": 3979 + }, + { + "epoch": 1.0762574364521362, + "grad_norm": 1.766964077949524, + "learning_rate": 1.8429206625218525e-05, + "loss": 2.1596, + "mean_token_accuracy": 0.5081081986427307, + "num_tokens": 2034798802.0, + "step": 3980 + }, + { + "epoch": 1.0765278528934559, + "grad_norm": 0.6273404955863953, + "learning_rate": 1.842831666224565e-05, + "loss": 1.1467, + "mean_token_accuracy": 0.6955840587615967, + "num_tokens": 2035322975.0, + "step": 3981 + }, + { + "epoch": 1.0767982693347755, + "grad_norm": 2.230889320373535, + "learning_rate": 1.8427426471350173e-05, + "loss": 2.1487, + "mean_token_accuracy": 0.5279075503349304, + "num_tokens": 2035847240.0, + "step": 3982 + }, + { + "epoch": 1.0770686857760952, + "grad_norm": 1.7477339506149292, + "learning_rate": 1.8426536052559413e-05, + "loss": 1.8687, + "mean_token_accuracy": 0.5769405961036682, + "num_tokens": 2036371452.0, + "step": 3983 + }, + { + "epoch": 1.0773391022174148, + "grad_norm": 1.327603816986084, + "learning_rate": 1.8425645405900684e-05, + "loss": 2.0746, + "mean_token_accuracy": 0.5308018326759338, + "num_tokens": 2036895697.0, + "step": 3984 + }, + { + "epoch": 1.0776095186587344, + "grad_norm": 1.669356346130371, + "learning_rate": 1.8424754531401316e-05, + "loss": 2.1311, + "mean_token_accuracy": 0.531028687953949, + "num_tokens": 2037419963.0, + "step": 3985 + }, + { + "epoch": 1.077879935100054, + "grad_norm": 1.6089009046554565, + "learning_rate": 1.8423863429088643e-05, + "loss": 2.062, + "mean_token_accuracy": 0.5433451533317566, + "num_tokens": 2037944179.0, + "step": 3986 + }, + { + "epoch": 1.0781503515413737, + "grad_norm": 1.359204888343811, + "learning_rate": 1.842297209899001e-05, + "loss": 2.1514, + "mean_token_accuracy": 0.5151901245117188, + "num_tokens": 2038468449.0, + "step": 3987 + }, + { + "epoch": 1.0784207679826934, + "grad_norm": 1.52017343044281, + "learning_rate": 1.8422080541132768e-05, + "loss": 2.0528, + "mean_token_accuracy": 0.5342470407485962, + "num_tokens": 2038992730.0, + "step": 3988 + }, + { + "epoch": 1.078691184424013, + "grad_norm": 1.6539024114608765, + "learning_rate": 1.8421188755544264e-05, + "loss": 2.0655, + "mean_token_accuracy": 0.537635087966919, + "num_tokens": 2039516795.0, + "step": 3989 + }, + { + "epoch": 1.0789616008653327, + "grad_norm": 1.819474220275879, + "learning_rate": 1.842029674225186e-05, + "loss": 1.9807, + "mean_token_accuracy": 0.5386221408843994, + "num_tokens": 2040041051.0, + "step": 3990 + }, + { + "epoch": 1.0792320173066523, + "grad_norm": 1.904453158378601, + "learning_rate": 1.8419404501282937e-05, + "loss": 2.156, + "mean_token_accuracy": 0.5303133726119995, + "num_tokens": 2040531391.0, + "step": 3991 + }, + { + "epoch": 1.079502433747972, + "grad_norm": 1.8853678703308105, + "learning_rate": 1.8418512032664866e-05, + "loss": 1.9287, + "mean_token_accuracy": 0.5603387355804443, + "num_tokens": 2041032379.0, + "step": 3992 + }, + { + "epoch": 1.0797728501892916, + "grad_norm": 1.4582834243774414, + "learning_rate": 1.8417619336425024e-05, + "loss": 2.1044, + "mean_token_accuracy": 0.5456554889678955, + "num_tokens": 2041494298.0, + "step": 3993 + }, + { + "epoch": 1.0800432666306112, + "grad_norm": 1.6452898979187012, + "learning_rate": 1.841672641259081e-05, + "loss": 2.1645, + "mean_token_accuracy": 0.5188905000686646, + "num_tokens": 2042018483.0, + "step": 3994 + }, + { + "epoch": 1.0803136830719309, + "grad_norm": 1.4226998090744019, + "learning_rate": 1.8415833261189617e-05, + "loss": 1.9081, + "mean_token_accuracy": 0.5655589699745178, + "num_tokens": 2042542657.0, + "step": 3995 + }, + { + "epoch": 1.0805840995132505, + "grad_norm": 1.5614955425262451, + "learning_rate": 1.8414939882248852e-05, + "loss": 2.02, + "mean_token_accuracy": 0.5438286662101746, + "num_tokens": 2043030424.0, + "step": 3996 + }, + { + "epoch": 1.08085451595457, + "grad_norm": 1.3657885789871216, + "learning_rate": 1.8414046275795926e-05, + "loss": 2.0335, + "mean_token_accuracy": 0.5448278188705444, + "num_tokens": 2043554471.0, + "step": 3997 + }, + { + "epoch": 1.0811249323958896, + "grad_norm": 1.5850799083709717, + "learning_rate": 1.8413152441858254e-05, + "loss": 2.0332, + "mean_token_accuracy": 0.5439857840538025, + "num_tokens": 2044078570.0, + "step": 3998 + }, + { + "epoch": 1.0813953488372092, + "grad_norm": 1.345631718635559, + "learning_rate": 1.8412258380463264e-05, + "loss": 1.9297, + "mean_token_accuracy": 0.5487096309661865, + "num_tokens": 2044602665.0, + "step": 3999 + }, + { + "epoch": 1.0816657652785289, + "grad_norm": 1.377095103263855, + "learning_rate": 1.8411364091638393e-05, + "loss": 2.057, + "mean_token_accuracy": 0.5244452953338623, + "num_tokens": 2045126735.0, + "step": 4000 + }, + { + "epoch": 1.0819361817198485, + "grad_norm": 0.8053818345069885, + "learning_rate": 1.8410469575411075e-05, + "loss": 1.1653, + "mean_token_accuracy": 0.6870012283325195, + "num_tokens": 2045651018.0, + "step": 4001 + }, + { + "epoch": 1.0822065981611682, + "grad_norm": 2.0119731426239014, + "learning_rate": 1.840957483180876e-05, + "loss": 2.1432, + "mean_token_accuracy": 0.533072292804718, + "num_tokens": 2046175268.0, + "step": 4002 + }, + { + "epoch": 1.0824770146024878, + "grad_norm": 1.5629860162734985, + "learning_rate": 1.8408679860858895e-05, + "loss": 2.0884, + "mean_token_accuracy": 0.5305628776550293, + "num_tokens": 2046699466.0, + "step": 4003 + }, + { + "epoch": 1.0827474310438074, + "grad_norm": 1.5096383094787598, + "learning_rate": 1.8407784662588944e-05, + "loss": 1.9758, + "mean_token_accuracy": 0.5632402896881104, + "num_tokens": 2047099068.0, + "step": 4004 + }, + { + "epoch": 1.083017847485127, + "grad_norm": 1.4954278469085693, + "learning_rate": 1.8406889237026376e-05, + "loss": 2.0327, + "mean_token_accuracy": 0.5377896428108215, + "num_tokens": 2047623098.0, + "step": 4005 + }, + { + "epoch": 1.0832882639264467, + "grad_norm": 1.7515488862991333, + "learning_rate": 1.8405993584198668e-05, + "loss": 2.1013, + "mean_token_accuracy": 0.5401440262794495, + "num_tokens": 2048127919.0, + "step": 4006 + }, + { + "epoch": 1.0835586803677664, + "grad_norm": 1.621400237083435, + "learning_rate": 1.840509770413329e-05, + "loss": 2.0964, + "mean_token_accuracy": 0.5347954034805298, + "num_tokens": 2048652123.0, + "step": 4007 + }, + { + "epoch": 1.083829096809086, + "grad_norm": 1.6618698835372925, + "learning_rate": 1.8404201596857746e-05, + "loss": 2.0879, + "mean_token_accuracy": 0.5433768033981323, + "num_tokens": 2049176344.0, + "step": 4008 + }, + { + "epoch": 1.0840995132504057, + "grad_norm": 1.2128090858459473, + "learning_rate": 1.8403305262399515e-05, + "loss": 2.1224, + "mean_token_accuracy": 0.5303032398223877, + "num_tokens": 2049700477.0, + "step": 4009 + }, + { + "epoch": 1.0843699296917253, + "grad_norm": 1.3411965370178223, + "learning_rate": 1.840240870078611e-05, + "loss": 2.0214, + "mean_token_accuracy": 0.5460729002952576, + "num_tokens": 2050224738.0, + "step": 4010 + }, + { + "epoch": 1.084640346133045, + "grad_norm": 1.358093500137329, + "learning_rate": 1.8401511912045037e-05, + "loss": 2.1065, + "mean_token_accuracy": 0.5307056307792664, + "num_tokens": 2050748881.0, + "step": 4011 + }, + { + "epoch": 1.0849107625743646, + "grad_norm": 1.4524651765823364, + "learning_rate": 1.840061489620381e-05, + "loss": 2.038, + "mean_token_accuracy": 0.5368546843528748, + "num_tokens": 2051273039.0, + "step": 4012 + }, + { + "epoch": 1.0851811790156842, + "grad_norm": 1.2699997425079346, + "learning_rate": 1.8399717653289955e-05, + "loss": 2.1007, + "mean_token_accuracy": 0.5297950506210327, + "num_tokens": 2051797270.0, + "step": 4013 + }, + { + "epoch": 1.0854515954570039, + "grad_norm": 1.5286468267440796, + "learning_rate": 1.8398820183331004e-05, + "loss": 2.0898, + "mean_token_accuracy": 0.5258837342262268, + "num_tokens": 2052321405.0, + "step": 4014 + }, + { + "epoch": 1.0857220118983235, + "grad_norm": 1.513478398323059, + "learning_rate": 1.8397922486354486e-05, + "loss": 2.2085, + "mean_token_accuracy": 0.5442726612091064, + "num_tokens": 2052845485.0, + "step": 4015 + }, + { + "epoch": 1.085992428339643, + "grad_norm": 1.329415202140808, + "learning_rate": 1.8397024562387955e-05, + "loss": 2.1249, + "mean_token_accuracy": 0.5364950895309448, + "num_tokens": 2053369730.0, + "step": 4016 + }, + { + "epoch": 1.0862628447809626, + "grad_norm": 1.3203519582748413, + "learning_rate": 1.8396126411458956e-05, + "loss": 1.9969, + "mean_token_accuracy": 0.5540114641189575, + "num_tokens": 2053859738.0, + "step": 4017 + }, + { + "epoch": 1.0865332612222822, + "grad_norm": 1.399020791053772, + "learning_rate": 1.839522803359505e-05, + "loss": 2.1027, + "mean_token_accuracy": 0.5390892624855042, + "num_tokens": 2054383956.0, + "step": 4018 + }, + { + "epoch": 1.0868036776636019, + "grad_norm": 1.6779307126998901, + "learning_rate": 1.83943294288238e-05, + "loss": 2.1718, + "mean_token_accuracy": 0.5161486268043518, + "num_tokens": 2054908210.0, + "step": 4019 + }, + { + "epoch": 1.0870740941049215, + "grad_norm": 1.3333289623260498, + "learning_rate": 1.839343059717277e-05, + "loss": 2.0553, + "mean_token_accuracy": 0.5242369174957275, + "num_tokens": 2055432367.0, + "step": 4020 + }, + { + "epoch": 1.0873445105462411, + "grad_norm": 0.8496848344802856, + "learning_rate": 1.8392531538669554e-05, + "loss": 1.2014, + "mean_token_accuracy": 0.6810814142227173, + "num_tokens": 2055906279.0, + "step": 4021 + }, + { + "epoch": 1.0876149269875608, + "grad_norm": 2.040236711502075, + "learning_rate": 1.839163225334173e-05, + "loss": 2.0553, + "mean_token_accuracy": 0.5517628788948059, + "num_tokens": 2056393742.0, + "step": 4022 + }, + { + "epoch": 1.0878853434288804, + "grad_norm": 1.6323174238204956, + "learning_rate": 1.839073274121689e-05, + "loss": 2.111, + "mean_token_accuracy": 0.5395231246948242, + "num_tokens": 2056917903.0, + "step": 4023 + }, + { + "epoch": 1.0881557598702, + "grad_norm": 1.5297738313674927, + "learning_rate": 1.8389833002322634e-05, + "loss": 1.9651, + "mean_token_accuracy": 0.5617228746414185, + "num_tokens": 2057418216.0, + "step": 4024 + }, + { + "epoch": 1.0884261763115197, + "grad_norm": 1.6505892276763916, + "learning_rate": 1.8388933036686572e-05, + "loss": 2.1906, + "mean_token_accuracy": 0.5246555805206299, + "num_tokens": 2057923913.0, + "step": 4025 + }, + { + "epoch": 1.0886965927528394, + "grad_norm": 1.3893320560455322, + "learning_rate": 1.838803284433631e-05, + "loss": 2.0082, + "mean_token_accuracy": 0.5539015531539917, + "num_tokens": 2058448116.0, + "step": 4026 + }, + { + "epoch": 1.088967009194159, + "grad_norm": 1.6131036281585693, + "learning_rate": 1.8387132425299478e-05, + "loss": 2.0366, + "mean_token_accuracy": 0.5426205396652222, + "num_tokens": 2058972294.0, + "step": 4027 + }, + { + "epoch": 1.0892374256354787, + "grad_norm": 1.4469128847122192, + "learning_rate": 1.83862317796037e-05, + "loss": 1.9613, + "mean_token_accuracy": 0.5309133529663086, + "num_tokens": 2059484172.0, + "step": 4028 + }, + { + "epoch": 1.0895078420767983, + "grad_norm": 1.3124473094940186, + "learning_rate": 1.8385330907276604e-05, + "loss": 2.0052, + "mean_token_accuracy": 0.5479758381843567, + "num_tokens": 2060008304.0, + "step": 4029 + }, + { + "epoch": 1.089778258518118, + "grad_norm": 1.3559942245483398, + "learning_rate": 1.838442980834584e-05, + "loss": 2.141, + "mean_token_accuracy": 0.5420316457748413, + "num_tokens": 2060473844.0, + "step": 4030 + }, + { + "epoch": 1.0900486749594376, + "grad_norm": 1.3561469316482544, + "learning_rate": 1.8383528482839054e-05, + "loss": 1.9664, + "mean_token_accuracy": 0.5605107545852661, + "num_tokens": 2060997986.0, + "step": 4031 + }, + { + "epoch": 1.0903190914007572, + "grad_norm": 1.5317903757095337, + "learning_rate": 1.83826269307839e-05, + "loss": 2.1727, + "mean_token_accuracy": 0.5172919034957886, + "num_tokens": 2061522232.0, + "step": 4032 + }, + { + "epoch": 1.0905895078420769, + "grad_norm": 1.35044527053833, + "learning_rate": 1.838172515220804e-05, + "loss": 2.1138, + "mean_token_accuracy": 0.5339010953903198, + "num_tokens": 2062046482.0, + "step": 4033 + }, + { + "epoch": 1.0908599242833965, + "grad_norm": 1.5436360836029053, + "learning_rate": 1.8380823147139145e-05, + "loss": 2.046, + "mean_token_accuracy": 0.5453068017959595, + "num_tokens": 2062570719.0, + "step": 4034 + }, + { + "epoch": 1.0911303407247162, + "grad_norm": 1.7482119798660278, + "learning_rate": 1.837992091560489e-05, + "loss": 1.7565, + "mean_token_accuracy": 0.5574867725372314, + "num_tokens": 2063094802.0, + "step": 4035 + }, + { + "epoch": 1.0914007571660358, + "grad_norm": 1.606095552444458, + "learning_rate": 1.837901845763296e-05, + "loss": 1.9951, + "mean_token_accuracy": 0.5351207852363586, + "num_tokens": 2063560773.0, + "step": 4036 + }, + { + "epoch": 1.0916711736073554, + "grad_norm": 35.84149169921875, + "learning_rate": 1.8378115773251042e-05, + "loss": 2.3114, + "mean_token_accuracy": 0.5520150065422058, + "num_tokens": 2064085027.0, + "step": 4037 + }, + { + "epoch": 1.0919415900486749, + "grad_norm": 1.6369588375091553, + "learning_rate": 1.8377212862486833e-05, + "loss": 2.0045, + "mean_token_accuracy": 0.5436100959777832, + "num_tokens": 2064549404.0, + "step": 4038 + }, + { + "epoch": 1.0922120064899945, + "grad_norm": 1.5108340978622437, + "learning_rate": 1.837630972536804e-05, + "loss": 2.062, + "mean_token_accuracy": 0.5377272367477417, + "num_tokens": 2065073533.0, + "step": 4039 + }, + { + "epoch": 1.0924824229313141, + "grad_norm": 1.6478288173675537, + "learning_rate": 1.837540636192237e-05, + "loss": 2.0873, + "mean_token_accuracy": 0.5379433035850525, + "num_tokens": 2065597638.0, + "step": 4040 + }, + { + "epoch": 1.0927528393726338, + "grad_norm": 0.6774631142616272, + "learning_rate": 1.8374502772177546e-05, + "loss": 1.1872, + "mean_token_accuracy": 0.6779702305793762, + "num_tokens": 2066121797.0, + "step": 4041 + }, + { + "epoch": 1.0930232558139534, + "grad_norm": 1.9691635370254517, + "learning_rate": 1.8373598956161292e-05, + "loss": 2.0696, + "mean_token_accuracy": 0.5402600765228271, + "num_tokens": 2066646024.0, + "step": 4042 + }, + { + "epoch": 1.093293672255273, + "grad_norm": 2.1325559616088867, + "learning_rate": 1.8372694913901334e-05, + "loss": 2.1568, + "mean_token_accuracy": 0.537811279296875, + "num_tokens": 2067170264.0, + "step": 4043 + }, + { + "epoch": 1.0935640886965927, + "grad_norm": 1.5696661472320557, + "learning_rate": 1.8371790645425416e-05, + "loss": 2.0331, + "mean_token_accuracy": 0.5442466735839844, + "num_tokens": 2067694472.0, + "step": 4044 + }, + { + "epoch": 1.0938345051379124, + "grad_norm": 33.9906120300293, + "learning_rate": 1.837088615076128e-05, + "loss": 2.038, + "mean_token_accuracy": 0.5368565320968628, + "num_tokens": 2068218490.0, + "step": 4045 + }, + { + "epoch": 1.094104921579232, + "grad_norm": 2.5075693130493164, + "learning_rate": 1.8369981429936682e-05, + "loss": 1.8954, + "mean_token_accuracy": 0.5491360425949097, + "num_tokens": 2068742623.0, + "step": 4046 + }, + { + "epoch": 1.0943753380205516, + "grad_norm": 2.9474456310272217, + "learning_rate": 1.836907648297938e-05, + "loss": 2.1501, + "mean_token_accuracy": 0.5332121253013611, + "num_tokens": 2069266826.0, + "step": 4047 + }, + { + "epoch": 1.0946457544618713, + "grad_norm": 1.800518274307251, + "learning_rate": 1.836817130991714e-05, + "loss": 2.1545, + "mean_token_accuracy": 0.530283510684967, + "num_tokens": 2069749467.0, + "step": 4048 + }, + { + "epoch": 1.094916170903191, + "grad_norm": 2.048076629638672, + "learning_rate": 1.8367265910777737e-05, + "loss": 2.1474, + "mean_token_accuracy": 0.5283030867576599, + "num_tokens": 2070273558.0, + "step": 4049 + }, + { + "epoch": 1.0951865873445106, + "grad_norm": 1.8805299997329712, + "learning_rate": 1.8366360285588947e-05, + "loss": 2.0107, + "mean_token_accuracy": 0.5504502058029175, + "num_tokens": 2070797774.0, + "step": 4050 + }, + { + "epoch": 1.0954570037858302, + "grad_norm": 1.63334059715271, + "learning_rate": 1.8365454434378563e-05, + "loss": 2.0129, + "mean_token_accuracy": 0.5740917921066284, + "num_tokens": 2071257853.0, + "step": 4051 + }, + { + "epoch": 1.0957274202271499, + "grad_norm": 1.7638274431228638, + "learning_rate": 1.8364548357174375e-05, + "loss": 2.0305, + "mean_token_accuracy": 0.5603835582733154, + "num_tokens": 2071720491.0, + "step": 4052 + }, + { + "epoch": 1.0959978366684695, + "grad_norm": 1.8094284534454346, + "learning_rate": 1.8363642054004188e-05, + "loss": 2.0715, + "mean_token_accuracy": 0.5389821529388428, + "num_tokens": 2072244623.0, + "step": 4053 + }, + { + "epoch": 1.0962682531097891, + "grad_norm": 1.529118537902832, + "learning_rate": 1.8362735524895806e-05, + "loss": 1.9738, + "mean_token_accuracy": 0.5449256896972656, + "num_tokens": 2072768815.0, + "step": 4054 + }, + { + "epoch": 1.0965386695511088, + "grad_norm": 1.5644084215164185, + "learning_rate": 1.836182876987705e-05, + "loss": 2.0775, + "mean_token_accuracy": 0.5459713935852051, + "num_tokens": 2073235189.0, + "step": 4055 + }, + { + "epoch": 1.0968090859924284, + "grad_norm": 1.7363646030426025, + "learning_rate": 1.836092178897573e-05, + "loss": 2.0254, + "mean_token_accuracy": 0.5599873065948486, + "num_tokens": 2073759469.0, + "step": 4056 + }, + { + "epoch": 1.0970795024337479, + "grad_norm": 1.6009790897369385, + "learning_rate": 1.836001458221969e-05, + "loss": 2.0561, + "mean_token_accuracy": 0.553753137588501, + "num_tokens": 2074244309.0, + "step": 4057 + }, + { + "epoch": 1.0973499188750675, + "grad_norm": 1.6351665258407593, + "learning_rate": 1.8359107149636753e-05, + "loss": 2.1448, + "mean_token_accuracy": 0.5416091680526733, + "num_tokens": 2074703543.0, + "step": 4058 + }, + { + "epoch": 1.0976203353163871, + "grad_norm": 1.4758754968643188, + "learning_rate": 1.8358199491254766e-05, + "loss": 1.9392, + "mean_token_accuracy": 0.5538565516471863, + "num_tokens": 2075185522.0, + "step": 4059 + }, + { + "epoch": 1.0978907517577068, + "grad_norm": 1.5005652904510498, + "learning_rate": 1.8357291607101585e-05, + "loss": 2.092, + "mean_token_accuracy": 0.5339230895042419, + "num_tokens": 2075709692.0, + "step": 4060 + }, + { + "epoch": 1.0981611681990264, + "grad_norm": 0.6977151036262512, + "learning_rate": 1.8356383497205056e-05, + "loss": 1.1427, + "mean_token_accuracy": 0.6901605725288391, + "num_tokens": 2076175854.0, + "step": 4061 + }, + { + "epoch": 1.098431584640346, + "grad_norm": 2.6845359802246094, + "learning_rate": 1.835547516159305e-05, + "loss": 2.0608, + "mean_token_accuracy": 0.5504481196403503, + "num_tokens": 2076700049.0, + "step": 4062 + }, + { + "epoch": 1.0987020010816657, + "grad_norm": 2.597484827041626, + "learning_rate": 1.8354566600293435e-05, + "loss": 2.0927, + "mean_token_accuracy": 0.5374242067337036, + "num_tokens": 2077224318.0, + "step": 4063 + }, + { + "epoch": 1.0989724175229854, + "grad_norm": 1.7511773109436035, + "learning_rate": 1.8353657813334086e-05, + "loss": 1.9294, + "mean_token_accuracy": 0.5639561414718628, + "num_tokens": 2077691154.0, + "step": 4064 + }, + { + "epoch": 1.099242833964305, + "grad_norm": 2.145993232727051, + "learning_rate": 1.8352748800742894e-05, + "loss": 2.0838, + "mean_token_accuracy": 0.5354099869728088, + "num_tokens": 2078208091.0, + "step": 4065 + }, + { + "epoch": 1.0995132504056246, + "grad_norm": 1.576081395149231, + "learning_rate": 1.8351839562547743e-05, + "loss": 2.0903, + "mean_token_accuracy": 0.5330612659454346, + "num_tokens": 2078719013.0, + "step": 4066 + }, + { + "epoch": 1.0997836668469443, + "grad_norm": 1.8460272550582886, + "learning_rate": 1.8350930098776533e-05, + "loss": 2.1419, + "mean_token_accuracy": 0.5287388563156128, + "num_tokens": 2079243168.0, + "step": 4067 + }, + { + "epoch": 1.100054083288264, + "grad_norm": 1.5873159170150757, + "learning_rate": 1.8350020409457174e-05, + "loss": 1.9633, + "mean_token_accuracy": 0.5490965247154236, + "num_tokens": 2079757453.0, + "step": 4068 + }, + { + "epoch": 1.1003244997295836, + "grad_norm": 1.5093649625778198, + "learning_rate": 1.834911049461757e-05, + "loss": 2.1965, + "mean_token_accuracy": 0.5178850293159485, + "num_tokens": 2080281657.0, + "step": 4069 + }, + { + "epoch": 1.1005949161709032, + "grad_norm": 1.4829298257827759, + "learning_rate": 1.8348200354285643e-05, + "loss": 2.0408, + "mean_token_accuracy": 0.5278388857841492, + "num_tokens": 2080805904.0, + "step": 4070 + }, + { + "epoch": 1.1008653326122229, + "grad_norm": 1.700779676437378, + "learning_rate": 1.8347289988489323e-05, + "loss": 2.1758, + "mean_token_accuracy": 0.5005062222480774, + "num_tokens": 2081330124.0, + "step": 4071 + }, + { + "epoch": 1.1011357490535425, + "grad_norm": 1.5370614528656006, + "learning_rate": 1.8346379397256538e-05, + "loss": 2.1182, + "mean_token_accuracy": 0.5277424454689026, + "num_tokens": 2081854396.0, + "step": 4072 + }, + { + "epoch": 1.1014061654948621, + "grad_norm": 1.4638926982879639, + "learning_rate": 1.834546858061523e-05, + "loss": 2.0281, + "mean_token_accuracy": 0.552793025970459, + "num_tokens": 2082378634.0, + "step": 4073 + }, + { + "epoch": 1.1016765819361818, + "grad_norm": 1.3682838678359985, + "learning_rate": 1.834455753859334e-05, + "loss": 2.0772, + "mean_token_accuracy": 0.5454683899879456, + "num_tokens": 2082844742.0, + "step": 4074 + }, + { + "epoch": 1.1019469983775014, + "grad_norm": 1.3039164543151855, + "learning_rate": 1.8343646271218826e-05, + "loss": 2.1101, + "mean_token_accuracy": 0.5277325510978699, + "num_tokens": 2083297432.0, + "step": 4075 + }, + { + "epoch": 1.102217414818821, + "grad_norm": 1.3552082777023315, + "learning_rate": 1.834273477851965e-05, + "loss": 1.9898, + "mean_token_accuracy": 0.5464208126068115, + "num_tokens": 2083821616.0, + "step": 4076 + }, + { + "epoch": 1.1024878312601407, + "grad_norm": 1.3372663259506226, + "learning_rate": 1.834182306052378e-05, + "loss": 1.898, + "mean_token_accuracy": 0.5694732666015625, + "num_tokens": 2084345771.0, + "step": 4077 + }, + { + "epoch": 1.1027582477014604, + "grad_norm": 1.2569963932037354, + "learning_rate": 1.8340911117259185e-05, + "loss": 2.043, + "mean_token_accuracy": 0.5455068945884705, + "num_tokens": 2084771102.0, + "step": 4078 + }, + { + "epoch": 1.1030286641427798, + "grad_norm": 1.2992517948150635, + "learning_rate": 1.833999894875385e-05, + "loss": 2.1132, + "mean_token_accuracy": 0.5387201309204102, + "num_tokens": 2085265995.0, + "step": 4079 + }, + { + "epoch": 1.1032990805840994, + "grad_norm": 1.3611034154891968, + "learning_rate": 1.8339086555035757e-05, + "loss": 2.0832, + "mean_token_accuracy": 0.5268661379814148, + "num_tokens": 2085790195.0, + "step": 4080 + }, + { + "epoch": 1.103569497025419, + "grad_norm": 0.7705757021903992, + "learning_rate": 1.833817393613291e-05, + "loss": 1.1717, + "mean_token_accuracy": 0.6887503266334534, + "num_tokens": 2086314384.0, + "step": 4081 + }, + { + "epoch": 1.1038399134667387, + "grad_norm": 1.8184137344360352, + "learning_rate": 1.833726109207331e-05, + "loss": 2.1125, + "mean_token_accuracy": 0.5386613607406616, + "num_tokens": 2086838513.0, + "step": 4082 + }, + { + "epoch": 1.1041103299080584, + "grad_norm": 1.4456126689910889, + "learning_rate": 1.8336348022884957e-05, + "loss": 2.2005, + "mean_token_accuracy": 0.530486524105072, + "num_tokens": 2087362762.0, + "step": 4083 + }, + { + "epoch": 1.104380746349378, + "grad_norm": 1.2070953845977783, + "learning_rate": 1.8335434728595875e-05, + "loss": 1.9062, + "mean_token_accuracy": 0.5495983362197876, + "num_tokens": 2087839160.0, + "step": 4084 + }, + { + "epoch": 1.1046511627906976, + "grad_norm": 1.9869928359985352, + "learning_rate": 1.8334521209234078e-05, + "loss": 2.0997, + "mean_token_accuracy": 0.5322022438049316, + "num_tokens": 2088363275.0, + "step": 4085 + }, + { + "epoch": 1.1049215792320173, + "grad_norm": 1.8413547277450562, + "learning_rate": 1.8333607464827608e-05, + "loss": 2.0901, + "mean_token_accuracy": 0.5483182668685913, + "num_tokens": 2088887558.0, + "step": 4086 + }, + { + "epoch": 1.105191995673337, + "grad_norm": 1.5979669094085693, + "learning_rate": 1.8332693495404488e-05, + "loss": 2.1921, + "mean_token_accuracy": 0.5603090524673462, + "num_tokens": 2089348515.0, + "step": 4087 + }, + { + "epoch": 1.1054624121146566, + "grad_norm": 3.472417116165161, + "learning_rate": 1.833177930099277e-05, + "loss": 1.6739, + "mean_token_accuracy": 0.6034507751464844, + "num_tokens": 2089817832.0, + "step": 4088 + }, + { + "epoch": 1.1057328285559762, + "grad_norm": 1.8570499420166016, + "learning_rate": 1.8330864881620503e-05, + "loss": 1.9903, + "mean_token_accuracy": 0.5562180280685425, + "num_tokens": 2090295376.0, + "step": 4089 + }, + { + "epoch": 1.1060032449972959, + "grad_norm": 1.9573215246200562, + "learning_rate": 1.8329950237315744e-05, + "loss": 2.0587, + "mean_token_accuracy": 0.5513936281204224, + "num_tokens": 2090819659.0, + "step": 4090 + }, + { + "epoch": 1.1062736614386155, + "grad_norm": 1.3939244747161865, + "learning_rate": 1.8329035368106552e-05, + "loss": 2.0997, + "mean_token_accuracy": 0.5151059627532959, + "num_tokens": 2091343755.0, + "step": 4091 + }, + { + "epoch": 1.1065440778799351, + "grad_norm": 1.5016580820083618, + "learning_rate": 1.8328120274021003e-05, + "loss": 2.0321, + "mean_token_accuracy": 0.5462995171546936, + "num_tokens": 2091868037.0, + "step": 4092 + }, + { + "epoch": 1.1068144943212548, + "grad_norm": 1.7623497247695923, + "learning_rate": 1.8327204955087172e-05, + "loss": 2.1292, + "mean_token_accuracy": 0.5219749808311462, + "num_tokens": 2092392205.0, + "step": 4093 + }, + { + "epoch": 1.1070849107625744, + "grad_norm": 1.5602037906646729, + "learning_rate": 1.8326289411333146e-05, + "loss": 2.1569, + "mean_token_accuracy": 0.5211756229400635, + "num_tokens": 2092916473.0, + "step": 4094 + }, + { + "epoch": 1.107355327203894, + "grad_norm": 1.6151694059371948, + "learning_rate": 1.8325373642787017e-05, + "loss": 2.0699, + "mean_token_accuracy": 0.5337749719619751, + "num_tokens": 2093440577.0, + "step": 4095 + }, + { + "epoch": 1.1076257436452137, + "grad_norm": 1.8910231590270996, + "learning_rate": 1.8324457649476877e-05, + "loss": 1.9266, + "mean_token_accuracy": 0.5769334435462952, + "num_tokens": 2093924796.0, + "step": 4096 + }, + { + "epoch": 1.1078961600865334, + "grad_norm": 1.4250179529190063, + "learning_rate": 1.8323541431430845e-05, + "loss": 1.9205, + "mean_token_accuracy": 0.5512937903404236, + "num_tokens": 2094449073.0, + "step": 4097 + }, + { + "epoch": 1.1081665765278528, + "grad_norm": 1.3524987697601318, + "learning_rate": 1.832262498867702e-05, + "loss": 1.9056, + "mean_token_accuracy": 0.5412023663520813, + "num_tokens": 2094973307.0, + "step": 4098 + }, + { + "epoch": 1.1084369929691724, + "grad_norm": 1.6600335836410522, + "learning_rate": 1.8321708321243526e-05, + "loss": 1.99, + "mean_token_accuracy": 0.5464854836463928, + "num_tokens": 2095497486.0, + "step": 4099 + }, + { + "epoch": 1.108707409410492, + "grad_norm": 1.3453843593597412, + "learning_rate": 1.832079142915849e-05, + "loss": 2.1305, + "mean_token_accuracy": 0.5312622785568237, + "num_tokens": 2096021671.0, + "step": 4100 + }, + { + "epoch": 1.1089778258518117, + "grad_norm": 0.833000659942627, + "learning_rate": 1.8319874312450044e-05, + "loss": 1.2002, + "mean_token_accuracy": 0.6867294311523438, + "num_tokens": 2096540174.0, + "step": 4101 + }, + { + "epoch": 1.1092482422931313, + "grad_norm": 2.3339664936065674, + "learning_rate": 1.8318956971146325e-05, + "loss": 2.1785, + "mean_token_accuracy": 0.5252162218093872, + "num_tokens": 2097064420.0, + "step": 4102 + }, + { + "epoch": 1.109518658734451, + "grad_norm": 2.109138011932373, + "learning_rate": 1.8318039405275483e-05, + "loss": 2.0664, + "mean_token_accuracy": 0.5350214242935181, + "num_tokens": 2097536701.0, + "step": 4103 + }, + { + "epoch": 1.1097890751757706, + "grad_norm": 1.326375961303711, + "learning_rate": 1.8317121614865674e-05, + "loss": 2.02, + "mean_token_accuracy": 0.5426758527755737, + "num_tokens": 2098026553.0, + "step": 4104 + }, + { + "epoch": 1.1100594916170903, + "grad_norm": 1.7964469194412231, + "learning_rate": 1.8316203599945052e-05, + "loss": 2.133, + "mean_token_accuracy": 0.531730055809021, + "num_tokens": 2098550776.0, + "step": 4105 + }, + { + "epoch": 1.11032990805841, + "grad_norm": 1.6036298274993896, + "learning_rate": 1.8315285360541796e-05, + "loss": 1.9868, + "mean_token_accuracy": 0.5578147172927856, + "num_tokens": 2099075060.0, + "step": 4106 + }, + { + "epoch": 1.1106003244997296, + "grad_norm": 1.514735221862793, + "learning_rate": 1.8314366896684063e-05, + "loss": 2.0881, + "mean_token_accuracy": 0.5262295007705688, + "num_tokens": 2099599170.0, + "step": 4107 + }, + { + "epoch": 1.1108707409410492, + "grad_norm": 1.2645540237426758, + "learning_rate": 1.8313448208400047e-05, + "loss": 2.026, + "mean_token_accuracy": 0.5388360023498535, + "num_tokens": 2100123235.0, + "step": 4108 + }, + { + "epoch": 1.1111411573823688, + "grad_norm": 1.38470458984375, + "learning_rate": 1.8312529295717932e-05, + "loss": 2.0985, + "mean_token_accuracy": 0.5411406755447388, + "num_tokens": 2100574324.0, + "step": 4109 + }, + { + "epoch": 1.1114115738236885, + "grad_norm": 1.6103990077972412, + "learning_rate": 1.8311610158665914e-05, + "loss": 2.1287, + "mean_token_accuracy": 0.527497410774231, + "num_tokens": 2101098528.0, + "step": 4110 + }, + { + "epoch": 1.1116819902650081, + "grad_norm": 1.4553961753845215, + "learning_rate": 1.8310690797272194e-05, + "loss": 2.1021, + "mean_token_accuracy": 0.5498764514923096, + "num_tokens": 2101622727.0, + "step": 4111 + }, + { + "epoch": 1.1119524067063278, + "grad_norm": 1.34955632686615, + "learning_rate": 1.830977121156498e-05, + "loss": 2.044, + "mean_token_accuracy": 0.5380970239639282, + "num_tokens": 2102146929.0, + "step": 4112 + }, + { + "epoch": 1.1122228231476474, + "grad_norm": 1.418704867362976, + "learning_rate": 1.8308851401572492e-05, + "loss": 2.205, + "mean_token_accuracy": 0.5182901620864868, + "num_tokens": 2102630186.0, + "step": 4113 + }, + { + "epoch": 1.112493239588967, + "grad_norm": 1.580398440361023, + "learning_rate": 1.8307931367322945e-05, + "loss": 2.0442, + "mean_token_accuracy": 0.5517978668212891, + "num_tokens": 2103154207.0, + "step": 4114 + }, + { + "epoch": 1.1127636560302867, + "grad_norm": 1.648781657218933, + "learning_rate": 1.8307011108844573e-05, + "loss": 2.0726, + "mean_token_accuracy": 0.5344078540802002, + "num_tokens": 2103678393.0, + "step": 4115 + }, + { + "epoch": 1.1130340724716064, + "grad_norm": 1.4530186653137207, + "learning_rate": 1.8306090626165617e-05, + "loss": 2.0728, + "mean_token_accuracy": 0.5225948095321655, + "num_tokens": 2104202605.0, + "step": 4116 + }, + { + "epoch": 1.113304488912926, + "grad_norm": 1.3531988859176636, + "learning_rate": 1.8305169919314308e-05, + "loss": 2.0405, + "mean_token_accuracy": 0.5443658828735352, + "num_tokens": 2104726725.0, + "step": 4117 + }, + { + "epoch": 1.1135749053542456, + "grad_norm": 1.5796207189559937, + "learning_rate": 1.8304248988318908e-05, + "loss": 2.205, + "mean_token_accuracy": 0.5202706456184387, + "num_tokens": 2105250865.0, + "step": 4118 + }, + { + "epoch": 1.1138453217955653, + "grad_norm": 1.306984305381775, + "learning_rate": 1.8303327833207664e-05, + "loss": 1.9727, + "mean_token_accuracy": 0.5372878313064575, + "num_tokens": 2105775017.0, + "step": 4119 + }, + { + "epoch": 1.1141157382368847, + "grad_norm": 1.6239632368087769, + "learning_rate": 1.830240645400885e-05, + "loss": 2.0551, + "mean_token_accuracy": 0.5355967283248901, + "num_tokens": 2106299170.0, + "step": 4120 + }, + { + "epoch": 1.1143861546782043, + "grad_norm": 0.6923421621322632, + "learning_rate": 1.8301484850750724e-05, + "loss": 1.1088, + "mean_token_accuracy": 0.7042737603187561, + "num_tokens": 2106809365.0, + "step": 4121 + }, + { + "epoch": 1.114656571119524, + "grad_norm": 1.7551909685134888, + "learning_rate": 1.8300563023461574e-05, + "loss": 2.111, + "mean_token_accuracy": 0.5420271158218384, + "num_tokens": 2107273948.0, + "step": 4122 + }, + { + "epoch": 1.1149269875608436, + "grad_norm": 1.3118513822555542, + "learning_rate": 1.829964097216968e-05, + "loss": 2.0603, + "mean_token_accuracy": 0.5420781970024109, + "num_tokens": 2107798161.0, + "step": 4123 + }, + { + "epoch": 1.1151974040021633, + "grad_norm": 1.8818413019180298, + "learning_rate": 1.8298718696903338e-05, + "loss": 2.0795, + "mean_token_accuracy": 0.5448753237724304, + "num_tokens": 2108322377.0, + "step": 4124 + }, + { + "epoch": 1.115467820443483, + "grad_norm": 2.0413265228271484, + "learning_rate": 1.8297796197690838e-05, + "loss": 1.7781, + "mean_token_accuracy": 0.5755777359008789, + "num_tokens": 2108846640.0, + "step": 4125 + }, + { + "epoch": 1.1157382368848026, + "grad_norm": 2.1227521896362305, + "learning_rate": 1.829687347456049e-05, + "loss": 2.0939, + "mean_token_accuracy": 0.5515387654304504, + "num_tokens": 2109370837.0, + "step": 4126 + }, + { + "epoch": 1.1160086533261222, + "grad_norm": 1.3638793230056763, + "learning_rate": 1.8295950527540606e-05, + "loss": 1.9022, + "mean_token_accuracy": 0.5633072853088379, + "num_tokens": 2109870447.0, + "step": 4127 + }, + { + "epoch": 1.1162790697674418, + "grad_norm": 2.2463490962982178, + "learning_rate": 1.8295027356659503e-05, + "loss": 2.2419, + "mean_token_accuracy": 0.5148137807846069, + "num_tokens": 2110394682.0, + "step": 4128 + }, + { + "epoch": 1.1165494862087615, + "grad_norm": 1.675907015800476, + "learning_rate": 1.829410396194551e-05, + "loss": 2.0635, + "mean_token_accuracy": 0.5204426646232605, + "num_tokens": 2110918774.0, + "step": 4129 + }, + { + "epoch": 1.1168199026500811, + "grad_norm": 3.250450372695923, + "learning_rate": 1.8293180343426952e-05, + "loss": 2.147, + "mean_token_accuracy": 0.5290542840957642, + "num_tokens": 2111394089.0, + "step": 4130 + }, + { + "epoch": 1.1170903190914008, + "grad_norm": 2.5727455615997314, + "learning_rate": 1.8292256501132177e-05, + "loss": 1.9985, + "mean_token_accuracy": 0.5420478582382202, + "num_tokens": 2111918356.0, + "step": 4131 + }, + { + "epoch": 1.1173607355327204, + "grad_norm": 2.2699921131134033, + "learning_rate": 1.8291332435089525e-05, + "loss": 2.2139, + "mean_token_accuracy": 0.5244907140731812, + "num_tokens": 2112442631.0, + "step": 4132 + }, + { + "epoch": 1.11763115197404, + "grad_norm": 1.7157026529312134, + "learning_rate": 1.8290408145327352e-05, + "loss": 2.1246, + "mean_token_accuracy": 0.5461128354072571, + "num_tokens": 2112966898.0, + "step": 4133 + }, + { + "epoch": 1.1179015684153597, + "grad_norm": 2.1439170837402344, + "learning_rate": 1.828948363187402e-05, + "loss": 2.0389, + "mean_token_accuracy": 0.5778989791870117, + "num_tokens": 2113320809.0, + "step": 4134 + }, + { + "epoch": 1.1181719848566793, + "grad_norm": 1.820751428604126, + "learning_rate": 1.8288558894757892e-05, + "loss": 2.0751, + "mean_token_accuracy": 0.5442646741867065, + "num_tokens": 2113784525.0, + "step": 4135 + }, + { + "epoch": 1.118442401297999, + "grad_norm": 1.3297821283340454, + "learning_rate": 1.8287633934007342e-05, + "loss": 2.1007, + "mean_token_accuracy": 0.5336854457855225, + "num_tokens": 2114308717.0, + "step": 4136 + }, + { + "epoch": 1.1187128177393186, + "grad_norm": 1.4898061752319336, + "learning_rate": 1.828670874965075e-05, + "loss": 2.0983, + "mean_token_accuracy": 0.5358985662460327, + "num_tokens": 2114832862.0, + "step": 4137 + }, + { + "epoch": 1.1189832341806383, + "grad_norm": 1.6795679330825806, + "learning_rate": 1.8285783341716507e-05, + "loss": 2.1219, + "mean_token_accuracy": 0.5347686409950256, + "num_tokens": 2115357000.0, + "step": 4138 + }, + { + "epoch": 1.119253650621958, + "grad_norm": 1.2484118938446045, + "learning_rate": 1.8284857710233004e-05, + "loss": 2.092, + "mean_token_accuracy": 0.5399219989776611, + "num_tokens": 2115881228.0, + "step": 4139 + }, + { + "epoch": 1.1195240670632773, + "grad_norm": 1.2794468402862549, + "learning_rate": 1.8283931855228646e-05, + "loss": 1.9212, + "mean_token_accuracy": 0.5766587257385254, + "num_tokens": 2116405408.0, + "step": 4140 + }, + { + "epoch": 1.119794483504597, + "grad_norm": 0.7498619556427002, + "learning_rate": 1.8283005776731833e-05, + "loss": 1.1233, + "mean_token_accuracy": 0.6980754137039185, + "num_tokens": 2116906983.0, + "step": 4141 + }, + { + "epoch": 1.1200648999459166, + "grad_norm": 1.7392852306365967, + "learning_rate": 1.8282079474770988e-05, + "loss": 2.1896, + "mean_token_accuracy": 0.5198714733123779, + "num_tokens": 2117431258.0, + "step": 4142 + }, + { + "epoch": 1.1203353163872363, + "grad_norm": 1.57829749584198, + "learning_rate": 1.8281152949374527e-05, + "loss": 2.0283, + "mean_token_accuracy": 0.5567423701286316, + "num_tokens": 2117894995.0, + "step": 4143 + }, + { + "epoch": 1.120605732828556, + "grad_norm": 1.4070651531219482, + "learning_rate": 1.8280226200570886e-05, + "loss": 2.1551, + "mean_token_accuracy": 0.5218303799629211, + "num_tokens": 2118419224.0, + "step": 4144 + }, + { + "epoch": 1.1208761492698756, + "grad_norm": 13.283198356628418, + "learning_rate": 1.8279299228388492e-05, + "loss": 1.9841, + "mean_token_accuracy": 0.5798506736755371, + "num_tokens": 2118918827.0, + "step": 4145 + }, + { + "epoch": 1.1211465657111952, + "grad_norm": 2.9479944705963135, + "learning_rate": 1.827837203285579e-05, + "loss": 2.1668, + "mean_token_accuracy": 0.5040387511253357, + "num_tokens": 2119443079.0, + "step": 4146 + }, + { + "epoch": 1.1214169821525148, + "grad_norm": 2.3596794605255127, + "learning_rate": 1.827744461400123e-05, + "loss": 2.1691, + "mean_token_accuracy": 0.5282328128814697, + "num_tokens": 2119967278.0, + "step": 4147 + }, + { + "epoch": 1.1216873985938345, + "grad_norm": 1.4229950904846191, + "learning_rate": 1.8276516971853268e-05, + "loss": 2.0865, + "mean_token_accuracy": 0.5520826578140259, + "num_tokens": 2120416829.0, + "step": 4148 + }, + { + "epoch": 1.1219578150351541, + "grad_norm": 1.833031415939331, + "learning_rate": 1.8275589106440367e-05, + "loss": 2.0416, + "mean_token_accuracy": 0.5692459940910339, + "num_tokens": 2120877879.0, + "step": 4149 + }, + { + "epoch": 1.1222282314764738, + "grad_norm": 1.3245238065719604, + "learning_rate": 1.8274661017791e-05, + "loss": 2.065, + "mean_token_accuracy": 0.532504677772522, + "num_tokens": 2121402057.0, + "step": 4150 + }, + { + "epoch": 1.1224986479177934, + "grad_norm": 1.3915096521377563, + "learning_rate": 1.8273732705933635e-05, + "loss": 2.1127, + "mean_token_accuracy": 0.5292376279830933, + "num_tokens": 2121926335.0, + "step": 4151 + }, + { + "epoch": 1.122769064359113, + "grad_norm": 1.552597999572754, + "learning_rate": 1.8272804170896757e-05, + "loss": 2.093, + "mean_token_accuracy": 0.5338990688323975, + "num_tokens": 2122413721.0, + "step": 4152 + }, + { + "epoch": 1.1230394808004327, + "grad_norm": 1.4970824718475342, + "learning_rate": 1.827187541270886e-05, + "loss": 2.0116, + "mean_token_accuracy": 0.5712003707885742, + "num_tokens": 2122937903.0, + "step": 4153 + }, + { + "epoch": 1.1233098972417523, + "grad_norm": 1.4892911911010742, + "learning_rate": 1.8270946431398448e-05, + "loss": 2.0802, + "mean_token_accuracy": 0.5335183143615723, + "num_tokens": 2123462128.0, + "step": 4154 + }, + { + "epoch": 1.123580313683072, + "grad_norm": 1.4287307262420654, + "learning_rate": 1.827001722699401e-05, + "loss": 2.0285, + "mean_token_accuracy": 0.5565299987792969, + "num_tokens": 2123936375.0, + "step": 4155 + }, + { + "epoch": 1.1238507301243916, + "grad_norm": 1.3459023237228394, + "learning_rate": 1.8269087799524064e-05, + "loss": 2.1255, + "mean_token_accuracy": 0.5439181923866272, + "num_tokens": 2124455097.0, + "step": 4156 + }, + { + "epoch": 1.1241211465657113, + "grad_norm": 1.2204654216766357, + "learning_rate": 1.826815814901713e-05, + "loss": 1.8242, + "mean_token_accuracy": 0.5847766995429993, + "num_tokens": 2124979139.0, + "step": 4157 + }, + { + "epoch": 1.124391563007031, + "grad_norm": 1.2269346714019775, + "learning_rate": 1.826722827550173e-05, + "loss": 1.963, + "mean_token_accuracy": 0.5499211549758911, + "num_tokens": 2125503314.0, + "step": 4158 + }, + { + "epoch": 1.1246619794483506, + "grad_norm": 1.2099132537841797, + "learning_rate": 1.826629817900639e-05, + "loss": 1.973, + "mean_token_accuracy": 0.540285587310791, + "num_tokens": 2126021157.0, + "step": 4159 + }, + { + "epoch": 1.1249323958896702, + "grad_norm": 1.265227198600769, + "learning_rate": 1.826536785955966e-05, + "loss": 2.1192, + "mean_token_accuracy": 0.5136024355888367, + "num_tokens": 2126545336.0, + "step": 4160 + }, + { + "epoch": 1.1252028123309898, + "grad_norm": 0.7919343709945679, + "learning_rate": 1.826443731719007e-05, + "loss": 1.1244, + "mean_token_accuracy": 0.7024986147880554, + "num_tokens": 2127010341.0, + "step": 4161 + }, + { + "epoch": 1.1254732287723093, + "grad_norm": 2.2395050525665283, + "learning_rate": 1.8263506551926186e-05, + "loss": 2.0997, + "mean_token_accuracy": 0.5292394161224365, + "num_tokens": 2127512667.0, + "step": 4162 + }, + { + "epoch": 1.125743645213629, + "grad_norm": 1.6267013549804688, + "learning_rate": 1.826257556379656e-05, + "loss": 2.0712, + "mean_token_accuracy": 0.534805178642273, + "num_tokens": 2128036840.0, + "step": 4163 + }, + { + "epoch": 1.1260140616549486, + "grad_norm": 1.394391655921936, + "learning_rate": 1.8261644352829757e-05, + "loss": 2.1443, + "mean_token_accuracy": 0.5225342512130737, + "num_tokens": 2128560978.0, + "step": 4164 + }, + { + "epoch": 1.1262844780962682, + "grad_norm": 1.4926975965499878, + "learning_rate": 1.826071291905435e-05, + "loss": 1.9516, + "mean_token_accuracy": 0.5520485639572144, + "num_tokens": 2129073113.0, + "step": 4165 + }, + { + "epoch": 1.1265548945375878, + "grad_norm": 1.6684291362762451, + "learning_rate": 1.8259781262498918e-05, + "loss": 2.1083, + "mean_token_accuracy": 0.5403068661689758, + "num_tokens": 2129597378.0, + "step": 4166 + }, + { + "epoch": 1.1268253109789075, + "grad_norm": 1.235256314277649, + "learning_rate": 1.8258849383192047e-05, + "loss": 1.9649, + "mean_token_accuracy": 0.546950101852417, + "num_tokens": 2130121412.0, + "step": 4167 + }, + { + "epoch": 1.1270957274202271, + "grad_norm": 1.2222795486450195, + "learning_rate": 1.825791728116233e-05, + "loss": 2.043, + "mean_token_accuracy": 0.5377896428108215, + "num_tokens": 2130645598.0, + "step": 4168 + }, + { + "epoch": 1.1273661438615468, + "grad_norm": 1.2259619235992432, + "learning_rate": 1.8256984956438367e-05, + "loss": 1.9877, + "mean_token_accuracy": 0.5653839111328125, + "num_tokens": 2131135309.0, + "step": 4169 + }, + { + "epoch": 1.1276365603028664, + "grad_norm": 1.3843523263931274, + "learning_rate": 1.8256052409048763e-05, + "loss": 2.0202, + "mean_token_accuracy": 0.5532740354537964, + "num_tokens": 2131659489.0, + "step": 4170 + }, + { + "epoch": 1.127906976744186, + "grad_norm": 1.3263351917266846, + "learning_rate": 1.8255119639022133e-05, + "loss": 2.1751, + "mean_token_accuracy": 0.5276479721069336, + "num_tokens": 2132183684.0, + "step": 4171 + }, + { + "epoch": 1.1281773931855057, + "grad_norm": 1.3133864402770996, + "learning_rate": 1.82541866463871e-05, + "loss": 2.1651, + "mean_token_accuracy": 0.51167893409729, + "num_tokens": 2132707860.0, + "step": 4172 + }, + { + "epoch": 1.1284478096268253, + "grad_norm": 2.269648313522339, + "learning_rate": 1.8253253431172284e-05, + "loss": 1.9841, + "mean_token_accuracy": 0.5838035345077515, + "num_tokens": 2133171088.0, + "step": 4173 + }, + { + "epoch": 1.128718226068145, + "grad_norm": 1.4935652017593384, + "learning_rate": 1.8252319993406322e-05, + "loss": 2.0462, + "mean_token_accuracy": 0.5369556546211243, + "num_tokens": 2133695249.0, + "step": 4174 + }, + { + "epoch": 1.1289886425094646, + "grad_norm": 1.4037187099456787, + "learning_rate": 1.8251386333117856e-05, + "loss": 2.0663, + "mean_token_accuracy": 0.5448148250579834, + "num_tokens": 2134200133.0, + "step": 4175 + }, + { + "epoch": 1.1292590589507843, + "grad_norm": 1.284981369972229, + "learning_rate": 1.8250452450335535e-05, + "loss": 2.0512, + "mean_token_accuracy": 0.5426493883132935, + "num_tokens": 2134672534.0, + "step": 4176 + }, + { + "epoch": 1.129529475392104, + "grad_norm": 1.4356341361999512, + "learning_rate": 1.8249518345088004e-05, + "loss": 2.0696, + "mean_token_accuracy": 0.5344012975692749, + "num_tokens": 2135196668.0, + "step": 4177 + }, + { + "epoch": 1.1297998918334236, + "grad_norm": 1.6243528127670288, + "learning_rate": 1.8248584017403935e-05, + "loss": 2.2218, + "mean_token_accuracy": 0.5443171858787537, + "num_tokens": 2135720847.0, + "step": 4178 + }, + { + "epoch": 1.1300703082747432, + "grad_norm": 1.5655020475387573, + "learning_rate": 1.824764946731199e-05, + "loss": 1.995, + "mean_token_accuracy": 0.5588862895965576, + "num_tokens": 2136202908.0, + "step": 4179 + }, + { + "epoch": 1.1303407247160626, + "grad_norm": 1.3348119258880615, + "learning_rate": 1.8246714694840847e-05, + "loss": 2.0541, + "mean_token_accuracy": 0.5246476531028748, + "num_tokens": 2136727093.0, + "step": 4180 + }, + { + "epoch": 1.1306111411573823, + "grad_norm": 0.720305860042572, + "learning_rate": 1.8245779700019188e-05, + "loss": 1.2317, + "mean_token_accuracy": 0.6781693696975708, + "num_tokens": 2137251305.0, + "step": 4181 + }, + { + "epoch": 1.130881557598702, + "grad_norm": 2.2025368213653564, + "learning_rate": 1.8244844482875698e-05, + "loss": 2.05, + "mean_token_accuracy": 0.5562960505485535, + "num_tokens": 2137736199.0, + "step": 4182 + }, + { + "epoch": 1.1311519740400215, + "grad_norm": 2.1832854747772217, + "learning_rate": 1.824390904343907e-05, + "loss": 2.1502, + "mean_token_accuracy": 0.5175588130950928, + "num_tokens": 2138211819.0, + "step": 4183 + }, + { + "epoch": 1.1314223904813412, + "grad_norm": 1.3960195779800415, + "learning_rate": 1.824297338173801e-05, + "loss": 1.9995, + "mean_token_accuracy": 0.5455015301704407, + "num_tokens": 2138736058.0, + "step": 4184 + }, + { + "epoch": 1.1316928069226608, + "grad_norm": 1.6027065515518188, + "learning_rate": 1.824203749780123e-05, + "loss": 2.0258, + "mean_token_accuracy": 0.5512420535087585, + "num_tokens": 2139251801.0, + "step": 4185 + }, + { + "epoch": 1.1319632233639805, + "grad_norm": 1.7689621448516846, + "learning_rate": 1.8241101391657443e-05, + "loss": 2.0287, + "mean_token_accuracy": 0.549083948135376, + "num_tokens": 2139775881.0, + "step": 4186 + }, + { + "epoch": 1.1322336398053001, + "grad_norm": 1.3725922107696533, + "learning_rate": 1.8240165063335367e-05, + "loss": 2.0198, + "mean_token_accuracy": 0.5373750925064087, + "num_tokens": 2140300051.0, + "step": 4187 + }, + { + "epoch": 1.1325040562466198, + "grad_norm": 1.4375927448272705, + "learning_rate": 1.8239228512863735e-05, + "loss": 2.1752, + "mean_token_accuracy": 0.5175491571426392, + "num_tokens": 2140799367.0, + "step": 4188 + }, + { + "epoch": 1.1327744726879394, + "grad_norm": 1.5013335943222046, + "learning_rate": 1.8238291740271285e-05, + "loss": 1.9533, + "mean_token_accuracy": 0.5444902181625366, + "num_tokens": 2141323609.0, + "step": 4189 + }, + { + "epoch": 1.133044889129259, + "grad_norm": 1.1547300815582275, + "learning_rate": 1.823735474558676e-05, + "loss": 2.068, + "mean_token_accuracy": 0.5203713774681091, + "num_tokens": 2141847840.0, + "step": 4190 + }, + { + "epoch": 1.1333153055705787, + "grad_norm": 1.299707055091858, + "learning_rate": 1.8236417528838902e-05, + "loss": 2.1442, + "mean_token_accuracy": 0.5392428040504456, + "num_tokens": 2142372012.0, + "step": 4191 + }, + { + "epoch": 1.1335857220118983, + "grad_norm": 1.381458044052124, + "learning_rate": 1.8235480090056476e-05, + "loss": 2.0359, + "mean_token_accuracy": 0.5361989736557007, + "num_tokens": 2142896278.0, + "step": 4192 + }, + { + "epoch": 1.133856138453218, + "grad_norm": 1.4356186389923096, + "learning_rate": 1.8234542429268245e-05, + "loss": 2.1969, + "mean_token_accuracy": 0.5245218276977539, + "num_tokens": 2143420556.0, + "step": 4193 + }, + { + "epoch": 1.1341265548945376, + "grad_norm": 1.171653151512146, + "learning_rate": 1.8233604546502973e-05, + "loss": 2.0706, + "mean_token_accuracy": 0.5180974006652832, + "num_tokens": 2143944841.0, + "step": 4194 + }, + { + "epoch": 1.1343969713358573, + "grad_norm": 1.2252393960952759, + "learning_rate": 1.8232666441789444e-05, + "loss": 2.0207, + "mean_token_accuracy": 0.5371934175491333, + "num_tokens": 2144469079.0, + "step": 4195 + }, + { + "epoch": 1.134667387777177, + "grad_norm": 1.2941162586212158, + "learning_rate": 1.823172811515644e-05, + "loss": 2.215, + "mean_token_accuracy": 0.5139814019203186, + "num_tokens": 2144949000.0, + "step": 4196 + }, + { + "epoch": 1.1349378042184965, + "grad_norm": 1.336147665977478, + "learning_rate": 1.8230789566632744e-05, + "loss": 2.1727, + "mean_token_accuracy": 0.5254440307617188, + "num_tokens": 2145473163.0, + "step": 4197 + }, + { + "epoch": 1.1352082206598162, + "grad_norm": 1.1090879440307617, + "learning_rate": 1.8229850796247165e-05, + "loss": 1.9323, + "mean_token_accuracy": 0.5637364387512207, + "num_tokens": 2145960322.0, + "step": 4198 + }, + { + "epoch": 1.1354786371011358, + "grad_norm": 1.4683185815811157, + "learning_rate": 1.82289118040285e-05, + "loss": 2.0806, + "mean_token_accuracy": 0.5472371578216553, + "num_tokens": 2146439830.0, + "step": 4199 + }, + { + "epoch": 1.1357490535424555, + "grad_norm": 1.465862512588501, + "learning_rate": 1.822797259000556e-05, + "loss": 2.0548, + "mean_token_accuracy": 0.5448471307754517, + "num_tokens": 2146964067.0, + "step": 4200 + }, + { + "epoch": 1.1360194699837751, + "grad_norm": 0.7603212594985962, + "learning_rate": 1.8227033154207166e-05, + "loss": 1.1219, + "mean_token_accuracy": 0.7088892459869385, + "num_tokens": 2147488331.0, + "step": 4201 + }, + { + "epoch": 1.1362898864250948, + "grad_norm": 1.8165419101715088, + "learning_rate": 1.822609349666215e-05, + "loss": 2.04, + "mean_token_accuracy": 0.548227071762085, + "num_tokens": 2148012612.0, + "step": 4202 + }, + { + "epoch": 1.1365603028664142, + "grad_norm": 1.3027381896972656, + "learning_rate": 1.8225153617399325e-05, + "loss": 1.9784, + "mean_token_accuracy": 0.555211067199707, + "num_tokens": 2148536791.0, + "step": 4203 + }, + { + "epoch": 1.1368307193077338, + "grad_norm": 1.3747704029083252, + "learning_rate": 1.822421351644754e-05, + "loss": 1.7542, + "mean_token_accuracy": 0.5686135292053223, + "num_tokens": 2149061012.0, + "step": 4204 + }, + { + "epoch": 1.1371011357490535, + "grad_norm": 1.4021707773208618, + "learning_rate": 1.822327319383564e-05, + "loss": 1.9377, + "mean_token_accuracy": 0.5475569367408752, + "num_tokens": 2149585169.0, + "step": 4205 + }, + { + "epoch": 1.1373715521903731, + "grad_norm": 1.4097542762756348, + "learning_rate": 1.822233264959248e-05, + "loss": 2.0789, + "mean_token_accuracy": 0.5680193901062012, + "num_tokens": 2150109275.0, + "step": 4206 + }, + { + "epoch": 1.1376419686316928, + "grad_norm": 1.780828595161438, + "learning_rate": 1.822139188374691e-05, + "loss": 2.1642, + "mean_token_accuracy": 0.5172778367996216, + "num_tokens": 2150633525.0, + "step": 4207 + }, + { + "epoch": 1.1379123850730124, + "grad_norm": 1.43477201461792, + "learning_rate": 1.82204508963278e-05, + "loss": 1.9953, + "mean_token_accuracy": 0.5519353151321411, + "num_tokens": 2151157718.0, + "step": 4208 + }, + { + "epoch": 1.138182801514332, + "grad_norm": 1.2322707176208496, + "learning_rate": 1.8219509687364027e-05, + "loss": 1.9903, + "mean_token_accuracy": 0.5590257048606873, + "num_tokens": 2151681883.0, + "step": 4209 + }, + { + "epoch": 1.1384532179556517, + "grad_norm": 1.5871975421905518, + "learning_rate": 1.8218568256884467e-05, + "loss": 2.1846, + "mean_token_accuracy": 0.5392849445343018, + "num_tokens": 2152165765.0, + "step": 4210 + }, + { + "epoch": 1.1387236343969713, + "grad_norm": 1.5007579326629639, + "learning_rate": 1.8217626604918e-05, + "loss": 2.0172, + "mean_token_accuracy": 0.5455468893051147, + "num_tokens": 2152689955.0, + "step": 4211 + }, + { + "epoch": 1.138994050838291, + "grad_norm": 1.2264798879623413, + "learning_rate": 1.821668473149352e-05, + "loss": 1.9576, + "mean_token_accuracy": 0.5747092366218567, + "num_tokens": 2153090160.0, + "step": 4212 + }, + { + "epoch": 1.1392644672796106, + "grad_norm": 1.2770909070968628, + "learning_rate": 1.821574263663994e-05, + "loss": 2.0433, + "mean_token_accuracy": 0.5505574345588684, + "num_tokens": 2153614362.0, + "step": 4213 + }, + { + "epoch": 1.1395348837209303, + "grad_norm": 1.264281988143921, + "learning_rate": 1.8214800320386147e-05, + "loss": 1.8927, + "mean_token_accuracy": 0.5676249265670776, + "num_tokens": 2154138631.0, + "step": 4214 + }, + { + "epoch": 1.13980530016225, + "grad_norm": 1.4295597076416016, + "learning_rate": 1.8213857782761064e-05, + "loss": 1.9346, + "mean_token_accuracy": 0.5723815560340881, + "num_tokens": 2154662810.0, + "step": 4215 + }, + { + "epoch": 1.1400757166035695, + "grad_norm": 1.5333088636398315, + "learning_rate": 1.821291502379361e-05, + "loss": 2.1648, + "mean_token_accuracy": 0.5350382328033447, + "num_tokens": 2155187072.0, + "step": 4216 + }, + { + "epoch": 1.1403461330448892, + "grad_norm": 1.5785117149353027, + "learning_rate": 1.8211972043512714e-05, + "loss": 1.9454, + "mean_token_accuracy": 0.5499578714370728, + "num_tokens": 2155698551.0, + "step": 4217 + }, + { + "epoch": 1.1406165494862088, + "grad_norm": 1.5983105897903442, + "learning_rate": 1.8211028841947306e-05, + "loss": 2.0547, + "mean_token_accuracy": 0.5471625328063965, + "num_tokens": 2156190299.0, + "step": 4218 + }, + { + "epoch": 1.1408869659275285, + "grad_norm": 1.605289101600647, + "learning_rate": 1.8210085419126325e-05, + "loss": 2.083, + "mean_token_accuracy": 0.5285470485687256, + "num_tokens": 2156714573.0, + "step": 4219 + }, + { + "epoch": 1.1411573823688481, + "grad_norm": 1.4045166969299316, + "learning_rate": 1.8209141775078722e-05, + "loss": 2.0669, + "mean_token_accuracy": 0.5367308259010315, + "num_tokens": 2157238799.0, + "step": 4220 + }, + { + "epoch": 1.1414277988101675, + "grad_norm": 0.8599770069122314, + "learning_rate": 1.8208197909833445e-05, + "loss": 1.2537, + "mean_token_accuracy": 0.6766263246536255, + "num_tokens": 2157699692.0, + "step": 4221 + }, + { + "epoch": 1.1416982152514872, + "grad_norm": 2.5592241287231445, + "learning_rate": 1.820725382341946e-05, + "loss": 2.2176, + "mean_token_accuracy": 0.5231970548629761, + "num_tokens": 2158223936.0, + "step": 4222 + }, + { + "epoch": 1.1419686316928068, + "grad_norm": 1.7350811958312988, + "learning_rate": 1.820630951586573e-05, + "loss": 2.1565, + "mean_token_accuracy": 0.5131815075874329, + "num_tokens": 2158748203.0, + "step": 4223 + }, + { + "epoch": 1.1422390481341265, + "grad_norm": 1.6269307136535645, + "learning_rate": 1.820536498720124e-05, + "loss": 2.1129, + "mean_token_accuracy": 0.542884349822998, + "num_tokens": 2159272389.0, + "step": 4224 + }, + { + "epoch": 1.142509464575446, + "grad_norm": 1.2286138534545898, + "learning_rate": 1.8204420237454956e-05, + "loss": 2.0604, + "mean_token_accuracy": 0.5314990282058716, + "num_tokens": 2159796651.0, + "step": 4225 + }, + { + "epoch": 1.1427798810167658, + "grad_norm": 1.4086830615997314, + "learning_rate": 1.8203475266655873e-05, + "loss": 2.0024, + "mean_token_accuracy": 0.5366151928901672, + "num_tokens": 2160320848.0, + "step": 4226 + }, + { + "epoch": 1.1430502974580854, + "grad_norm": 1.4716520309448242, + "learning_rate": 1.820253007483299e-05, + "loss": 2.0959, + "mean_token_accuracy": 0.5390151739120483, + "num_tokens": 2160845129.0, + "step": 4227 + }, + { + "epoch": 1.143320713899405, + "grad_norm": 1.4154369831085205, + "learning_rate": 1.8201584662015295e-05, + "loss": 2.0013, + "mean_token_accuracy": 0.521784245967865, + "num_tokens": 2161369328.0, + "step": 4228 + }, + { + "epoch": 1.1435911303407247, + "grad_norm": 1.3486168384552002, + "learning_rate": 1.8200639028231805e-05, + "loss": 2.1169, + "mean_token_accuracy": 0.5120943188667297, + "num_tokens": 2161893511.0, + "step": 4229 + }, + { + "epoch": 1.1438615467820443, + "grad_norm": 1.485815167427063, + "learning_rate": 1.819969317351154e-05, + "loss": 2.0804, + "mean_token_accuracy": 0.5396894216537476, + "num_tokens": 2162417688.0, + "step": 4230 + }, + { + "epoch": 1.144131963223364, + "grad_norm": 1.4314810037612915, + "learning_rate": 1.819874709788351e-05, + "loss": 2.0275, + "mean_token_accuracy": 0.5423275232315063, + "num_tokens": 2162941948.0, + "step": 4231 + }, + { + "epoch": 1.1444023796646836, + "grad_norm": 2.0512378215789795, + "learning_rate": 1.819780080137675e-05, + "loss": 1.8482, + "mean_token_accuracy": 0.6046116948127747, + "num_tokens": 2163384312.0, + "step": 4232 + }, + { + "epoch": 1.1446727961060033, + "grad_norm": 1.684426188468933, + "learning_rate": 1.8196854284020294e-05, + "loss": 1.9677, + "mean_token_accuracy": 0.5805688500404358, + "num_tokens": 2163908573.0, + "step": 4233 + }, + { + "epoch": 1.144943212547323, + "grad_norm": 1.4208321571350098, + "learning_rate": 1.819590754584319e-05, + "loss": 1.9109, + "mean_token_accuracy": 0.565772533416748, + "num_tokens": 2164432832.0, + "step": 4234 + }, + { + "epoch": 1.1452136289886425, + "grad_norm": 1.1894924640655518, + "learning_rate": 1.8194960586874474e-05, + "loss": 1.8684, + "mean_token_accuracy": 0.5506371855735779, + "num_tokens": 2164956999.0, + "step": 4235 + }, + { + "epoch": 1.1454840454299622, + "grad_norm": 1.6757227182388306, + "learning_rate": 1.8194013407143206e-05, + "loss": 2.1585, + "mean_token_accuracy": 0.5174030065536499, + "num_tokens": 2165481274.0, + "step": 4236 + }, + { + "epoch": 1.1457544618712818, + "grad_norm": 1.554449439048767, + "learning_rate": 1.8193066006678456e-05, + "loss": 2.0202, + "mean_token_accuracy": 0.541449785232544, + "num_tokens": 2166005548.0, + "step": 4237 + }, + { + "epoch": 1.1460248783126015, + "grad_norm": 1.4514881372451782, + "learning_rate": 1.8192118385509286e-05, + "loss": 2.216, + "mean_token_accuracy": 0.5134961605072021, + "num_tokens": 2166529778.0, + "step": 4238 + }, + { + "epoch": 1.1462952947539211, + "grad_norm": 1.2894448041915894, + "learning_rate": 1.8191170543664774e-05, + "loss": 1.919, + "mean_token_accuracy": 0.5759282112121582, + "num_tokens": 2167053888.0, + "step": 4239 + }, + { + "epoch": 1.1465657111952408, + "grad_norm": 1.999568223953247, + "learning_rate": 1.8190222481174e-05, + "loss": 1.9532, + "mean_token_accuracy": 0.5263732671737671, + "num_tokens": 2167570828.0, + "step": 4240 + }, + { + "epoch": 1.1468361276365604, + "grad_norm": 0.6769770979881287, + "learning_rate": 1.8189274198066055e-05, + "loss": 1.1726, + "mean_token_accuracy": 0.6950985193252563, + "num_tokens": 2168038190.0, + "step": 4241 + }, + { + "epoch": 1.14710654407788, + "grad_norm": 3.008965492248535, + "learning_rate": 1.818832569437004e-05, + "loss": 2.0607, + "mean_token_accuracy": 0.5394721627235413, + "num_tokens": 2168562434.0, + "step": 4242 + }, + { + "epoch": 1.1473769605191997, + "grad_norm": 2.8235666751861572, + "learning_rate": 1.8187376970115046e-05, + "loss": 2.109, + "mean_token_accuracy": 0.5163032412528992, + "num_tokens": 2169086602.0, + "step": 4243 + }, + { + "epoch": 1.147647376960519, + "grad_norm": 1.6821826696395874, + "learning_rate": 1.8186428025330198e-05, + "loss": 2.0917, + "mean_token_accuracy": 0.5153049230575562, + "num_tokens": 2169610793.0, + "step": 4244 + }, + { + "epoch": 1.1479177934018387, + "grad_norm": 1.626758337020874, + "learning_rate": 1.8185478860044597e-05, + "loss": 2.0032, + "mean_token_accuracy": 0.5634039640426636, + "num_tokens": 2170101700.0, + "step": 4245 + }, + { + "epoch": 1.1481882098431584, + "grad_norm": 2.2275443077087402, + "learning_rate": 1.818452947428738e-05, + "loss": 2.081, + "mean_token_accuracy": 0.5440109968185425, + "num_tokens": 2170609439.0, + "step": 4246 + }, + { + "epoch": 1.148458626284478, + "grad_norm": 2.2496585845947266, + "learning_rate": 1.8183579868087665e-05, + "loss": 2.1615, + "mean_token_accuracy": 0.5270732641220093, + "num_tokens": 2171082160.0, + "step": 4247 + }, + { + "epoch": 1.1487290427257977, + "grad_norm": 1.8612021207809448, + "learning_rate": 1.8182630041474595e-05, + "loss": 2.1356, + "mean_token_accuracy": 0.5405287742614746, + "num_tokens": 2171606328.0, + "step": 4248 + }, + { + "epoch": 1.1489994591671173, + "grad_norm": 1.5618929862976074, + "learning_rate": 1.818167999447731e-05, + "loss": 2.1168, + "mean_token_accuracy": 0.5326849222183228, + "num_tokens": 2172130566.0, + "step": 4249 + }, + { + "epoch": 1.149269875608437, + "grad_norm": 1.6663317680358887, + "learning_rate": 1.8180729727124967e-05, + "loss": 2.1293, + "mean_token_accuracy": 0.5309969782829285, + "num_tokens": 2172654813.0, + "step": 4250 + }, + { + "epoch": 1.1495402920497566, + "grad_norm": 1.3148366212844849, + "learning_rate": 1.817977923944672e-05, + "loss": 2.1149, + "mean_token_accuracy": 0.5187883377075195, + "num_tokens": 2173169967.0, + "step": 4251 + }, + { + "epoch": 1.1498107084910762, + "grad_norm": 1.6736385822296143, + "learning_rate": 1.8178828531471727e-05, + "loss": 2.0916, + "mean_token_accuracy": 0.5532113313674927, + "num_tokens": 2173579579.0, + "step": 4252 + }, + { + "epoch": 1.150081124932396, + "grad_norm": 1.4295551776885986, + "learning_rate": 1.8177877603229165e-05, + "loss": 1.9491, + "mean_token_accuracy": 0.5676221251487732, + "num_tokens": 2174040424.0, + "step": 4253 + }, + { + "epoch": 1.1503515413737155, + "grad_norm": 1.3839755058288574, + "learning_rate": 1.817692645474821e-05, + "loss": 2.0843, + "mean_token_accuracy": 0.5207719802856445, + "num_tokens": 2174564475.0, + "step": 4254 + }, + { + "epoch": 1.1506219578150352, + "grad_norm": 1.6980935335159302, + "learning_rate": 1.8175975086058048e-05, + "loss": 2.1643, + "mean_token_accuracy": 0.5335301756858826, + "num_tokens": 2175088748.0, + "step": 4255 + }, + { + "epoch": 1.1508923742563548, + "grad_norm": 1.331419825553894, + "learning_rate": 1.8175023497187863e-05, + "loss": 2.0833, + "mean_token_accuracy": 0.5362581014633179, + "num_tokens": 2175612984.0, + "step": 4256 + }, + { + "epoch": 1.1511627906976745, + "grad_norm": 1.5696195363998413, + "learning_rate": 1.817407168816686e-05, + "loss": 1.9931, + "mean_token_accuracy": 0.5651775002479553, + "num_tokens": 2176113126.0, + "step": 4257 + }, + { + "epoch": 1.151433207138994, + "grad_norm": 1.2758833169937134, + "learning_rate": 1.8173119659024236e-05, + "loss": 2.1178, + "mean_token_accuracy": 0.5249466896057129, + "num_tokens": 2176637339.0, + "step": 4258 + }, + { + "epoch": 1.1517036235803138, + "grad_norm": 1.271433711051941, + "learning_rate": 1.8172167409789212e-05, + "loss": 2.023, + "mean_token_accuracy": 0.5265621542930603, + "num_tokens": 2177161507.0, + "step": 4259 + }, + { + "epoch": 1.1519740400216334, + "grad_norm": 1.3550610542297363, + "learning_rate": 1.8171214940490996e-05, + "loss": 2.1293, + "mean_token_accuracy": 0.5449485778808594, + "num_tokens": 2177685596.0, + "step": 4260 + }, + { + "epoch": 1.152244456462953, + "grad_norm": 14.694586753845215, + "learning_rate": 1.8170262251158815e-05, + "loss": 1.038, + "mean_token_accuracy": 0.7242065668106079, + "num_tokens": 2178209877.0, + "step": 4261 + }, + { + "epoch": 1.1525148729042725, + "grad_norm": 2.4361345767974854, + "learning_rate": 1.8169309341821905e-05, + "loss": 2.2293, + "mean_token_accuracy": 0.50185227394104, + "num_tokens": 2178734047.0, + "step": 4262 + }, + { + "epoch": 1.152785289345592, + "grad_norm": 1.8133517503738403, + "learning_rate": 1.81683562125095e-05, + "loss": 2.072, + "mean_token_accuracy": 0.5402184724807739, + "num_tokens": 2179258276.0, + "step": 4263 + }, + { + "epoch": 1.1530557057869117, + "grad_norm": 1.5485765933990479, + "learning_rate": 1.8167402863250844e-05, + "loss": 2.147, + "mean_token_accuracy": 0.5276668667793274, + "num_tokens": 2179782391.0, + "step": 4264 + }, + { + "epoch": 1.1533261222282314, + "grad_norm": 1.492993950843811, + "learning_rate": 1.816644929407519e-05, + "loss": 2.0104, + "mean_token_accuracy": 0.5375990271568298, + "num_tokens": 2180289005.0, + "step": 4265 + }, + { + "epoch": 1.153596538669551, + "grad_norm": 1.6261544227600098, + "learning_rate": 1.8165495505011798e-05, + "loss": 2.0104, + "mean_token_accuracy": 0.5560047626495361, + "num_tokens": 2180758708.0, + "step": 4266 + }, + { + "epoch": 1.1538669551108707, + "grad_norm": 1.4854531288146973, + "learning_rate": 1.8164541496089932e-05, + "loss": 2.0157, + "mean_token_accuracy": 0.5488722324371338, + "num_tokens": 2181266005.0, + "step": 4267 + }, + { + "epoch": 1.1541373715521903, + "grad_norm": 1.524102807044983, + "learning_rate": 1.8163587267338866e-05, + "loss": 2.0979, + "mean_token_accuracy": 0.524394690990448, + "num_tokens": 2181790272.0, + "step": 4268 + }, + { + "epoch": 1.15440778799351, + "grad_norm": 1.6256732940673828, + "learning_rate": 1.816263281878787e-05, + "loss": 2.0797, + "mean_token_accuracy": 0.5491427183151245, + "num_tokens": 2182314542.0, + "step": 4269 + }, + { + "epoch": 1.1546782044348296, + "grad_norm": 1.3157309293746948, + "learning_rate": 1.8161678150466237e-05, + "loss": 2.1752, + "mean_token_accuracy": 0.53321772813797, + "num_tokens": 2182802090.0, + "step": 4270 + }, + { + "epoch": 1.1549486208761492, + "grad_norm": 1.2663347721099854, + "learning_rate": 1.816072326240326e-05, + "loss": 1.9375, + "mean_token_accuracy": 0.5432178974151611, + "num_tokens": 2183326223.0, + "step": 4271 + }, + { + "epoch": 1.1552190373174689, + "grad_norm": 1.2926385402679443, + "learning_rate": 1.815976815462823e-05, + "loss": 2.0893, + "mean_token_accuracy": 0.5361834764480591, + "num_tokens": 2183850387.0, + "step": 4272 + }, + { + "epoch": 1.1554894537587885, + "grad_norm": 1.4498810768127441, + "learning_rate": 1.815881282717046e-05, + "loss": 1.7718, + "mean_token_accuracy": 0.5657472014427185, + "num_tokens": 2184374504.0, + "step": 4273 + }, + { + "epoch": 1.1557598702001082, + "grad_norm": 1.506795883178711, + "learning_rate": 1.8157857280059264e-05, + "loss": 2.1462, + "mean_token_accuracy": 0.5060739517211914, + "num_tokens": 2184898695.0, + "step": 4274 + }, + { + "epoch": 1.1560302866414278, + "grad_norm": 1.263825535774231, + "learning_rate": 1.8156901513323953e-05, + "loss": 2.045, + "mean_token_accuracy": 0.5436033010482788, + "num_tokens": 2185422807.0, + "step": 4275 + }, + { + "epoch": 1.1563007030827475, + "grad_norm": 1.2213257551193237, + "learning_rate": 1.8155945526993854e-05, + "loss": 1.9507, + "mean_token_accuracy": 0.5503370761871338, + "num_tokens": 2185925836.0, + "step": 4276 + }, + { + "epoch": 1.156571119524067, + "grad_norm": 1.5102182626724243, + "learning_rate": 1.8154989321098306e-05, + "loss": 2.0401, + "mean_token_accuracy": 0.541587233543396, + "num_tokens": 2186389543.0, + "step": 4277 + }, + { + "epoch": 1.1568415359653867, + "grad_norm": 1.3519799709320068, + "learning_rate": 1.815403289566664e-05, + "loss": 1.9686, + "mean_token_accuracy": 0.5493588447570801, + "num_tokens": 2186913773.0, + "step": 4278 + }, + { + "epoch": 1.1571119524067064, + "grad_norm": 1.7161591053009033, + "learning_rate": 1.815307625072821e-05, + "loss": 2.2422, + "mean_token_accuracy": 0.5185384154319763, + "num_tokens": 2187397324.0, + "step": 4279 + }, + { + "epoch": 1.157382368848026, + "grad_norm": 1.3193094730377197, + "learning_rate": 1.8152119386312365e-05, + "loss": 2.0453, + "mean_token_accuracy": 0.526337206363678, + "num_tokens": 2187921541.0, + "step": 4280 + }, + { + "epoch": 1.1576527852893457, + "grad_norm": 0.7322918772697449, + "learning_rate": 1.815116230244846e-05, + "loss": 1.0873, + "mean_token_accuracy": 0.7046130299568176, + "num_tokens": 2188424164.0, + "step": 4281 + }, + { + "epoch": 1.1579232017306653, + "grad_norm": 1.8100026845932007, + "learning_rate": 1.815020499916587e-05, + "loss": 2.0996, + "mean_token_accuracy": 0.5352604389190674, + "num_tokens": 2188948371.0, + "step": 4282 + }, + { + "epoch": 1.158193618171985, + "grad_norm": 1.4522532224655151, + "learning_rate": 1.814924747649396e-05, + "loss": 1.9514, + "mean_token_accuracy": 0.5573694109916687, + "num_tokens": 2189465114.0, + "step": 4283 + }, + { + "epoch": 1.1584640346133046, + "grad_norm": 1.8796426057815552, + "learning_rate": 1.8148289734462113e-05, + "loss": 2.1162, + "mean_token_accuracy": 0.5421423316001892, + "num_tokens": 2189989388.0, + "step": 4284 + }, + { + "epoch": 1.158734451054624, + "grad_norm": 1.3737924098968506, + "learning_rate": 1.8147331773099717e-05, + "loss": 1.9718, + "mean_token_accuracy": 0.5565186738967896, + "num_tokens": 2190470583.0, + "step": 4285 + }, + { + "epoch": 1.1590048674959437, + "grad_norm": 1.68256413936615, + "learning_rate": 1.8146373592436164e-05, + "loss": 2.1011, + "mean_token_accuracy": 0.5385621190071106, + "num_tokens": 2190994771.0, + "step": 4286 + }, + { + "epoch": 1.1592752839372633, + "grad_norm": 1.4529645442962646, + "learning_rate": 1.814541519250085e-05, + "loss": 2.1821, + "mean_token_accuracy": 0.5195105671882629, + "num_tokens": 2191519011.0, + "step": 4287 + }, + { + "epoch": 1.159545700378583, + "grad_norm": 1.4964978694915771, + "learning_rate": 1.814445657332319e-05, + "loss": 2.1554, + "mean_token_accuracy": 0.4954982399940491, + "num_tokens": 2192043201.0, + "step": 4288 + }, + { + "epoch": 1.1598161168199026, + "grad_norm": 1.377467155456543, + "learning_rate": 1.8143497734932587e-05, + "loss": 2.0565, + "mean_token_accuracy": 0.5461221933364868, + "num_tokens": 2192563987.0, + "step": 4289 + }, + { + "epoch": 1.1600865332612222, + "grad_norm": 1.2577179670333862, + "learning_rate": 1.8142538677358467e-05, + "loss": 1.8218, + "mean_token_accuracy": 0.5798468589782715, + "num_tokens": 2193088256.0, + "step": 4290 + }, + { + "epoch": 1.1603569497025419, + "grad_norm": 1.7424060106277466, + "learning_rate": 1.8141579400630255e-05, + "loss": 2.2148, + "mean_token_accuracy": 0.5190640687942505, + "num_tokens": 2193612524.0, + "step": 4291 + }, + { + "epoch": 1.1606273661438615, + "grad_norm": 1.336326241493225, + "learning_rate": 1.8140619904777385e-05, + "loss": 1.9398, + "mean_token_accuracy": 0.5602896213531494, + "num_tokens": 2194085462.0, + "step": 4292 + }, + { + "epoch": 1.1608977825851812, + "grad_norm": 1.3981902599334717, + "learning_rate": 1.8139660189829298e-05, + "loss": 1.9419, + "mean_token_accuracy": 0.5666842460632324, + "num_tokens": 2194609633.0, + "step": 4293 + }, + { + "epoch": 1.1611681990265008, + "grad_norm": 1.498863935470581, + "learning_rate": 1.813870025581544e-05, + "loss": 2.1364, + "mean_token_accuracy": 0.5313587784767151, + "num_tokens": 2195133738.0, + "step": 4294 + }, + { + "epoch": 1.1614386154678205, + "grad_norm": 1.3633332252502441, + "learning_rate": 1.8137740102765263e-05, + "loss": 2.0373, + "mean_token_accuracy": 0.5534144639968872, + "num_tokens": 2195615221.0, + "step": 4295 + }, + { + "epoch": 1.16170903190914, + "grad_norm": 1.2721678018569946, + "learning_rate": 1.813677973070823e-05, + "loss": 2.2237, + "mean_token_accuracy": 0.5318041443824768, + "num_tokens": 2196088526.0, + "step": 4296 + }, + { + "epoch": 1.1619794483504597, + "grad_norm": 1.4215329885482788, + "learning_rate": 1.813581913967381e-05, + "loss": 1.9826, + "mean_token_accuracy": 0.5399317741394043, + "num_tokens": 2196612727.0, + "step": 4297 + }, + { + "epoch": 1.1622498647917794, + "grad_norm": 1.4599978923797607, + "learning_rate": 1.813485832969147e-05, + "loss": 2.003, + "mean_token_accuracy": 0.5552701354026794, + "num_tokens": 2197136887.0, + "step": 4298 + }, + { + "epoch": 1.162520281233099, + "grad_norm": 1.2405385971069336, + "learning_rate": 1.8133897300790694e-05, + "loss": 2.0359, + "mean_token_accuracy": 0.5438958406448364, + "num_tokens": 2197660980.0, + "step": 4299 + }, + { + "epoch": 1.1627906976744187, + "grad_norm": 1.2978190183639526, + "learning_rate": 1.8132936053000975e-05, + "loss": 1.8735, + "mean_token_accuracy": 0.5766580104827881, + "num_tokens": 2198185178.0, + "step": 4300 + }, + { + "epoch": 1.1630611141157383, + "grad_norm": 0.7603078484535217, + "learning_rate": 1.8131974586351796e-05, + "loss": 1.2609, + "mean_token_accuracy": 0.666498601436615, + "num_tokens": 2198709397.0, + "step": 4301 + }, + { + "epoch": 1.163331530557058, + "grad_norm": 1.6565070152282715, + "learning_rate": 1.813101290087266e-05, + "loss": 2.2004, + "mean_token_accuracy": 0.5226370096206665, + "num_tokens": 2199221513.0, + "step": 4302 + }, + { + "epoch": 1.1636019469983774, + "grad_norm": 1.6123679876327515, + "learning_rate": 1.813005099659308e-05, + "loss": 2.1943, + "mean_token_accuracy": 0.5246822237968445, + "num_tokens": 2199745770.0, + "step": 4303 + }, + { + "epoch": 1.163872363439697, + "grad_norm": 1.5414332151412964, + "learning_rate": 1.812908887354257e-05, + "loss": 1.9587, + "mean_token_accuracy": 0.5520599484443665, + "num_tokens": 2200231441.0, + "step": 4304 + }, + { + "epoch": 1.1641427798810167, + "grad_norm": 1.6238231658935547, + "learning_rate": 1.8128126531750646e-05, + "loss": 2.1404, + "mean_token_accuracy": 0.538088321685791, + "num_tokens": 2200711603.0, + "step": 4305 + }, + { + "epoch": 1.1644131963223363, + "grad_norm": 1.6383094787597656, + "learning_rate": 1.812716397124684e-05, + "loss": 2.0073, + "mean_token_accuracy": 0.5671195983886719, + "num_tokens": 2201235843.0, + "step": 4306 + }, + { + "epoch": 1.164683612763656, + "grad_norm": 1.3763341903686523, + "learning_rate": 1.812620119206068e-05, + "loss": 2.1005, + "mean_token_accuracy": 0.5284222364425659, + "num_tokens": 2201744198.0, + "step": 4307 + }, + { + "epoch": 1.1649540292049756, + "grad_norm": 1.3843075037002563, + "learning_rate": 1.812523819422171e-05, + "loss": 1.9807, + "mean_token_accuracy": 0.5488232970237732, + "num_tokens": 2202268435.0, + "step": 4308 + }, + { + "epoch": 1.1652244456462952, + "grad_norm": 1.4565232992172241, + "learning_rate": 1.8124274977759483e-05, + "loss": 2.0186, + "mean_token_accuracy": 0.5287148356437683, + "num_tokens": 2202792709.0, + "step": 4309 + }, + { + "epoch": 1.1654948620876149, + "grad_norm": 1.513627529144287, + "learning_rate": 1.8123311542703547e-05, + "loss": 2.0657, + "mean_token_accuracy": 0.5397190451622009, + "num_tokens": 2203314741.0, + "step": 4310 + }, + { + "epoch": 1.1657652785289345, + "grad_norm": 1.2095623016357422, + "learning_rate": 1.8122347889083464e-05, + "loss": 1.9362, + "mean_token_accuracy": 0.5661602020263672, + "num_tokens": 2203786026.0, + "step": 4311 + }, + { + "epoch": 1.1660356949702542, + "grad_norm": 1.401572823524475, + "learning_rate": 1.8121384016928802e-05, + "loss": 2.085, + "mean_token_accuracy": 0.5313704013824463, + "num_tokens": 2204300297.0, + "step": 4312 + }, + { + "epoch": 1.1663061114115738, + "grad_norm": 1.3896011114120483, + "learning_rate": 1.8120419926269134e-05, + "loss": 2.053, + "mean_token_accuracy": 0.5415281653404236, + "num_tokens": 2204824509.0, + "step": 4313 + }, + { + "epoch": 1.1665765278528935, + "grad_norm": 1.3052916526794434, + "learning_rate": 1.8119455617134045e-05, + "loss": 2.0471, + "mean_token_accuracy": 0.5493040084838867, + "num_tokens": 2205320842.0, + "step": 4314 + }, + { + "epoch": 1.166846944294213, + "grad_norm": 1.5895507335662842, + "learning_rate": 1.8118491089553122e-05, + "loss": 2.1341, + "mean_token_accuracy": 0.5182161331176758, + "num_tokens": 2205844993.0, + "step": 4315 + }, + { + "epoch": 1.1671173607355327, + "grad_norm": 1.2523198127746582, + "learning_rate": 1.8117526343555955e-05, + "loss": 2.0226, + "mean_token_accuracy": 0.5552588105201721, + "num_tokens": 2206369188.0, + "step": 4316 + }, + { + "epoch": 1.1673877771768524, + "grad_norm": 1.3910404443740845, + "learning_rate": 1.811656137917215e-05, + "loss": 1.9296, + "mean_token_accuracy": 0.5528690814971924, + "num_tokens": 2206893413.0, + "step": 4317 + }, + { + "epoch": 1.167658193618172, + "grad_norm": 1.361497163772583, + "learning_rate": 1.8115596196431317e-05, + "loss": 2.1162, + "mean_token_accuracy": 0.5334769487380981, + "num_tokens": 2207417626.0, + "step": 4318 + }, + { + "epoch": 1.1679286100594917, + "grad_norm": 1.2581504583358765, + "learning_rate": 1.811463079536306e-05, + "loss": 2.0436, + "mean_token_accuracy": 0.5516963005065918, + "num_tokens": 2207941825.0, + "step": 4319 + }, + { + "epoch": 1.1681990265008113, + "grad_norm": 1.2497844696044922, + "learning_rate": 1.811366517599701e-05, + "loss": 1.9894, + "mean_token_accuracy": 0.5355969071388245, + "num_tokens": 2208465994.0, + "step": 4320 + }, + { + "epoch": 1.168469442942131, + "grad_norm": 0.7875208258628845, + "learning_rate": 1.8112699338362792e-05, + "loss": 1.0979, + "mean_token_accuracy": 0.6960593461990356, + "num_tokens": 2208932314.0, + "step": 4321 + }, + { + "epoch": 1.1687398593834506, + "grad_norm": 2.1198365688323975, + "learning_rate": 1.8111733282490043e-05, + "loss": 2.1537, + "mean_token_accuracy": 0.5214512348175049, + "num_tokens": 2209456469.0, + "step": 4322 + }, + { + "epoch": 1.1690102758247702, + "grad_norm": 1.669802188873291, + "learning_rate": 1.8110767008408404e-05, + "loss": 2.0847, + "mean_token_accuracy": 0.5267136693000793, + "num_tokens": 2209980592.0, + "step": 4323 + }, + { + "epoch": 1.1692806922660899, + "grad_norm": 1.4148746728897095, + "learning_rate": 1.8109800516147523e-05, + "loss": 2.1201, + "mean_token_accuracy": 0.5457902550697327, + "num_tokens": 2210471655.0, + "step": 4324 + }, + { + "epoch": 1.1695511087074095, + "grad_norm": 1.6483049392700195, + "learning_rate": 1.8108833805737047e-05, + "loss": 2.1001, + "mean_token_accuracy": 0.5309326648712158, + "num_tokens": 2210995929.0, + "step": 4325 + }, + { + "epoch": 1.169821525148729, + "grad_norm": 1.50283682346344, + "learning_rate": 1.8107866877206646e-05, + "loss": 2.1627, + "mean_token_accuracy": 0.550194501876831, + "num_tokens": 2211455461.0, + "step": 4326 + }, + { + "epoch": 1.1700919415900486, + "grad_norm": 1.580929160118103, + "learning_rate": 1.810689973058599e-05, + "loss": 2.1371, + "mean_token_accuracy": 0.5391031503677368, + "num_tokens": 2211979646.0, + "step": 4327 + }, + { + "epoch": 1.1703623580313682, + "grad_norm": 1.5346275568008423, + "learning_rate": 1.810593236590475e-05, + "loss": 2.0288, + "mean_token_accuracy": 0.544450044631958, + "num_tokens": 2212503743.0, + "step": 4328 + }, + { + "epoch": 1.1706327744726879, + "grad_norm": 1.4980049133300781, + "learning_rate": 1.8104964783192608e-05, + "loss": 2.1226, + "mean_token_accuracy": 0.5364542007446289, + "num_tokens": 2213027988.0, + "step": 4329 + }, + { + "epoch": 1.1709031909140075, + "grad_norm": 1.105466365814209, + "learning_rate": 1.810399698247925e-05, + "loss": 2.0409, + "mean_token_accuracy": 0.5365906357765198, + "num_tokens": 2213552063.0, + "step": 4330 + }, + { + "epoch": 1.1711736073553272, + "grad_norm": 1.3188196420669556, + "learning_rate": 1.810302896379437e-05, + "loss": 1.9947, + "mean_token_accuracy": 0.5292645692825317, + "num_tokens": 2214076090.0, + "step": 4331 + }, + { + "epoch": 1.1714440237966468, + "grad_norm": 1.2004443407058716, + "learning_rate": 1.810206072716768e-05, + "loss": 1.8168, + "mean_token_accuracy": 0.5602982640266418, + "num_tokens": 2214600248.0, + "step": 4332 + }, + { + "epoch": 1.1717144402379664, + "grad_norm": 1.5546375513076782, + "learning_rate": 1.8101092272628877e-05, + "loss": 2.0448, + "mean_token_accuracy": 0.546628475189209, + "num_tokens": 2215077565.0, + "step": 4333 + }, + { + "epoch": 1.171984856679286, + "grad_norm": 1.3192254304885864, + "learning_rate": 1.810012360020768e-05, + "loss": 2.0262, + "mean_token_accuracy": 0.5565257668495178, + "num_tokens": 2215601722.0, + "step": 4334 + }, + { + "epoch": 1.1722552731206057, + "grad_norm": 1.6684894561767578, + "learning_rate": 1.809915470993381e-05, + "loss": 2.1014, + "mean_token_accuracy": 0.5334713459014893, + "num_tokens": 2216125962.0, + "step": 4335 + }, + { + "epoch": 1.1725256895619254, + "grad_norm": 1.2506483793258667, + "learning_rate": 1.8098185601837e-05, + "loss": 2.1292, + "mean_token_accuracy": 0.559557318687439, + "num_tokens": 2216567363.0, + "step": 4336 + }, + { + "epoch": 1.172796106003245, + "grad_norm": 1.7606297731399536, + "learning_rate": 1.8097216275946977e-05, + "loss": 2.0018, + "mean_token_accuracy": 0.5397368669509888, + "num_tokens": 2217091428.0, + "step": 4337 + }, + { + "epoch": 1.1730665224445647, + "grad_norm": 1.4037357568740845, + "learning_rate": 1.809624673229349e-05, + "loss": 1.9719, + "mean_token_accuracy": 0.5619178414344788, + "num_tokens": 2217615555.0, + "step": 4338 + }, + { + "epoch": 1.1733369388858843, + "grad_norm": 1.5932048559188843, + "learning_rate": 1.809527697090628e-05, + "loss": 1.8823, + "mean_token_accuracy": 0.5643963813781738, + "num_tokens": 2218139826.0, + "step": 4339 + }, + { + "epoch": 1.173607355327204, + "grad_norm": 1.256739616394043, + "learning_rate": 1.809430699181511e-05, + "loss": 2.0801, + "mean_token_accuracy": 0.5222314596176147, + "num_tokens": 2218663914.0, + "step": 4340 + }, + { + "epoch": 1.1738777717685236, + "grad_norm": 0.9255395531654358, + "learning_rate": 1.809333679504974e-05, + "loss": 1.1322, + "mean_token_accuracy": 0.6988159418106079, + "num_tokens": 2219188098.0, + "step": 4341 + }, + { + "epoch": 1.1741481882098432, + "grad_norm": 2.4480161666870117, + "learning_rate": 1.8092366380639934e-05, + "loss": 2.0544, + "mean_token_accuracy": 0.545051097869873, + "num_tokens": 2219712376.0, + "step": 4342 + }, + { + "epoch": 1.1744186046511629, + "grad_norm": 1.8705308437347412, + "learning_rate": 1.809139574861547e-05, + "loss": 2.0987, + "mean_token_accuracy": 0.5322508811950684, + "num_tokens": 2220236487.0, + "step": 4343 + }, + { + "epoch": 1.1746890210924823, + "grad_norm": 1.400829792022705, + "learning_rate": 1.8090424899006134e-05, + "loss": 2.0085, + "mean_token_accuracy": 0.5491646528244019, + "num_tokens": 2220760710.0, + "step": 4344 + }, + { + "epoch": 1.174959437533802, + "grad_norm": 1.8710912466049194, + "learning_rate": 1.808945383184171e-05, + "loss": 1.98, + "mean_token_accuracy": 0.5300962924957275, + "num_tokens": 2221284857.0, + "step": 4345 + }, + { + "epoch": 1.1752298539751216, + "grad_norm": 2.0607593059539795, + "learning_rate": 1.808848254715199e-05, + "loss": 2.2012, + "mean_token_accuracy": 0.5155307054519653, + "num_tokens": 2221809038.0, + "step": 4346 + }, + { + "epoch": 1.1755002704164412, + "grad_norm": 1.876447319984436, + "learning_rate": 1.8087511044966783e-05, + "loss": 2.0428, + "mean_token_accuracy": 0.5314344167709351, + "num_tokens": 2222333204.0, + "step": 4347 + }, + { + "epoch": 1.1757706868577609, + "grad_norm": 2.132302761077881, + "learning_rate": 1.808653932531589e-05, + "loss": 2.0366, + "mean_token_accuracy": 0.5383344888687134, + "num_tokens": 2222857448.0, + "step": 4348 + }, + { + "epoch": 1.1760411032990805, + "grad_norm": 4.010105133056641, + "learning_rate": 1.8085567388229133e-05, + "loss": 1.7373, + "mean_token_accuracy": 0.5876997113227844, + "num_tokens": 2223316038.0, + "step": 4349 + }, + { + "epoch": 1.1763115197404002, + "grad_norm": 1.7800298929214478, + "learning_rate": 1.8084595233736332e-05, + "loss": 1.9983, + "mean_token_accuracy": 0.5611684918403625, + "num_tokens": 2223792430.0, + "step": 4350 + }, + { + "epoch": 1.1765819361817198, + "grad_norm": 1.714137315750122, + "learning_rate": 1.8083622861867317e-05, + "loss": 2.1578, + "mean_token_accuracy": 0.5478735566139221, + "num_tokens": 2224248356.0, + "step": 4351 + }, + { + "epoch": 1.1768523526230394, + "grad_norm": 4.815013885498047, + "learning_rate": 1.808265027265192e-05, + "loss": 1.8896, + "mean_token_accuracy": 0.5852023363113403, + "num_tokens": 2224772587.0, + "step": 4352 + }, + { + "epoch": 1.177122769064359, + "grad_norm": 2.614743947982788, + "learning_rate": 1.8081677466119978e-05, + "loss": 1.9916, + "mean_token_accuracy": 0.551235020160675, + "num_tokens": 2225296766.0, + "step": 4353 + }, + { + "epoch": 1.1773931855056787, + "grad_norm": 2.26694917678833, + "learning_rate": 1.8080704442301352e-05, + "loss": 2.0955, + "mean_token_accuracy": 0.5476334095001221, + "num_tokens": 2225763853.0, + "step": 4354 + }, + { + "epoch": 1.1776636019469984, + "grad_norm": 1.4824448823928833, + "learning_rate": 1.8079731201225884e-05, + "loss": 2.0584, + "mean_token_accuracy": 0.5373570919036865, + "num_tokens": 2226287847.0, + "step": 4355 + }, + { + "epoch": 1.177934018388318, + "grad_norm": 1.846448540687561, + "learning_rate": 1.8078757742923453e-05, + "loss": 1.9566, + "mean_token_accuracy": 0.5610476732254028, + "num_tokens": 2226812044.0, + "step": 4356 + }, + { + "epoch": 1.1782044348296377, + "grad_norm": 1.8545117378234863, + "learning_rate": 1.807778406742391e-05, + "loss": 2.0439, + "mean_token_accuracy": 0.5430452227592468, + "num_tokens": 2227316871.0, + "step": 4357 + }, + { + "epoch": 1.1784748512709573, + "grad_norm": 1.7952959537506104, + "learning_rate": 1.8076810174757137e-05, + "loss": 2.1243, + "mean_token_accuracy": 0.5268722176551819, + "num_tokens": 2227831224.0, + "step": 4358 + }, + { + "epoch": 1.178745267712277, + "grad_norm": 1.8235499858856201, + "learning_rate": 1.807583606495302e-05, + "loss": 1.9295, + "mean_token_accuracy": 0.5637096166610718, + "num_tokens": 2228309842.0, + "step": 4359 + }, + { + "epoch": 1.1790156841535966, + "grad_norm": 1.3569391965866089, + "learning_rate": 1.807486173804144e-05, + "loss": 1.9427, + "mean_token_accuracy": 0.5311299562454224, + "num_tokens": 2228834119.0, + "step": 4360 + }, + { + "epoch": 1.1792861005949162, + "grad_norm": 0.620398998260498, + "learning_rate": 1.8073887194052295e-05, + "loss": 1.1733, + "mean_token_accuracy": 0.6976381540298462, + "num_tokens": 2229319921.0, + "step": 4361 + }, + { + "epoch": 1.1795565170362359, + "grad_norm": 2.4836838245391846, + "learning_rate": 1.807291243301549e-05, + "loss": 2.0412, + "mean_token_accuracy": 0.5466245412826538, + "num_tokens": 2229844189.0, + "step": 4362 + }, + { + "epoch": 1.1798269334775555, + "grad_norm": 2.0616204738616943, + "learning_rate": 1.8071937454960932e-05, + "loss": 1.7963, + "mean_token_accuracy": 0.5463240742683411, + "num_tokens": 2230368441.0, + "step": 4363 + }, + { + "epoch": 1.1800973499188752, + "grad_norm": 1.482475996017456, + "learning_rate": 1.8070962259918538e-05, + "loss": 2.1374, + "mean_token_accuracy": 0.554044783115387, + "num_tokens": 2230799723.0, + "step": 4364 + }, + { + "epoch": 1.1803677663601948, + "grad_norm": 1.5915272235870361, + "learning_rate": 1.8069986847918225e-05, + "loss": 1.9875, + "mean_token_accuracy": 0.5489529371261597, + "num_tokens": 2231300962.0, + "step": 4365 + }, + { + "epoch": 1.1806381828015144, + "grad_norm": 1.5733450651168823, + "learning_rate": 1.8069011218989926e-05, + "loss": 2.027, + "mean_token_accuracy": 0.5417206883430481, + "num_tokens": 2231825204.0, + "step": 4366 + }, + { + "epoch": 1.1809085992428339, + "grad_norm": 1.6348721981048584, + "learning_rate": 1.806803537316357e-05, + "loss": 2.1652, + "mean_token_accuracy": 0.5254355669021606, + "num_tokens": 2232349484.0, + "step": 4367 + }, + { + "epoch": 1.1811790156841535, + "grad_norm": 1.3744512796401978, + "learning_rate": 1.8067059310469105e-05, + "loss": 2.125, + "mean_token_accuracy": 0.5473122596740723, + "num_tokens": 2232862992.0, + "step": 4368 + }, + { + "epoch": 1.1814494321254732, + "grad_norm": 1.6102567911148071, + "learning_rate": 1.806608303093648e-05, + "loss": 2.2029, + "mean_token_accuracy": 0.5282360315322876, + "num_tokens": 2233329394.0, + "step": 4369 + }, + { + "epoch": 1.1817198485667928, + "grad_norm": 1.4766254425048828, + "learning_rate": 1.8065106534595643e-05, + "loss": 1.9527, + "mean_token_accuracy": 0.5517086386680603, + "num_tokens": 2233811566.0, + "step": 4370 + }, + { + "epoch": 1.1819902650081124, + "grad_norm": 1.5586740970611572, + "learning_rate": 1.8064129821476565e-05, + "loss": 2.1822, + "mean_token_accuracy": 0.5162157416343689, + "num_tokens": 2234335823.0, + "step": 4371 + }, + { + "epoch": 1.182260681449432, + "grad_norm": 1.5700424909591675, + "learning_rate": 1.8063152891609207e-05, + "loss": 2.0457, + "mean_token_accuracy": 0.5388584733009338, + "num_tokens": 2234859946.0, + "step": 4372 + }, + { + "epoch": 1.1825310978907517, + "grad_norm": 1.237548589706421, + "learning_rate": 1.8062175745023546e-05, + "loss": 2.0231, + "mean_token_accuracy": 0.5548096895217896, + "num_tokens": 2235384221.0, + "step": 4373 + }, + { + "epoch": 1.1828015143320714, + "grad_norm": 1.3296904563903809, + "learning_rate": 1.8061198381749567e-05, + "loss": 2.0296, + "mean_token_accuracy": 0.5445114374160767, + "num_tokens": 2235868589.0, + "step": 4374 + }, + { + "epoch": 1.183071930773391, + "grad_norm": 1.4351511001586914, + "learning_rate": 1.8060220801817256e-05, + "loss": 2.0821, + "mean_token_accuracy": 0.5464258193969727, + "num_tokens": 2236392857.0, + "step": 4375 + }, + { + "epoch": 1.1833423472147107, + "grad_norm": 1.7225005626678467, + "learning_rate": 1.8059243005256608e-05, + "loss": 2.0845, + "mean_token_accuracy": 0.5475225448608398, + "num_tokens": 2236884909.0, + "step": 4376 + }, + { + "epoch": 1.1836127636560303, + "grad_norm": 1.3405449390411377, + "learning_rate": 1.8058264992097624e-05, + "loss": 2.1782, + "mean_token_accuracy": 0.5151179432868958, + "num_tokens": 2237409149.0, + "step": 4377 + }, + { + "epoch": 1.18388318009735, + "grad_norm": 1.216039776802063, + "learning_rate": 1.8057286762370313e-05, + "loss": 1.945, + "mean_token_accuracy": 0.5651254653930664, + "num_tokens": 2237933435.0, + "step": 4378 + }, + { + "epoch": 1.1841535965386696, + "grad_norm": 1.4061734676361084, + "learning_rate": 1.805630831610469e-05, + "loss": 2.0053, + "mean_token_accuracy": 0.5308493375778198, + "num_tokens": 2238457607.0, + "step": 4379 + }, + { + "epoch": 1.1844240129799892, + "grad_norm": 12.024290084838867, + "learning_rate": 1.805532965333078e-05, + "loss": 1.9165, + "mean_token_accuracy": 0.5723872780799866, + "num_tokens": 2238970919.0, + "step": 4380 + }, + { + "epoch": 1.1846944294213089, + "grad_norm": 0.8013390898704529, + "learning_rate": 1.80543507740786e-05, + "loss": 1.1923, + "mean_token_accuracy": 0.6855665445327759, + "num_tokens": 2239495011.0, + "step": 4381 + }, + { + "epoch": 1.1849648458626285, + "grad_norm": 2.968489408493042, + "learning_rate": 1.80533716783782e-05, + "loss": 2.1158, + "mean_token_accuracy": 0.5350202322006226, + "num_tokens": 2240019185.0, + "step": 4382 + }, + { + "epoch": 1.1852352623039482, + "grad_norm": 2.2758541107177734, + "learning_rate": 1.805239236625961e-05, + "loss": 1.977, + "mean_token_accuracy": 0.5433273315429688, + "num_tokens": 2240543433.0, + "step": 4383 + }, + { + "epoch": 1.1855056787452678, + "grad_norm": 1.6453707218170166, + "learning_rate": 1.8051412837752886e-05, + "loss": 2.0452, + "mean_token_accuracy": 0.5531828999519348, + "num_tokens": 2241067712.0, + "step": 4384 + }, + { + "epoch": 1.1857760951865872, + "grad_norm": 1.7408344745635986, + "learning_rate": 1.805043309288808e-05, + "loss": 1.9566, + "mean_token_accuracy": 0.5569484233856201, + "num_tokens": 2241591857.0, + "step": 4385 + }, + { + "epoch": 1.1860465116279069, + "grad_norm": 1.8053845167160034, + "learning_rate": 1.8049453131695245e-05, + "loss": 1.6862, + "mean_token_accuracy": 0.641403079032898, + "num_tokens": 2242116052.0, + "step": 4386 + }, + { + "epoch": 1.1863169280692265, + "grad_norm": 1.7586112022399902, + "learning_rate": 1.8048472954204464e-05, + "loss": 2.109, + "mean_token_accuracy": 0.5538504123687744, + "num_tokens": 2242629777.0, + "step": 4387 + }, + { + "epoch": 1.1865873445105461, + "grad_norm": 1.9377684593200684, + "learning_rate": 1.80474925604458e-05, + "loss": 2.0486, + "mean_token_accuracy": 0.5442782044410706, + "num_tokens": 2243150803.0, + "step": 4388 + }, + { + "epoch": 1.1868577609518658, + "grad_norm": 1.5908808708190918, + "learning_rate": 1.8046511950449342e-05, + "loss": 2.07, + "mean_token_accuracy": 0.5606192946434021, + "num_tokens": 2243675072.0, + "step": 4389 + }, + { + "epoch": 1.1871281773931854, + "grad_norm": 1.6635773181915283, + "learning_rate": 1.8045531124245178e-05, + "loss": 2.0532, + "mean_token_accuracy": 0.533358097076416, + "num_tokens": 2244199343.0, + "step": 4390 + }, + { + "epoch": 1.187398593834505, + "grad_norm": 1.8215159177780151, + "learning_rate": 1.8044550081863393e-05, + "loss": 2.1004, + "mean_token_accuracy": 0.5482006072998047, + "num_tokens": 2244669799.0, + "step": 4391 + }, + { + "epoch": 1.1876690102758247, + "grad_norm": 1.5265538692474365, + "learning_rate": 1.8043568823334097e-05, + "loss": 2.0521, + "mean_token_accuracy": 0.5242075324058533, + "num_tokens": 2245194029.0, + "step": 4392 + }, + { + "epoch": 1.1879394267171444, + "grad_norm": 1.5147461891174316, + "learning_rate": 1.8042587348687398e-05, + "loss": 2.1216, + "mean_token_accuracy": 0.5339667797088623, + "num_tokens": 2245718278.0, + "step": 4393 + }, + { + "epoch": 1.188209843158464, + "grad_norm": 1.4254634380340576, + "learning_rate": 1.8041605657953408e-05, + "loss": 2.1898, + "mean_token_accuracy": 0.5233029127120972, + "num_tokens": 2246242441.0, + "step": 4394 + }, + { + "epoch": 1.1884802595997837, + "grad_norm": 1.4779804944992065, + "learning_rate": 1.8040623751162246e-05, + "loss": 1.9236, + "mean_token_accuracy": 0.5560178756713867, + "num_tokens": 2246683949.0, + "step": 4395 + }, + { + "epoch": 1.1887506760411033, + "grad_norm": 1.4321774244308472, + "learning_rate": 1.803964162834404e-05, + "loss": 1.9447, + "mean_token_accuracy": 0.5552802085876465, + "num_tokens": 2247208196.0, + "step": 4396 + }, + { + "epoch": 1.189021092482423, + "grad_norm": 1.3378297090530396, + "learning_rate": 1.803865928952893e-05, + "loss": 1.9586, + "mean_token_accuracy": 0.5430058240890503, + "num_tokens": 2247685550.0, + "step": 4397 + }, + { + "epoch": 1.1892915089237426, + "grad_norm": 1.3424575328826904, + "learning_rate": 1.8037676734747057e-05, + "loss": 2.0816, + "mean_token_accuracy": 0.5465754866600037, + "num_tokens": 2248209818.0, + "step": 4398 + }, + { + "epoch": 1.1895619253650622, + "grad_norm": 1.4266960620880127, + "learning_rate": 1.803669396402856e-05, + "loss": 2.0059, + "mean_token_accuracy": 0.5406649708747864, + "num_tokens": 2248698412.0, + "step": 4399 + }, + { + "epoch": 1.1898323418063819, + "grad_norm": 1.8510054349899292, + "learning_rate": 1.80357109774036e-05, + "loss": 2.0211, + "mean_token_accuracy": 0.5473635196685791, + "num_tokens": 2249216626.0, + "step": 4400 + }, + { + "epoch": 1.1901027582477015, + "grad_norm": 0.808538556098938, + "learning_rate": 1.8034727774902338e-05, + "loss": 1.1821, + "mean_token_accuracy": 0.6837708353996277, + "num_tokens": 2249740902.0, + "step": 4401 + }, + { + "epoch": 1.1903731746890212, + "grad_norm": 1.7058988809585571, + "learning_rate": 1.8033744356554938e-05, + "loss": 2.0511, + "mean_token_accuracy": 0.5335602760314941, + "num_tokens": 2250265079.0, + "step": 4402 + }, + { + "epoch": 1.1906435911303408, + "grad_norm": 1.615759015083313, + "learning_rate": 1.8032760722391577e-05, + "loss": 2.0028, + "mean_token_accuracy": 0.5457257032394409, + "num_tokens": 2250789249.0, + "step": 4403 + }, + { + "epoch": 1.1909140075716604, + "grad_norm": 1.4826664924621582, + "learning_rate": 1.8031776872442438e-05, + "loss": 2.016, + "mean_token_accuracy": 0.5445641279220581, + "num_tokens": 2251313409.0, + "step": 4404 + }, + { + "epoch": 1.19118442401298, + "grad_norm": 1.3864535093307495, + "learning_rate": 1.80307928067377e-05, + "loss": 2.0089, + "mean_token_accuracy": 0.5643439292907715, + "num_tokens": 2251830579.0, + "step": 4405 + }, + { + "epoch": 1.1914548404542997, + "grad_norm": 1.295182704925537, + "learning_rate": 1.8029808525307567e-05, + "loss": 1.996, + "mean_token_accuracy": 0.5403051376342773, + "num_tokens": 2252354815.0, + "step": 4406 + }, + { + "epoch": 1.1917252568956194, + "grad_norm": 1.3593086004257202, + "learning_rate": 1.802882402818223e-05, + "loss": 2.0121, + "mean_token_accuracy": 0.5478073358535767, + "num_tokens": 2252868241.0, + "step": 4407 + }, + { + "epoch": 1.191995673336939, + "grad_norm": 1.3653459548950195, + "learning_rate": 1.802783931539191e-05, + "loss": 1.9684, + "mean_token_accuracy": 0.5433915257453918, + "num_tokens": 2253392415.0, + "step": 4408 + }, + { + "epoch": 1.1922660897782584, + "grad_norm": 1.2766084671020508, + "learning_rate": 1.8026854386966805e-05, + "loss": 2.1135, + "mean_token_accuracy": 0.5387601852416992, + "num_tokens": 2253916692.0, + "step": 4409 + }, + { + "epoch": 1.192536506219578, + "grad_norm": 1.256203293800354, + "learning_rate": 1.8025869242937145e-05, + "loss": 2.0261, + "mean_token_accuracy": 0.5270289182662964, + "num_tokens": 2254440880.0, + "step": 4410 + }, + { + "epoch": 1.1928069226608977, + "grad_norm": 1.607342004776001, + "learning_rate": 1.8024883883333155e-05, + "loss": 2.049, + "mean_token_accuracy": 0.5291353464126587, + "num_tokens": 2254933212.0, + "step": 4411 + }, + { + "epoch": 1.1930773391022174, + "grad_norm": 1.5345077514648438, + "learning_rate": 1.8023898308185067e-05, + "loss": 2.1202, + "mean_token_accuracy": 0.5283728837966919, + "num_tokens": 2255452648.0, + "step": 4412 + }, + { + "epoch": 1.193347755543537, + "grad_norm": 1.5387741327285767, + "learning_rate": 1.8022912517523127e-05, + "loss": 1.9719, + "mean_token_accuracy": 0.5520895719528198, + "num_tokens": 2255976772.0, + "step": 4413 + }, + { + "epoch": 1.1936181719848566, + "grad_norm": 1.4117261171340942, + "learning_rate": 1.8021926511377574e-05, + "loss": 2.0053, + "mean_token_accuracy": 0.5660611391067505, + "num_tokens": 2256500910.0, + "step": 4414 + }, + { + "epoch": 1.1938885884261763, + "grad_norm": 1.284070611000061, + "learning_rate": 1.8020940289778667e-05, + "loss": 1.922, + "mean_token_accuracy": 0.5857104063034058, + "num_tokens": 2256954715.0, + "step": 4415 + }, + { + "epoch": 1.194159004867496, + "grad_norm": 1.522307276725769, + "learning_rate": 1.8019953852756666e-05, + "loss": 1.9621, + "mean_token_accuracy": 0.5493097305297852, + "num_tokens": 2257478954.0, + "step": 4416 + }, + { + "epoch": 1.1944294213088156, + "grad_norm": 1.517396330833435, + "learning_rate": 1.8018967200341833e-05, + "loss": 2.045, + "mean_token_accuracy": 0.5406228303909302, + "num_tokens": 2258003146.0, + "step": 4417 + }, + { + "epoch": 1.1946998377501352, + "grad_norm": 1.3337644338607788, + "learning_rate": 1.8017980332564445e-05, + "loss": 2.044, + "mean_token_accuracy": 0.5427051186561584, + "num_tokens": 2258527398.0, + "step": 4418 + }, + { + "epoch": 1.1949702541914549, + "grad_norm": 1.1860400438308716, + "learning_rate": 1.8016993249454786e-05, + "loss": 1.9158, + "mean_token_accuracy": 0.5390205383300781, + "num_tokens": 2259051558.0, + "step": 4419 + }, + { + "epoch": 1.1952406706327745, + "grad_norm": 1.2147574424743652, + "learning_rate": 1.8016005951043137e-05, + "loss": 2.084, + "mean_token_accuracy": 0.5370894074440002, + "num_tokens": 2259575819.0, + "step": 4420 + }, + { + "epoch": 1.1955110870740941, + "grad_norm": 0.7940687537193298, + "learning_rate": 1.801501843735979e-05, + "loss": 1.2015, + "mean_token_accuracy": 0.6962936520576477, + "num_tokens": 2260060175.0, + "step": 4421 + }, + { + "epoch": 1.1957815035154138, + "grad_norm": 1.5476279258728027, + "learning_rate": 1.801403070843505e-05, + "loss": 1.9782, + "mean_token_accuracy": 0.5515151023864746, + "num_tokens": 2260573401.0, + "step": 4422 + }, + { + "epoch": 1.1960519199567334, + "grad_norm": 1.325864553451538, + "learning_rate": 1.801304276429922e-05, + "loss": 1.9512, + "mean_token_accuracy": 0.54712975025177, + "num_tokens": 2261036757.0, + "step": 4423 + }, + { + "epoch": 1.196322336398053, + "grad_norm": 1.7748444080352783, + "learning_rate": 1.8012054604982614e-05, + "loss": 2.1451, + "mean_token_accuracy": 0.5414292216300964, + "num_tokens": 2261560909.0, + "step": 4424 + }, + { + "epoch": 1.1965927528393727, + "grad_norm": 1.571258544921875, + "learning_rate": 1.8011066230515552e-05, + "loss": 2.1123, + "mean_token_accuracy": 0.55047607421875, + "num_tokens": 2262085041.0, + "step": 4425 + }, + { + "epoch": 1.1968631692806924, + "grad_norm": 1.5340367555618286, + "learning_rate": 1.801007764092836e-05, + "loss": 2.1193, + "mean_token_accuracy": 0.5349146127700806, + "num_tokens": 2262609325.0, + "step": 4426 + }, + { + "epoch": 1.1971335857220118, + "grad_norm": 1.5911239385604858, + "learning_rate": 1.800908883625137e-05, + "loss": 2.1655, + "mean_token_accuracy": 0.5413517951965332, + "num_tokens": 2263071954.0, + "step": 4427 + }, + { + "epoch": 1.1974040021633314, + "grad_norm": 1.625586748123169, + "learning_rate": 1.8008099816514925e-05, + "loss": 1.9824, + "mean_token_accuracy": 0.5585170984268188, + "num_tokens": 2263596207.0, + "step": 4428 + }, + { + "epoch": 1.197674418604651, + "grad_norm": 1.3505293130874634, + "learning_rate": 1.8007110581749363e-05, + "loss": 2.0163, + "mean_token_accuracy": 0.5475535988807678, + "num_tokens": 2264120446.0, + "step": 4429 + }, + { + "epoch": 1.1979448350459707, + "grad_norm": 1.4964933395385742, + "learning_rate": 1.8006121131985044e-05, + "loss": 2.2002, + "mean_token_accuracy": 0.5240379571914673, + "num_tokens": 2264644596.0, + "step": 4430 + }, + { + "epoch": 1.1982152514872904, + "grad_norm": 1.447190284729004, + "learning_rate": 1.8005131467252325e-05, + "loss": 2.0165, + "mean_token_accuracy": 0.5583137273788452, + "num_tokens": 2265144219.0, + "step": 4431 + }, + { + "epoch": 1.19848566792861, + "grad_norm": 1.370344877243042, + "learning_rate": 1.8004141587581573e-05, + "loss": 1.9703, + "mean_token_accuracy": 0.5481452345848083, + "num_tokens": 2265668348.0, + "step": 4432 + }, + { + "epoch": 1.1987560843699296, + "grad_norm": 1.4188041687011719, + "learning_rate": 1.800315149300316e-05, + "loss": 2.1511, + "mean_token_accuracy": 0.5280218124389648, + "num_tokens": 2266192442.0, + "step": 4433 + }, + { + "epoch": 1.1990265008112493, + "grad_norm": 1.441394329071045, + "learning_rate": 1.800216118354746e-05, + "loss": 2.0743, + "mean_token_accuracy": 0.5449097156524658, + "num_tokens": 2266707924.0, + "step": 4434 + }, + { + "epoch": 1.199296917252569, + "grad_norm": 1.249592900276184, + "learning_rate": 1.8001170659244866e-05, + "loss": 1.8914, + "mean_token_accuracy": 0.5815514326095581, + "num_tokens": 2267203578.0, + "step": 4435 + }, + { + "epoch": 1.1995673336938886, + "grad_norm": 1.5637716054916382, + "learning_rate": 1.8000179920125765e-05, + "loss": 2.1187, + "mean_token_accuracy": 0.5186430811882019, + "num_tokens": 2267727715.0, + "step": 4436 + }, + { + "epoch": 1.1998377501352082, + "grad_norm": 1.1703002452850342, + "learning_rate": 1.7999188966220562e-05, + "loss": 1.8341, + "mean_token_accuracy": 0.5978916883468628, + "num_tokens": 2268251913.0, + "step": 4437 + }, + { + "epoch": 1.2001081665765279, + "grad_norm": 1.387972116470337, + "learning_rate": 1.7998197797559655e-05, + "loss": 2.094, + "mean_token_accuracy": 0.5460985898971558, + "num_tokens": 2268732851.0, + "step": 4438 + }, + { + "epoch": 1.2003785830178475, + "grad_norm": 1.665123701095581, + "learning_rate": 1.799720641417346e-05, + "loss": 1.6005, + "mean_token_accuracy": 0.6083775758743286, + "num_tokens": 2269257065.0, + "step": 4439 + }, + { + "epoch": 1.2006489994591671, + "grad_norm": 1.449013590812683, + "learning_rate": 1.7996214816092396e-05, + "loss": 2.0842, + "mean_token_accuracy": 0.5335260629653931, + "num_tokens": 2269781246.0, + "step": 4440 + }, + { + "epoch": 1.2009194159004868, + "grad_norm": 0.662264883518219, + "learning_rate": 1.7995223003346884e-05, + "loss": 1.1295, + "mean_token_accuracy": 0.6957947611808777, + "num_tokens": 2270292198.0, + "step": 4441 + }, + { + "epoch": 1.2011898323418064, + "grad_norm": 1.9858882427215576, + "learning_rate": 1.799423097596736e-05, + "loss": 2.061, + "mean_token_accuracy": 0.5413103699684143, + "num_tokens": 2270816297.0, + "step": 4442 + }, + { + "epoch": 1.201460248783126, + "grad_norm": 1.5844343900680542, + "learning_rate": 1.7993238733984264e-05, + "loss": 2.1043, + "mean_token_accuracy": 0.5369392037391663, + "num_tokens": 2271340526.0, + "step": 4443 + }, + { + "epoch": 1.2017306652244457, + "grad_norm": 1.3334934711456299, + "learning_rate": 1.7992246277428037e-05, + "loss": 1.9689, + "mean_token_accuracy": 0.5640654563903809, + "num_tokens": 2271864516.0, + "step": 4444 + }, + { + "epoch": 1.2020010816657654, + "grad_norm": 1.5954947471618652, + "learning_rate": 1.7991253606329128e-05, + "loss": 2.1046, + "mean_token_accuracy": 0.5440682172775269, + "num_tokens": 2272347351.0, + "step": 4445 + }, + { + "epoch": 1.202271498107085, + "grad_norm": 1.268897294998169, + "learning_rate": 1.7990260720717997e-05, + "loss": 2.0176, + "mean_token_accuracy": 0.5361977815628052, + "num_tokens": 2272871388.0, + "step": 4446 + }, + { + "epoch": 1.2025419145484046, + "grad_norm": 2.397284507751465, + "learning_rate": 1.7989267620625113e-05, + "loss": 2.1168, + "mean_token_accuracy": 0.5377447605133057, + "num_tokens": 2273395651.0, + "step": 4447 + }, + { + "epoch": 1.2028123309897243, + "grad_norm": 2.111504554748535, + "learning_rate": 1.798827430608095e-05, + "loss": 2.0614, + "mean_token_accuracy": 0.5339715480804443, + "num_tokens": 2273919813.0, + "step": 4448 + }, + { + "epoch": 1.203082747431044, + "grad_norm": 1.383702039718628, + "learning_rate": 1.798728077711597e-05, + "loss": 1.9775, + "mean_token_accuracy": 0.5585354566574097, + "num_tokens": 2274415218.0, + "step": 4449 + }, + { + "epoch": 1.2033531638723634, + "grad_norm": 1.78054940700531, + "learning_rate": 1.7986287033760672e-05, + "loss": 2.1093, + "mean_token_accuracy": 0.5233970284461975, + "num_tokens": 2274939338.0, + "step": 4450 + }, + { + "epoch": 1.203623580313683, + "grad_norm": 1.7490692138671875, + "learning_rate": 1.7985293076045544e-05, + "loss": 1.9587, + "mean_token_accuracy": 0.5653846859931946, + "num_tokens": 2275463510.0, + "step": 4451 + }, + { + "epoch": 1.2038939967550026, + "grad_norm": 1.2998467683792114, + "learning_rate": 1.7984298904001078e-05, + "loss": 2.049, + "mean_token_accuracy": 0.5458009243011475, + "num_tokens": 2275987778.0, + "step": 4452 + }, + { + "epoch": 1.2041644131963223, + "grad_norm": 1.3872824907302856, + "learning_rate": 1.7983304517657788e-05, + "loss": 2.0496, + "mean_token_accuracy": 0.5368280410766602, + "num_tokens": 2276511930.0, + "step": 4453 + }, + { + "epoch": 1.204434829637642, + "grad_norm": 1.4693487882614136, + "learning_rate": 1.7982309917046174e-05, + "loss": 1.9556, + "mean_token_accuracy": 0.5518310070037842, + "num_tokens": 2277036114.0, + "step": 4454 + }, + { + "epoch": 1.2047052460789616, + "grad_norm": 1.3171041011810303, + "learning_rate": 1.798131510219676e-05, + "loss": 2.0629, + "mean_token_accuracy": 0.5477316379547119, + "num_tokens": 2277560379.0, + "step": 4455 + }, + { + "epoch": 1.2049756625202812, + "grad_norm": 1.4406945705413818, + "learning_rate": 1.7980320073140064e-05, + "loss": 2.0245, + "mean_token_accuracy": 0.5309348106384277, + "num_tokens": 2278084546.0, + "step": 4456 + }, + { + "epoch": 1.2052460789616009, + "grad_norm": 1.6531224250793457, + "learning_rate": 1.7979324829906627e-05, + "loss": 2.0141, + "mean_token_accuracy": 0.564184308052063, + "num_tokens": 2278454365.0, + "step": 4457 + }, + { + "epoch": 1.2055164954029205, + "grad_norm": 1.4013170003890991, + "learning_rate": 1.7978329372526974e-05, + "loss": 1.9131, + "mean_token_accuracy": 0.5538101196289062, + "num_tokens": 2278953122.0, + "step": 4458 + }, + { + "epoch": 1.2057869118442401, + "grad_norm": 1.7419564723968506, + "learning_rate": 1.7977333701031655e-05, + "loss": 2.1003, + "mean_token_accuracy": 0.5473893284797668, + "num_tokens": 2279477359.0, + "step": 4459 + }, + { + "epoch": 1.2060573282855598, + "grad_norm": 1.515745759010315, + "learning_rate": 1.7976337815451223e-05, + "loss": 2.0725, + "mean_token_accuracy": 0.5340266823768616, + "num_tokens": 2280001548.0, + "step": 4460 + }, + { + "epoch": 1.2063277447268794, + "grad_norm": 0.6700544953346252, + "learning_rate": 1.7975341715816227e-05, + "loss": 1.0374, + "mean_token_accuracy": 0.7250374555587769, + "num_tokens": 2280502044.0, + "step": 4461 + }, + { + "epoch": 1.206598161168199, + "grad_norm": 2.3962857723236084, + "learning_rate": 1.7974345402157235e-05, + "loss": 2.0853, + "mean_token_accuracy": 0.5448383688926697, + "num_tokens": 2281022806.0, + "step": 4462 + }, + { + "epoch": 1.2068685776095187, + "grad_norm": 1.8520675897598267, + "learning_rate": 1.797334887450482e-05, + "loss": 2.052, + "mean_token_accuracy": 0.539294958114624, + "num_tokens": 2281547085.0, + "step": 4463 + }, + { + "epoch": 1.2071389940508384, + "grad_norm": 1.391769289970398, + "learning_rate": 1.7972352132889545e-05, + "loss": 1.9758, + "mean_token_accuracy": 0.5562615990638733, + "num_tokens": 2282071181.0, + "step": 4464 + }, + { + "epoch": 1.207409410492158, + "grad_norm": 1.7980743646621704, + "learning_rate": 1.7971355177342006e-05, + "loss": 2.0774, + "mean_token_accuracy": 0.5554274320602417, + "num_tokens": 2282560317.0, + "step": 4465 + }, + { + "epoch": 1.2076798269334776, + "grad_norm": 1.600618600845337, + "learning_rate": 1.797035800789279e-05, + "loss": 2.0124, + "mean_token_accuracy": 0.550518810749054, + "num_tokens": 2283084559.0, + "step": 4466 + }, + { + "epoch": 1.2079502433747973, + "grad_norm": 1.6172451972961426, + "learning_rate": 1.7969360624572495e-05, + "loss": 2.0396, + "mean_token_accuracy": 0.526764452457428, + "num_tokens": 2283608814.0, + "step": 4467 + }, + { + "epoch": 1.2082206598161167, + "grad_norm": 1.4876821041107178, + "learning_rate": 1.796836302741172e-05, + "loss": 2.0798, + "mean_token_accuracy": 0.5368250608444214, + "num_tokens": 2284133012.0, + "step": 4468 + }, + { + "epoch": 1.2084910762574363, + "grad_norm": 1.4844838380813599, + "learning_rate": 1.796736521644107e-05, + "loss": 2.1357, + "mean_token_accuracy": 0.520038366317749, + "num_tokens": 2284657187.0, + "step": 4469 + }, + { + "epoch": 1.208761492698756, + "grad_norm": 1.41850745677948, + "learning_rate": 1.7966367191691167e-05, + "loss": 1.7585, + "mean_token_accuracy": 0.5864874124526978, + "num_tokens": 2285181372.0, + "step": 4470 + }, + { + "epoch": 1.2090319091400756, + "grad_norm": 1.4218175411224365, + "learning_rate": 1.7965368953192632e-05, + "loss": 2.0382, + "mean_token_accuracy": 0.5432145595550537, + "num_tokens": 2285654205.0, + "step": 4471 + }, + { + "epoch": 1.2093023255813953, + "grad_norm": 1.495818853378296, + "learning_rate": 1.7964370500976093e-05, + "loss": 2.0612, + "mean_token_accuracy": 0.5306662321090698, + "num_tokens": 2286178474.0, + "step": 4472 + }, + { + "epoch": 1.209572742022715, + "grad_norm": 1.6984974145889282, + "learning_rate": 1.7963371835072187e-05, + "loss": 2.0779, + "mean_token_accuracy": 0.5110933780670166, + "num_tokens": 2286699067.0, + "step": 4473 + }, + { + "epoch": 1.2098431584640346, + "grad_norm": 1.5097566843032837, + "learning_rate": 1.7962372955511555e-05, + "loss": 2.0896, + "mean_token_accuracy": 0.5506260991096497, + "num_tokens": 2287223340.0, + "step": 4474 + }, + { + "epoch": 1.2101135749053542, + "grad_norm": 1.4211204051971436, + "learning_rate": 1.7961373862324846e-05, + "loss": 2.0855, + "mean_token_accuracy": 0.5278798341751099, + "num_tokens": 2287747511.0, + "step": 4475 + }, + { + "epoch": 1.2103839913466738, + "grad_norm": 1.502315640449524, + "learning_rate": 1.7960374555542717e-05, + "loss": 2.0652, + "mean_token_accuracy": 0.5260583162307739, + "num_tokens": 2288263977.0, + "step": 4476 + }, + { + "epoch": 1.2106544077879935, + "grad_norm": 1.1320141553878784, + "learning_rate": 1.7959375035195823e-05, + "loss": 2.0592, + "mean_token_accuracy": 0.533979594707489, + "num_tokens": 2288788115.0, + "step": 4477 + }, + { + "epoch": 1.2109248242293131, + "grad_norm": 1.6497900485992432, + "learning_rate": 1.795837530131484e-05, + "loss": 1.9566, + "mean_token_accuracy": 0.5567600131034851, + "num_tokens": 2289268837.0, + "step": 4478 + }, + { + "epoch": 1.2111952406706328, + "grad_norm": 1.473711609840393, + "learning_rate": 1.795737535393044e-05, + "loss": 1.9816, + "mean_token_accuracy": 0.5597009062767029, + "num_tokens": 2289741188.0, + "step": 4479 + }, + { + "epoch": 1.2114656571119524, + "grad_norm": 1.1479716300964355, + "learning_rate": 1.79563751930733e-05, + "loss": 2.0274, + "mean_token_accuracy": 0.5509544610977173, + "num_tokens": 2290256582.0, + "step": 4480 + }, + { + "epoch": 1.211736073553272, + "grad_norm": 0.655073881149292, + "learning_rate": 1.7955374818774115e-05, + "loss": 1.1555, + "mean_token_accuracy": 0.6897163987159729, + "num_tokens": 2290780732.0, + "step": 4481 + }, + { + "epoch": 1.2120064899945917, + "grad_norm": 2.882650852203369, + "learning_rate": 1.7954374231063576e-05, + "loss": 2.1448, + "mean_token_accuracy": 0.5438211560249329, + "num_tokens": 2291240188.0, + "step": 4482 + }, + { + "epoch": 1.2122769064359114, + "grad_norm": 2.414651393890381, + "learning_rate": 1.7953373429972382e-05, + "loss": 2.113, + "mean_token_accuracy": 0.5447503328323364, + "num_tokens": 2291764469.0, + "step": 4483 + }, + { + "epoch": 1.212547322877231, + "grad_norm": 1.5046720504760742, + "learning_rate": 1.7952372415531245e-05, + "loss": 2.1202, + "mean_token_accuracy": 0.5358405113220215, + "num_tokens": 2292288735.0, + "step": 4484 + }, + { + "epoch": 1.2128177393185506, + "grad_norm": 1.5938977003097534, + "learning_rate": 1.7951371187770877e-05, + "loss": 2.0305, + "mean_token_accuracy": 0.5558274388313293, + "num_tokens": 2292777915.0, + "step": 4485 + }, + { + "epoch": 1.2130881557598703, + "grad_norm": 1.7340915203094482, + "learning_rate": 1.7950369746721995e-05, + "loss": 2.1076, + "mean_token_accuracy": 0.5309638977050781, + "num_tokens": 2293302168.0, + "step": 4486 + }, + { + "epoch": 1.21335857220119, + "grad_norm": 1.3481014966964722, + "learning_rate": 1.794936809241533e-05, + "loss": 1.9203, + "mean_token_accuracy": 0.5339747667312622, + "num_tokens": 2293822803.0, + "step": 4487 + }, + { + "epoch": 1.2136289886425096, + "grad_norm": 1.6698763370513916, + "learning_rate": 1.794836622488162e-05, + "loss": 2.1437, + "mean_token_accuracy": 0.525131106376648, + "num_tokens": 2294304332.0, + "step": 4488 + }, + { + "epoch": 1.2138994050838292, + "grad_norm": 1.7608706951141357, + "learning_rate": 1.7947364144151596e-05, + "loss": 2.0384, + "mean_token_accuracy": 0.5369305610656738, + "num_tokens": 2294828473.0, + "step": 4489 + }, + { + "epoch": 1.2141698215251489, + "grad_norm": 1.4700367450714111, + "learning_rate": 1.7946361850256007e-05, + "loss": 2.1249, + "mean_token_accuracy": 0.5277018547058105, + "num_tokens": 2295352750.0, + "step": 4490 + }, + { + "epoch": 1.2144402379664683, + "grad_norm": 1.4569827318191528, + "learning_rate": 1.7945359343225615e-05, + "loss": 2.0079, + "mean_token_accuracy": 0.5411711931228638, + "num_tokens": 2295876954.0, + "step": 4491 + }, + { + "epoch": 1.214710654407788, + "grad_norm": 1.7671548128128052, + "learning_rate": 1.794435662309117e-05, + "loss": 2.0469, + "mean_token_accuracy": 0.540118932723999, + "num_tokens": 2296401237.0, + "step": 4492 + }, + { + "epoch": 1.2149810708491076, + "grad_norm": 1.4135024547576904, + "learning_rate": 1.7943353689883443e-05, + "loss": 2.0374, + "mean_token_accuracy": 0.5503041744232178, + "num_tokens": 2296911544.0, + "step": 4493 + }, + { + "epoch": 1.2152514872904272, + "grad_norm": 1.5928162336349487, + "learning_rate": 1.7942350543633206e-05, + "loss": 1.8719, + "mean_token_accuracy": 0.5636081695556641, + "num_tokens": 2297435633.0, + "step": 4494 + }, + { + "epoch": 1.2155219037317468, + "grad_norm": 1.9390757083892822, + "learning_rate": 1.7941347184371235e-05, + "loss": 1.9912, + "mean_token_accuracy": 0.5538829565048218, + "num_tokens": 2297959785.0, + "step": 4495 + }, + { + "epoch": 1.2157923201730665, + "grad_norm": 1.408871054649353, + "learning_rate": 1.7940343612128318e-05, + "loss": 1.9954, + "mean_token_accuracy": 0.5481846332550049, + "num_tokens": 2298484052.0, + "step": 4496 + }, + { + "epoch": 1.2160627366143861, + "grad_norm": 1.9888383150100708, + "learning_rate": 1.7939339826935254e-05, + "loss": 1.8197, + "mean_token_accuracy": 0.5668677687644958, + "num_tokens": 2298967190.0, + "step": 4497 + }, + { + "epoch": 1.2163331530557058, + "grad_norm": 1.788109302520752, + "learning_rate": 1.7938335828822836e-05, + "loss": 2.0357, + "mean_token_accuracy": 0.546939492225647, + "num_tokens": 2299453635.0, + "step": 4498 + }, + { + "epoch": 1.2166035694970254, + "grad_norm": 1.4768671989440918, + "learning_rate": 1.7937331617821868e-05, + "loss": 1.9717, + "mean_token_accuracy": 0.5480169057846069, + "num_tokens": 2299929863.0, + "step": 4499 + }, + { + "epoch": 1.216873985938345, + "grad_norm": 1.6333439350128174, + "learning_rate": 1.7936327193963166e-05, + "loss": 1.9129, + "mean_token_accuracy": 0.5855046510696411, + "num_tokens": 2300454045.0, + "step": 4500 + }, + { + "epoch": 1.2171444023796647, + "grad_norm": 0.7590129971504211, + "learning_rate": 1.7935322557277546e-05, + "loss": 1.0812, + "mean_token_accuracy": 0.7390825748443604, + "num_tokens": 2300978327.0, + "step": 4501 + }, + { + "epoch": 1.2174148188209843, + "grad_norm": 2.48451828956604, + "learning_rate": 1.7934317707795837e-05, + "loss": 2.0266, + "mean_token_accuracy": 0.5490289330482483, + "num_tokens": 2301450656.0, + "step": 4502 + }, + { + "epoch": 1.217685235262304, + "grad_norm": 1.9987927675247192, + "learning_rate": 1.7933312645548867e-05, + "loss": 2.1153, + "mean_token_accuracy": 0.5415472984313965, + "num_tokens": 2301949119.0, + "step": 4503 + }, + { + "epoch": 1.2179556517036236, + "grad_norm": 1.5945605039596558, + "learning_rate": 1.7932307370567472e-05, + "loss": 2.0086, + "mean_token_accuracy": 0.5548027753829956, + "num_tokens": 2302473299.0, + "step": 4504 + }, + { + "epoch": 1.2182260681449433, + "grad_norm": 1.54562246799469, + "learning_rate": 1.7931301882882507e-05, + "loss": 1.8387, + "mean_token_accuracy": 0.5706015825271606, + "num_tokens": 2302997555.0, + "step": 4505 + }, + { + "epoch": 1.218496484586263, + "grad_norm": 1.6491997241973877, + "learning_rate": 1.793029618252481e-05, + "loss": 2.0456, + "mean_token_accuracy": 0.5604151487350464, + "num_tokens": 2303501909.0, + "step": 4506 + }, + { + "epoch": 1.2187669010275826, + "grad_norm": 1.7387981414794922, + "learning_rate": 1.792929026952525e-05, + "loss": 1.935, + "mean_token_accuracy": 0.5743665099143982, + "num_tokens": 2304018217.0, + "step": 4507 + }, + { + "epoch": 1.2190373174689022, + "grad_norm": 1.8012712001800537, + "learning_rate": 1.7928284143914684e-05, + "loss": 2.027, + "mean_token_accuracy": 0.5515141487121582, + "num_tokens": 2304514193.0, + "step": 4508 + }, + { + "epoch": 1.2193077339102216, + "grad_norm": 1.3084993362426758, + "learning_rate": 1.7927277805723988e-05, + "loss": 2.0525, + "mean_token_accuracy": 0.5413978099822998, + "num_tokens": 2305038398.0, + "step": 4509 + }, + { + "epoch": 1.2195781503515413, + "grad_norm": 2.1182501316070557, + "learning_rate": 1.7926271254984038e-05, + "loss": 2.0043, + "mean_token_accuracy": 0.5909541845321655, + "num_tokens": 2305498178.0, + "step": 4510 + }, + { + "epoch": 1.219848566792861, + "grad_norm": 1.5176702737808228, + "learning_rate": 1.7925264491725713e-05, + "loss": 2.018, + "mean_token_accuracy": 0.5407449007034302, + "num_tokens": 2306022374.0, + "step": 4511 + }, + { + "epoch": 1.2201189832341806, + "grad_norm": 1.658600091934204, + "learning_rate": 1.792425751597991e-05, + "loss": 2.2012, + "mean_token_accuracy": 0.5351980924606323, + "num_tokens": 2306546535.0, + "step": 4512 + }, + { + "epoch": 1.2203893996755002, + "grad_norm": 1.777506709098816, + "learning_rate": 1.7923250327777524e-05, + "loss": 2.0829, + "mean_token_accuracy": 0.5410333871841431, + "num_tokens": 2307070758.0, + "step": 4513 + }, + { + "epoch": 1.2206598161168198, + "grad_norm": 1.3653684854507446, + "learning_rate": 1.7922242927149458e-05, + "loss": 2.0885, + "mean_token_accuracy": 0.5379561185836792, + "num_tokens": 2307594987.0, + "step": 4514 + }, + { + "epoch": 1.2209302325581395, + "grad_norm": 1.4480843544006348, + "learning_rate": 1.792123531412662e-05, + "loss": 2.0222, + "mean_token_accuracy": 0.5529516935348511, + "num_tokens": 2308119237.0, + "step": 4515 + }, + { + "epoch": 1.2212006489994591, + "grad_norm": 1.6709996461868286, + "learning_rate": 1.792022748873993e-05, + "loss": 2.0003, + "mean_token_accuracy": 0.560360312461853, + "num_tokens": 2308619995.0, + "step": 4516 + }, + { + "epoch": 1.2214710654407788, + "grad_norm": 1.5565818548202515, + "learning_rate": 1.791921945102031e-05, + "loss": 2.1146, + "mean_token_accuracy": 0.5408157110214233, + "num_tokens": 2309144162.0, + "step": 4517 + }, + { + "epoch": 1.2217414818820984, + "grad_norm": 1.3274502754211426, + "learning_rate": 1.791821120099869e-05, + "loss": 2.1532, + "mean_token_accuracy": 0.5230245590209961, + "num_tokens": 2309668439.0, + "step": 4518 + }, + { + "epoch": 1.222011898323418, + "grad_norm": 1.3484209775924683, + "learning_rate": 1.7917202738706003e-05, + "loss": 1.8997, + "mean_token_accuracy": 0.5626380443572998, + "num_tokens": 2310130253.0, + "step": 4519 + }, + { + "epoch": 1.2222823147647377, + "grad_norm": 1.786312460899353, + "learning_rate": 1.7916194064173195e-05, + "loss": 1.9875, + "mean_token_accuracy": 0.5764312148094177, + "num_tokens": 2310653588.0, + "step": 4520 + }, + { + "epoch": 1.2225527312060573, + "grad_norm": 0.6161965727806091, + "learning_rate": 1.791518517743121e-05, + "loss": 1.2744, + "mean_token_accuracy": 0.6502509713172913, + "num_tokens": 2311122296.0, + "step": 4521 + }, + { + "epoch": 1.222823147647377, + "grad_norm": 2.361940860748291, + "learning_rate": 1.7914176078511013e-05, + "loss": 2.0476, + "mean_token_accuracy": 0.5472817420959473, + "num_tokens": 2311646442.0, + "step": 4522 + }, + { + "epoch": 1.2230935640886966, + "grad_norm": 2.035956859588623, + "learning_rate": 1.791316676744356e-05, + "loss": 2.0591, + "mean_token_accuracy": 0.52335125207901, + "num_tokens": 2312170665.0, + "step": 4523 + }, + { + "epoch": 1.2233639805300163, + "grad_norm": 1.5859359502792358, + "learning_rate": 1.7912157244259816e-05, + "loss": 1.7427, + "mean_token_accuracy": 0.6068036556243896, + "num_tokens": 2312604325.0, + "step": 4524 + }, + { + "epoch": 1.223634396971336, + "grad_norm": 1.6578425168991089, + "learning_rate": 1.7911147508990765e-05, + "loss": 1.8442, + "mean_token_accuracy": 0.5661556124687195, + "num_tokens": 2313128487.0, + "step": 4525 + }, + { + "epoch": 1.2239048134126556, + "grad_norm": 2.0636584758758545, + "learning_rate": 1.7910137561667386e-05, + "loss": 2.0552, + "mean_token_accuracy": 0.5341989398002625, + "num_tokens": 2313652754.0, + "step": 4526 + }, + { + "epoch": 1.2241752298539752, + "grad_norm": 1.60152006149292, + "learning_rate": 1.7909127402320655e-05, + "loss": 2.194, + "mean_token_accuracy": 0.5326137542724609, + "num_tokens": 2314109690.0, + "step": 4527 + }, + { + "epoch": 1.2244456462952948, + "grad_norm": 1.8247039318084717, + "learning_rate": 1.7908117030981587e-05, + "loss": 2.0777, + "mean_token_accuracy": 0.5468952655792236, + "num_tokens": 2314633874.0, + "step": 4528 + }, + { + "epoch": 1.2247160627366145, + "grad_norm": 2.159518003463745, + "learning_rate": 1.7907106447681168e-05, + "loss": 1.9384, + "mean_token_accuracy": 0.579131007194519, + "num_tokens": 2315158142.0, + "step": 4529 + }, + { + "epoch": 1.2249864791779341, + "grad_norm": 1.3525491952896118, + "learning_rate": 1.7906095652450412e-05, + "loss": 1.9303, + "mean_token_accuracy": 0.5624625086784363, + "num_tokens": 2315605683.0, + "step": 4530 + }, + { + "epoch": 1.2252568956192538, + "grad_norm": 1.4423810243606567, + "learning_rate": 1.7905084645320333e-05, + "loss": 2.1064, + "mean_token_accuracy": 0.5420504808425903, + "num_tokens": 2316129854.0, + "step": 4531 + }, + { + "epoch": 1.2255273120605732, + "grad_norm": 1.4753546714782715, + "learning_rate": 1.790407342632195e-05, + "loss": 2.0348, + "mean_token_accuracy": 0.5332413911819458, + "num_tokens": 2316654127.0, + "step": 4532 + }, + { + "epoch": 1.2257977285018928, + "grad_norm": 1.6070754528045654, + "learning_rate": 1.7903061995486288e-05, + "loss": 2.0544, + "mean_token_accuracy": 0.5413755178451538, + "num_tokens": 2317178359.0, + "step": 4533 + }, + { + "epoch": 1.2260681449432125, + "grad_norm": 1.3067777156829834, + "learning_rate": 1.790205035284439e-05, + "loss": 2.0537, + "mean_token_accuracy": 0.5301012992858887, + "num_tokens": 2317702509.0, + "step": 4534 + }, + { + "epoch": 1.2263385613845321, + "grad_norm": 1.5106480121612549, + "learning_rate": 1.7901038498427282e-05, + "loss": 2.2062, + "mean_token_accuracy": 0.5297162532806396, + "num_tokens": 2318226604.0, + "step": 4535 + }, + { + "epoch": 1.2266089778258518, + "grad_norm": 1.2924103736877441, + "learning_rate": 1.790002643226602e-05, + "loss": 1.9459, + "mean_token_accuracy": 0.5635495185852051, + "num_tokens": 2318714604.0, + "step": 4536 + }, + { + "epoch": 1.2268793942671714, + "grad_norm": 1.3617147207260132, + "learning_rate": 1.7899014154391657e-05, + "loss": 2.0107, + "mean_token_accuracy": 0.5500269532203674, + "num_tokens": 2319238874.0, + "step": 4537 + }, + { + "epoch": 1.227149810708491, + "grad_norm": 1.2352458238601685, + "learning_rate": 1.7898001664835255e-05, + "loss": 2.0947, + "mean_token_accuracy": 0.5346113443374634, + "num_tokens": 2319763144.0, + "step": 4538 + }, + { + "epoch": 1.2274202271498107, + "grad_norm": 2.0105950832366943, + "learning_rate": 1.789698896362787e-05, + "loss": 1.9422, + "mean_token_accuracy": 0.579644501209259, + "num_tokens": 2320250641.0, + "step": 4539 + }, + { + "epoch": 1.2276906435911303, + "grad_norm": 1.744921088218689, + "learning_rate": 1.7895976050800585e-05, + "loss": 2.1754, + "mean_token_accuracy": 0.5348449349403381, + "num_tokens": 2320774832.0, + "step": 4540 + }, + { + "epoch": 1.22796106003245, + "grad_norm": 0.6344113945960999, + "learning_rate": 1.789496292638447e-05, + "loss": 1.1815, + "mean_token_accuracy": 0.6934196352958679, + "num_tokens": 2321219890.0, + "step": 4541 + }, + { + "epoch": 1.2282314764737696, + "grad_norm": 2.8577566146850586, + "learning_rate": 1.789394959041062e-05, + "loss": 2.1397, + "mean_token_accuracy": 0.5176577568054199, + "num_tokens": 2321732781.0, + "step": 4542 + }, + { + "epoch": 1.2285018929150893, + "grad_norm": 2.092970848083496, + "learning_rate": 1.7892936042910125e-05, + "loss": 1.9985, + "mean_token_accuracy": 0.5429736375808716, + "num_tokens": 2322257020.0, + "step": 4543 + }, + { + "epoch": 1.228772309356409, + "grad_norm": 1.571652889251709, + "learning_rate": 1.7891922283914078e-05, + "loss": 2.1369, + "mean_token_accuracy": 0.5401352643966675, + "num_tokens": 2322753259.0, + "step": 4544 + }, + { + "epoch": 1.2290427257977286, + "grad_norm": 1.4966175556182861, + "learning_rate": 1.7890908313453585e-05, + "loss": 1.9995, + "mean_token_accuracy": 0.5425128936767578, + "num_tokens": 2323277518.0, + "step": 4545 + }, + { + "epoch": 1.2293131422390482, + "grad_norm": 1.430582880973816, + "learning_rate": 1.788989413155976e-05, + "loss": 2.063, + "mean_token_accuracy": 0.5465054512023926, + "num_tokens": 2323801803.0, + "step": 4546 + }, + { + "epoch": 1.2295835586803678, + "grad_norm": 1.4416812658309937, + "learning_rate": 1.7888879738263725e-05, + "loss": 1.9888, + "mean_token_accuracy": 0.5601745843887329, + "num_tokens": 2324299337.0, + "step": 4547 + }, + { + "epoch": 1.2298539751216875, + "grad_norm": 1.3469966650009155, + "learning_rate": 1.78878651335966e-05, + "loss": 2.0929, + "mean_token_accuracy": 0.5383739471435547, + "num_tokens": 2324823517.0, + "step": 4548 + }, + { + "epoch": 1.2301243915630071, + "grad_norm": 1.3331941366195679, + "learning_rate": 1.7886850317589518e-05, + "loss": 1.9558, + "mean_token_accuracy": 0.5509913563728333, + "num_tokens": 2325347734.0, + "step": 4549 + }, + { + "epoch": 1.2303948080043265, + "grad_norm": 1.2433525323867798, + "learning_rate": 1.7885835290273613e-05, + "loss": 2.0258, + "mean_token_accuracy": 0.5439850091934204, + "num_tokens": 2325863461.0, + "step": 4550 + }, + { + "epoch": 1.2306652244456462, + "grad_norm": 1.1760228872299194, + "learning_rate": 1.788482005168003e-05, + "loss": 2.15, + "mean_token_accuracy": 0.5170947909355164, + "num_tokens": 2326387581.0, + "step": 4551 + }, + { + "epoch": 1.2309356408869658, + "grad_norm": 1.2686775922775269, + "learning_rate": 1.7883804601839925e-05, + "loss": 1.9887, + "mean_token_accuracy": 0.5398324728012085, + "num_tokens": 2326911822.0, + "step": 4552 + }, + { + "epoch": 1.2312060573282855, + "grad_norm": 1.2620699405670166, + "learning_rate": 1.788278894078445e-05, + "loss": 2.0173, + "mean_token_accuracy": 0.5445908904075623, + "num_tokens": 2327435992.0, + "step": 4553 + }, + { + "epoch": 1.2314764737696051, + "grad_norm": 1.5033057928085327, + "learning_rate": 1.7881773068544767e-05, + "loss": 2.1414, + "mean_token_accuracy": 0.5309287309646606, + "num_tokens": 2327960225.0, + "step": 4554 + }, + { + "epoch": 1.2317468902109248, + "grad_norm": 1.229414701461792, + "learning_rate": 1.788075698515205e-05, + "loss": 2.0339, + "mean_token_accuracy": 0.5556068420410156, + "num_tokens": 2328484321.0, + "step": 4555 + }, + { + "epoch": 1.2320173066522444, + "grad_norm": 1.7730929851531982, + "learning_rate": 1.7879740690637472e-05, + "loss": 2.0865, + "mean_token_accuracy": 0.5429294109344482, + "num_tokens": 2329008551.0, + "step": 4556 + }, + { + "epoch": 1.232287723093564, + "grad_norm": 1.973488688468933, + "learning_rate": 1.787872418503222e-05, + "loss": 2.0656, + "mean_token_accuracy": 0.5306638479232788, + "num_tokens": 2329532608.0, + "step": 4557 + }, + { + "epoch": 1.2325581395348837, + "grad_norm": 1.3541512489318848, + "learning_rate": 1.787770746836748e-05, + "loss": 2.0884, + "mean_token_accuracy": 0.5249302387237549, + "num_tokens": 2330056682.0, + "step": 4558 + }, + { + "epoch": 1.2328285559762033, + "grad_norm": 1.7191452980041504, + "learning_rate": 1.7876690540674446e-05, + "loss": 2.1834, + "mean_token_accuracy": 0.5272402763366699, + "num_tokens": 2330548448.0, + "step": 4559 + }, + { + "epoch": 1.233098972417523, + "grad_norm": 1.3693228960037231, + "learning_rate": 1.787567340198433e-05, + "loss": 2.089, + "mean_token_accuracy": 0.5257976651191711, + "num_tokens": 2331072615.0, + "step": 4560 + }, + { + "epoch": 1.2333693888588426, + "grad_norm": 0.6205570101737976, + "learning_rate": 1.7874656052328326e-05, + "loss": 1.1868, + "mean_token_accuracy": 0.6948421001434326, + "num_tokens": 2331596827.0, + "step": 4561 + }, + { + "epoch": 1.2336398053001623, + "grad_norm": 1.9229259490966797, + "learning_rate": 1.787363849173766e-05, + "loss": 2.0765, + "mean_token_accuracy": 0.5446432828903198, + "num_tokens": 2332110135.0, + "step": 4562 + }, + { + "epoch": 1.233910221741482, + "grad_norm": 1.541684865951538, + "learning_rate": 1.7872620720243554e-05, + "loss": 2.0484, + "mean_token_accuracy": 0.5499681830406189, + "num_tokens": 2332634408.0, + "step": 4563 + }, + { + "epoch": 1.2341806381828015, + "grad_norm": 1.3285713195800781, + "learning_rate": 1.7871602737877233e-05, + "loss": 2.0943, + "mean_token_accuracy": 0.5550872087478638, + "num_tokens": 2333158641.0, + "step": 4564 + }, + { + "epoch": 1.2344510546241212, + "grad_norm": 1.6478952169418335, + "learning_rate": 1.7870584544669928e-05, + "loss": 1.9216, + "mean_token_accuracy": 0.539028525352478, + "num_tokens": 2333682902.0, + "step": 4565 + }, + { + "epoch": 1.2347214710654408, + "grad_norm": 1.8332364559173584, + "learning_rate": 1.7869566140652887e-05, + "loss": 2.0839, + "mean_token_accuracy": 0.5426782369613647, + "num_tokens": 2334207168.0, + "step": 4566 + }, + { + "epoch": 1.2349918875067605, + "grad_norm": 2.2541651725769043, + "learning_rate": 1.7868547525857354e-05, + "loss": 2.0655, + "mean_token_accuracy": 0.5447615385055542, + "num_tokens": 2334731252.0, + "step": 4567 + }, + { + "epoch": 1.2352623039480801, + "grad_norm": 1.6694180965423584, + "learning_rate": 1.7867528700314584e-05, + "loss": 2.0375, + "mean_token_accuracy": 0.549350380897522, + "num_tokens": 2335220256.0, + "step": 4568 + }, + { + "epoch": 1.2355327203893998, + "grad_norm": 1.6311067342758179, + "learning_rate": 1.7866509664055837e-05, + "loss": 2.0365, + "mean_token_accuracy": 0.5369886755943298, + "num_tokens": 2335744532.0, + "step": 4569 + }, + { + "epoch": 1.2358031368307194, + "grad_norm": 1.3791803121566772, + "learning_rate": 1.786549041711238e-05, + "loss": 2.0874, + "mean_token_accuracy": 0.556549072265625, + "num_tokens": 2336209282.0, + "step": 4570 + }, + { + "epoch": 1.236073553272039, + "grad_norm": 1.4830524921417236, + "learning_rate": 1.7864470959515486e-05, + "loss": 1.9941, + "mean_token_accuracy": 0.5571107864379883, + "num_tokens": 2336733412.0, + "step": 4571 + }, + { + "epoch": 1.2363439697133587, + "grad_norm": 1.6453036069869995, + "learning_rate": 1.786345129129644e-05, + "loss": 2.0528, + "mean_token_accuracy": 0.5309087038040161, + "num_tokens": 2337210665.0, + "step": 4572 + }, + { + "epoch": 1.2366143861546781, + "grad_norm": 1.7313141822814941, + "learning_rate": 1.7862431412486524e-05, + "loss": 2.1335, + "mean_token_accuracy": 0.5425615310668945, + "num_tokens": 2337734821.0, + "step": 4573 + }, + { + "epoch": 1.2368848025959978, + "grad_norm": 1.151907205581665, + "learning_rate": 1.7861411323117026e-05, + "loss": 2.0125, + "mean_token_accuracy": 0.5504876375198364, + "num_tokens": 2338258989.0, + "step": 4574 + }, + { + "epoch": 1.2371552190373174, + "grad_norm": 1.4585586786270142, + "learning_rate": 1.7860391023219255e-05, + "loss": 2.0158, + "mean_token_accuracy": 0.5591486692428589, + "num_tokens": 2338783263.0, + "step": 4575 + }, + { + "epoch": 1.237425635478637, + "grad_norm": 1.4148586988449097, + "learning_rate": 1.785937051282451e-05, + "loss": 2.0242, + "mean_token_accuracy": 0.537314236164093, + "num_tokens": 2339307402.0, + "step": 4576 + }, + { + "epoch": 1.2376960519199567, + "grad_norm": 1.3768951892852783, + "learning_rate": 1.7858349791964105e-05, + "loss": 1.9391, + "mean_token_accuracy": 0.5411442518234253, + "num_tokens": 2339831610.0, + "step": 4577 + }, + { + "epoch": 1.2379664683612763, + "grad_norm": 1.7336958646774292, + "learning_rate": 1.785732886066936e-05, + "loss": 2.0422, + "mean_token_accuracy": 0.5474891662597656, + "num_tokens": 2340355779.0, + "step": 4578 + }, + { + "epoch": 1.238236884802596, + "grad_norm": 1.2867190837860107, + "learning_rate": 1.78563077189716e-05, + "loss": 1.9921, + "mean_token_accuracy": 0.5453050136566162, + "num_tokens": 2340880045.0, + "step": 4579 + }, + { + "epoch": 1.2385073012439156, + "grad_norm": 1.631533145904541, + "learning_rate": 1.7855286366902157e-05, + "loss": 2.1074, + "mean_token_accuracy": 0.5212441682815552, + "num_tokens": 2341404176.0, + "step": 4580 + }, + { + "epoch": 1.2387777176852353, + "grad_norm": 0.7021290063858032, + "learning_rate": 1.785426480449237e-05, + "loss": 1.2027, + "mean_token_accuracy": 0.671299934387207, + "num_tokens": 2341928278.0, + "step": 4581 + }, + { + "epoch": 1.239048134126555, + "grad_norm": 2.194876194000244, + "learning_rate": 1.7853243031773576e-05, + "loss": 2.0866, + "mean_token_accuracy": 0.5385299324989319, + "num_tokens": 2342452347.0, + "step": 4582 + }, + { + "epoch": 1.2393185505678745, + "grad_norm": 1.5696539878845215, + "learning_rate": 1.7852221048777137e-05, + "loss": 1.975, + "mean_token_accuracy": 0.5431675314903259, + "num_tokens": 2342976584.0, + "step": 4583 + }, + { + "epoch": 1.2395889670091942, + "grad_norm": 1.259027361869812, + "learning_rate": 1.78511988555344e-05, + "loss": 1.9967, + "mean_token_accuracy": 0.550674319267273, + "num_tokens": 2343500843.0, + "step": 4584 + }, + { + "epoch": 1.2398593834505138, + "grad_norm": 1.6745381355285645, + "learning_rate": 1.785017645207674e-05, + "loss": 2.1195, + "mean_token_accuracy": 0.5352946519851685, + "num_tokens": 2344025104.0, + "step": 4585 + }, + { + "epoch": 1.2401297998918335, + "grad_norm": 1.4682027101516724, + "learning_rate": 1.784915383843552e-05, + "loss": 1.986, + "mean_token_accuracy": 0.5368738174438477, + "num_tokens": 2344538074.0, + "step": 4586 + }, + { + "epoch": 1.2404002163331531, + "grad_norm": 1.7062917947769165, + "learning_rate": 1.7848131014642114e-05, + "loss": 2.2041, + "mean_token_accuracy": 0.5282025337219238, + "num_tokens": 2345062238.0, + "step": 4587 + }, + { + "epoch": 1.2406706327744728, + "grad_norm": 1.490770936012268, + "learning_rate": 1.7847107980727913e-05, + "loss": 2.0248, + "mean_token_accuracy": 0.5415754318237305, + "num_tokens": 2345586396.0, + "step": 4588 + }, + { + "epoch": 1.2409410492157924, + "grad_norm": 1.1429047584533691, + "learning_rate": 1.7846084736724302e-05, + "loss": 2.0191, + "mean_token_accuracy": 0.5435396432876587, + "num_tokens": 2346066163.0, + "step": 4589 + }, + { + "epoch": 1.241211465657112, + "grad_norm": 1.5037872791290283, + "learning_rate": 1.7845061282662677e-05, + "loss": 1.9831, + "mean_token_accuracy": 0.5368335247039795, + "num_tokens": 2346548564.0, + "step": 4590 + }, + { + "epoch": 1.2414818820984315, + "grad_norm": 4.285386562347412, + "learning_rate": 1.784403761857444e-05, + "loss": 1.9766, + "mean_token_accuracy": 0.564952552318573, + "num_tokens": 2347072573.0, + "step": 4591 + }, + { + "epoch": 1.241752298539751, + "grad_norm": 1.6205170154571533, + "learning_rate": 1.784301374449101e-05, + "loss": 1.9445, + "mean_token_accuracy": 0.541749119758606, + "num_tokens": 2347596854.0, + "step": 4592 + }, + { + "epoch": 1.2420227149810708, + "grad_norm": 1.4004300832748413, + "learning_rate": 1.784198966044379e-05, + "loss": 1.9628, + "mean_token_accuracy": 0.559431254863739, + "num_tokens": 2348121123.0, + "step": 4593 + }, + { + "epoch": 1.2422931314223904, + "grad_norm": 1.4912421703338623, + "learning_rate": 1.7840965366464202e-05, + "loss": 1.9408, + "mean_token_accuracy": 0.5517412424087524, + "num_tokens": 2348645315.0, + "step": 4594 + }, + { + "epoch": 1.24256354786371, + "grad_norm": 1.2728859186172485, + "learning_rate": 1.7839940862583684e-05, + "loss": 2.0109, + "mean_token_accuracy": 0.5466015934944153, + "num_tokens": 2349169579.0, + "step": 4595 + }, + { + "epoch": 1.2428339643050297, + "grad_norm": 1.3462097644805908, + "learning_rate": 1.7838916148833664e-05, + "loss": 1.8949, + "mean_token_accuracy": 0.5647006034851074, + "num_tokens": 2349693685.0, + "step": 4596 + }, + { + "epoch": 1.2431043807463493, + "grad_norm": 1.2785435914993286, + "learning_rate": 1.7837891225245583e-05, + "loss": 2.0863, + "mean_token_accuracy": 0.5308130979537964, + "num_tokens": 2350217962.0, + "step": 4597 + }, + { + "epoch": 1.243374797187669, + "grad_norm": 1.6340807676315308, + "learning_rate": 1.783686609185089e-05, + "loss": 2.0583, + "mean_token_accuracy": 0.5459429025650024, + "num_tokens": 2350742225.0, + "step": 4598 + }, + { + "epoch": 1.2436452136289886, + "grad_norm": 1.3334099054336548, + "learning_rate": 1.783584074868104e-05, + "loss": 1.9567, + "mean_token_accuracy": 0.5431721210479736, + "num_tokens": 2351266352.0, + "step": 4599 + }, + { + "epoch": 1.2439156300703083, + "grad_norm": 1.3295806646347046, + "learning_rate": 1.7834815195767497e-05, + "loss": 2.0344, + "mean_token_accuracy": 0.5568745136260986, + "num_tokens": 2351773670.0, + "step": 4600 + }, + { + "epoch": 1.244186046511628, + "grad_norm": 0.6679308414459229, + "learning_rate": 1.783378943314172e-05, + "loss": 1.2106, + "mean_token_accuracy": 0.6663312911987305, + "num_tokens": 2352297859.0, + "step": 4601 + }, + { + "epoch": 1.2444564629529475, + "grad_norm": 2.3529860973358154, + "learning_rate": 1.783276346083519e-05, + "loss": 2.1106, + "mean_token_accuracy": 0.5497316122055054, + "num_tokens": 2352772439.0, + "step": 4602 + }, + { + "epoch": 1.2447268793942672, + "grad_norm": 1.6870200634002686, + "learning_rate": 1.7831737278879376e-05, + "loss": 1.9189, + "mean_token_accuracy": 0.5469163060188293, + "num_tokens": 2353296537.0, + "step": 4603 + }, + { + "epoch": 1.2449972958355868, + "grad_norm": 1.6784043312072754, + "learning_rate": 1.7830710887305775e-05, + "loss": 2.0255, + "mean_token_accuracy": 0.5534981489181519, + "num_tokens": 2353809418.0, + "step": 4604 + }, + { + "epoch": 1.2452677122769065, + "grad_norm": 1.7484723329544067, + "learning_rate": 1.782968428614588e-05, + "loss": 1.9718, + "mean_token_accuracy": 0.5357711911201477, + "num_tokens": 2354333632.0, + "step": 4605 + }, + { + "epoch": 1.2455381287182261, + "grad_norm": 1.2957302331924438, + "learning_rate": 1.782865747543118e-05, + "loss": 2.0251, + "mean_token_accuracy": 0.5565014481544495, + "num_tokens": 2354817037.0, + "step": 4606 + }, + { + "epoch": 1.2458085451595458, + "grad_norm": 1.4090532064437866, + "learning_rate": 1.7827630455193192e-05, + "loss": 2.0563, + "mean_token_accuracy": 0.5463632941246033, + "num_tokens": 2355341315.0, + "step": 4607 + }, + { + "epoch": 1.2460789616008654, + "grad_norm": 1.269474744796753, + "learning_rate": 1.7826603225463418e-05, + "loss": 1.8943, + "mean_token_accuracy": 0.566496729850769, + "num_tokens": 2355865516.0, + "step": 4608 + }, + { + "epoch": 1.246349378042185, + "grad_norm": 1.4223177433013916, + "learning_rate": 1.7825575786273386e-05, + "loss": 2.0428, + "mean_token_accuracy": 0.5575190782546997, + "num_tokens": 2356389713.0, + "step": 4609 + }, + { + "epoch": 1.2466197944835047, + "grad_norm": 1.3180643320083618, + "learning_rate": 1.7824548137654614e-05, + "loss": 1.9293, + "mean_token_accuracy": 0.5466127395629883, + "num_tokens": 2356913960.0, + "step": 4610 + }, + { + "epoch": 1.2468902109248243, + "grad_norm": 1.3356274366378784, + "learning_rate": 1.7823520279638635e-05, + "loss": 2.1391, + "mean_token_accuracy": 0.5141310691833496, + "num_tokens": 2357438243.0, + "step": 4611 + }, + { + "epoch": 1.247160627366144, + "grad_norm": 1.165050983428955, + "learning_rate": 1.782249221225699e-05, + "loss": 1.9107, + "mean_token_accuracy": 0.5674197673797607, + "num_tokens": 2357962458.0, + "step": 4612 + }, + { + "epoch": 1.2474310438074636, + "grad_norm": 1.3718807697296143, + "learning_rate": 1.782146393554122e-05, + "loss": 1.9381, + "mean_token_accuracy": 0.5285958647727966, + "num_tokens": 2358486474.0, + "step": 4613 + }, + { + "epoch": 1.247701460248783, + "grad_norm": 1.4289554357528687, + "learning_rate": 1.7820435449522874e-05, + "loss": 2.1809, + "mean_token_accuracy": 0.5253536105155945, + "num_tokens": 2359010716.0, + "step": 4614 + }, + { + "epoch": 1.2479718766901027, + "grad_norm": 1.530003309249878, + "learning_rate": 1.781940675423351e-05, + "loss": 2.0835, + "mean_token_accuracy": 0.539968729019165, + "num_tokens": 2359534982.0, + "step": 4615 + }, + { + "epoch": 1.2482422931314223, + "grad_norm": 1.4525161981582642, + "learning_rate": 1.781837784970469e-05, + "loss": 2.0955, + "mean_token_accuracy": 0.49820035696029663, + "num_tokens": 2360059188.0, + "step": 4616 + }, + { + "epoch": 1.248512709572742, + "grad_norm": 1.3579034805297852, + "learning_rate": 1.7817348735967993e-05, + "loss": 2.1467, + "mean_token_accuracy": 0.5289292335510254, + "num_tokens": 2360583400.0, + "step": 4617 + }, + { + "epoch": 1.2487831260140616, + "grad_norm": 6.496406078338623, + "learning_rate": 1.7816319413054983e-05, + "loss": 2.0107, + "mean_token_accuracy": 0.5556672811508179, + "num_tokens": 2361107599.0, + "step": 4618 + }, + { + "epoch": 1.2490535424553812, + "grad_norm": 1.9021000862121582, + "learning_rate": 1.781528988099725e-05, + "loss": 2.1923, + "mean_token_accuracy": 0.5183747410774231, + "num_tokens": 2361631872.0, + "step": 4619 + }, + { + "epoch": 1.249323958896701, + "grad_norm": 1.7313919067382812, + "learning_rate": 1.7814260139826377e-05, + "loss": 2.071, + "mean_token_accuracy": 0.5358327627182007, + "num_tokens": 2362155962.0, + "step": 4620 + }, + { + "epoch": 1.2495943753380205, + "grad_norm": 0.7536177039146423, + "learning_rate": 1.781323018957397e-05, + "loss": 1.1677, + "mean_token_accuracy": 0.6875962018966675, + "num_tokens": 2362680133.0, + "step": 4621 + }, + { + "epoch": 1.2498647917793402, + "grad_norm": 2.059737205505371, + "learning_rate": 1.781220003027162e-05, + "loss": 2.0744, + "mean_token_accuracy": 0.5503977537155151, + "num_tokens": 2363204236.0, + "step": 4622 + }, + { + "epoch": 1.2501352082206598, + "grad_norm": 1.868972897529602, + "learning_rate": 1.781116966195094e-05, + "loss": 2.1475, + "mean_token_accuracy": 0.521388828754425, + "num_tokens": 2363728517.0, + "step": 4623 + }, + { + "epoch": 1.2504056246619795, + "grad_norm": 1.5851701498031616, + "learning_rate": 1.7810139084643546e-05, + "loss": 2.0073, + "mean_token_accuracy": 0.5622636675834656, + "num_tokens": 2364250734.0, + "step": 4624 + }, + { + "epoch": 1.250676041103299, + "grad_norm": 1.6355952024459839, + "learning_rate": 1.7809108298381058e-05, + "loss": 2.0072, + "mean_token_accuracy": 0.5448300242424011, + "num_tokens": 2364742272.0, + "step": 4625 + }, + { + "epoch": 1.2509464575446188, + "grad_norm": 1.7473399639129639, + "learning_rate": 1.78080773031951e-05, + "loss": 1.9678, + "mean_token_accuracy": 0.580579936504364, + "num_tokens": 2365201221.0, + "step": 4626 + }, + { + "epoch": 1.2512168739859384, + "grad_norm": 1.6515566110610962, + "learning_rate": 1.7807046099117312e-05, + "loss": 1.9878, + "mean_token_accuracy": 0.542798638343811, + "num_tokens": 2365725282.0, + "step": 4627 + }, + { + "epoch": 1.251487290427258, + "grad_norm": 1.7398226261138916, + "learning_rate": 1.780601468617933e-05, + "loss": 2.1069, + "mean_token_accuracy": 0.545626163482666, + "num_tokens": 2366228614.0, + "step": 4628 + }, + { + "epoch": 1.2517577068685777, + "grad_norm": 1.5666805505752563, + "learning_rate": 1.7804983064412803e-05, + "loss": 2.0265, + "mean_token_accuracy": 0.5450761318206787, + "num_tokens": 2366752811.0, + "step": 4629 + }, + { + "epoch": 1.2520281233098973, + "grad_norm": 1.414881944656372, + "learning_rate": 1.780395123384938e-05, + "loss": 1.9358, + "mean_token_accuracy": 0.5484541058540344, + "num_tokens": 2367276914.0, + "step": 4630 + }, + { + "epoch": 1.2522985397512167, + "grad_norm": 1.743645191192627, + "learning_rate": 1.780291919452073e-05, + "loss": 2.0756, + "mean_token_accuracy": 0.5476528406143188, + "num_tokens": 2367755954.0, + "step": 4631 + }, + { + "epoch": 1.2525689561925364, + "grad_norm": 1.2447733879089355, + "learning_rate": 1.780188694645851e-05, + "loss": 2.0816, + "mean_token_accuracy": 0.5459191799163818, + "num_tokens": 2368280164.0, + "step": 4632 + }, + { + "epoch": 1.252839372633856, + "grad_norm": 1.5420494079589844, + "learning_rate": 1.7800854489694397e-05, + "loss": 1.8335, + "mean_token_accuracy": 0.569090723991394, + "num_tokens": 2368804411.0, + "step": 4633 + }, + { + "epoch": 1.2531097890751757, + "grad_norm": 1.5825119018554688, + "learning_rate": 1.7799821824260066e-05, + "loss": 2.0779, + "mean_token_accuracy": 0.5474740266799927, + "num_tokens": 2369328590.0, + "step": 4634 + }, + { + "epoch": 1.2533802055164953, + "grad_norm": 1.4524366855621338, + "learning_rate": 1.7798788950187205e-05, + "loss": 2.0101, + "mean_token_accuracy": 0.5527022480964661, + "num_tokens": 2369852750.0, + "step": 4635 + }, + { + "epoch": 1.253650621957815, + "grad_norm": 1.7603141069412231, + "learning_rate": 1.7797755867507507e-05, + "loss": 2.159, + "mean_token_accuracy": 0.5285670757293701, + "num_tokens": 2370376925.0, + "step": 4636 + }, + { + "epoch": 1.2539210383991346, + "grad_norm": 1.5092194080352783, + "learning_rate": 1.7796722576252667e-05, + "loss": 1.8259, + "mean_token_accuracy": 0.5627765655517578, + "num_tokens": 2370901111.0, + "step": 4637 + }, + { + "epoch": 1.2541914548404542, + "grad_norm": 1.3718997240066528, + "learning_rate": 1.779568907645439e-05, + "loss": 2.0969, + "mean_token_accuracy": 0.5442397594451904, + "num_tokens": 2371365562.0, + "step": 4638 + }, + { + "epoch": 1.2544618712817739, + "grad_norm": 1.730076789855957, + "learning_rate": 1.7794655368144386e-05, + "loss": 2.021, + "mean_token_accuracy": 0.5495345592498779, + "num_tokens": 2371889816.0, + "step": 4639 + }, + { + "epoch": 1.2547322877230935, + "grad_norm": 27.334810256958008, + "learning_rate": 1.7793621451354377e-05, + "loss": 1.9734, + "mean_token_accuracy": 0.5553500652313232, + "num_tokens": 2372413945.0, + "step": 4640 + }, + { + "epoch": 1.2550027041644132, + "grad_norm": 0.6486225724220276, + "learning_rate": 1.7792587326116078e-05, + "loss": 1.1582, + "mean_token_accuracy": 0.6945792436599731, + "num_tokens": 2372830322.0, + "step": 4641 + }, + { + "epoch": 1.2552731206057328, + "grad_norm": 2.0878593921661377, + "learning_rate": 1.779155299246123e-05, + "loss": 2.0068, + "mean_token_accuracy": 0.5329840183258057, + "num_tokens": 2373354482.0, + "step": 4642 + }, + { + "epoch": 1.2555435370470525, + "grad_norm": 1.656785011291504, + "learning_rate": 1.7790518450421558e-05, + "loss": 1.9337, + "mean_token_accuracy": 0.557459831237793, + "num_tokens": 2373878664.0, + "step": 4643 + }, + { + "epoch": 1.255813953488372, + "grad_norm": 1.1521189212799072, + "learning_rate": 1.7789483700028817e-05, + "loss": 1.8596, + "mean_token_accuracy": 0.5901292562484741, + "num_tokens": 2374402932.0, + "step": 4644 + }, + { + "epoch": 1.2560843699296917, + "grad_norm": 1.4870202541351318, + "learning_rate": 1.778844874131474e-05, + "loss": 2.0346, + "mean_token_accuracy": 0.5433812141418457, + "num_tokens": 2374927190.0, + "step": 4645 + }, + { + "epoch": 1.2563547863710114, + "grad_norm": 1.7576303482055664, + "learning_rate": 1.77874135743111e-05, + "loss": 1.9979, + "mean_token_accuracy": 0.5662409067153931, + "num_tokens": 2375451289.0, + "step": 4646 + }, + { + "epoch": 1.256625202812331, + "grad_norm": 1.4955955743789673, + "learning_rate": 1.778637819904965e-05, + "loss": 2.0517, + "mean_token_accuracy": 0.5438516139984131, + "num_tokens": 2375959534.0, + "step": 4647 + }, + { + "epoch": 1.2568956192536507, + "grad_norm": 1.668674349784851, + "learning_rate": 1.7785342615562155e-05, + "loss": 1.9271, + "mean_token_accuracy": 0.5732145309448242, + "num_tokens": 2376483784.0, + "step": 4648 + }, + { + "epoch": 1.2571660356949703, + "grad_norm": 1.6008261442184448, + "learning_rate": 1.77843068238804e-05, + "loss": 2.0505, + "mean_token_accuracy": 0.5387244820594788, + "num_tokens": 2377007970.0, + "step": 4649 + }, + { + "epoch": 1.25743645213629, + "grad_norm": 1.4106664657592773, + "learning_rate": 1.778327082403616e-05, + "loss": 1.9676, + "mean_token_accuracy": 0.5502088069915771, + "num_tokens": 2377507268.0, + "step": 4650 + }, + { + "epoch": 1.2577068685776096, + "grad_norm": 1.6906815767288208, + "learning_rate": 1.778223461606122e-05, + "loss": 2.1763, + "mean_token_accuracy": 0.5542420148849487, + "num_tokens": 2377967850.0, + "step": 4651 + }, + { + "epoch": 1.2579772850189292, + "grad_norm": 1.296078085899353, + "learning_rate": 1.7781198199987375e-05, + "loss": 1.9528, + "mean_token_accuracy": 0.5496459007263184, + "num_tokens": 2378491990.0, + "step": 4652 + }, + { + "epoch": 1.258247701460249, + "grad_norm": 1.2734383344650269, + "learning_rate": 1.7780161575846428e-05, + "loss": 1.9937, + "mean_token_accuracy": 0.5564466714859009, + "num_tokens": 2379008198.0, + "step": 4653 + }, + { + "epoch": 1.2585181179015685, + "grad_norm": 1.3528673648834229, + "learning_rate": 1.7779124743670187e-05, + "loss": 1.9809, + "mean_token_accuracy": 0.5573675632476807, + "num_tokens": 2379497566.0, + "step": 4654 + }, + { + "epoch": 1.2587885343428882, + "grad_norm": 1.4428833723068237, + "learning_rate": 1.777808770349046e-05, + "loss": 2.0727, + "mean_token_accuracy": 0.5488216280937195, + "num_tokens": 2380021838.0, + "step": 4655 + }, + { + "epoch": 1.2590589507842076, + "grad_norm": 1.1146841049194336, + "learning_rate": 1.7777050455339074e-05, + "loss": 1.9966, + "mean_token_accuracy": 0.5567320585250854, + "num_tokens": 2380542193.0, + "step": 4656 + }, + { + "epoch": 1.2593293672255272, + "grad_norm": 1.0638952255249023, + "learning_rate": 1.7776012999247848e-05, + "loss": 1.952, + "mean_token_accuracy": 0.5771276950836182, + "num_tokens": 2381066244.0, + "step": 4657 + }, + { + "epoch": 1.2595997836668469, + "grad_norm": 1.2551277875900269, + "learning_rate": 1.7774975335248614e-05, + "loss": 2.1203, + "mean_token_accuracy": 0.5393120050430298, + "num_tokens": 2381590513.0, + "step": 4658 + }, + { + "epoch": 1.2598702001081665, + "grad_norm": 1.1488555669784546, + "learning_rate": 1.7773937463373213e-05, + "loss": 2.0728, + "mean_token_accuracy": 0.5159136056900024, + "num_tokens": 2382114782.0, + "step": 4659 + }, + { + "epoch": 1.2601406165494862, + "grad_norm": 1.140337347984314, + "learning_rate": 1.777289938365349e-05, + "loss": 1.9401, + "mean_token_accuracy": 0.5209797620773315, + "num_tokens": 2382639002.0, + "step": 4660 + }, + { + "epoch": 1.2604110329908058, + "grad_norm": 0.6243053078651428, + "learning_rate": 1.7771861096121297e-05, + "loss": 1.149, + "mean_token_accuracy": 0.6898492574691772, + "num_tokens": 2383158903.0, + "step": 4661 + }, + { + "epoch": 1.2606814494321255, + "grad_norm": 2.485177755355835, + "learning_rate": 1.7770822600808492e-05, + "loss": 2.0984, + "mean_token_accuracy": 0.532520055770874, + "num_tokens": 2383683042.0, + "step": 4662 + }, + { + "epoch": 1.260951865873445, + "grad_norm": 2.063748836517334, + "learning_rate": 1.7769783897746935e-05, + "loss": 2.1657, + "mean_token_accuracy": 0.527321994304657, + "num_tokens": 2384207221.0, + "step": 4663 + }, + { + "epoch": 1.2612222823147647, + "grad_norm": 1.478980541229248, + "learning_rate": 1.77687449869685e-05, + "loss": 2.0567, + "mean_token_accuracy": 0.5400721430778503, + "num_tokens": 2384731303.0, + "step": 4664 + }, + { + "epoch": 1.2614926987560844, + "grad_norm": 1.5157197713851929, + "learning_rate": 1.7767705868505063e-05, + "loss": 2.0409, + "mean_token_accuracy": 0.5400298237800598, + "num_tokens": 2385255471.0, + "step": 4665 + }, + { + "epoch": 1.261763115197404, + "grad_norm": 1.3915265798568726, + "learning_rate": 1.776666654238851e-05, + "loss": 1.8462, + "mean_token_accuracy": 0.5628527402877808, + "num_tokens": 2385779678.0, + "step": 4666 + }, + { + "epoch": 1.2620335316387237, + "grad_norm": 1.9196784496307373, + "learning_rate": 1.776562700865073e-05, + "loss": 2.0369, + "mean_token_accuracy": 0.551143229007721, + "num_tokens": 2386299861.0, + "step": 4667 + }, + { + "epoch": 1.2623039480800433, + "grad_norm": 1.8086329698562622, + "learning_rate": 1.776458726732361e-05, + "loss": 2.1465, + "mean_token_accuracy": 0.533748209476471, + "num_tokens": 2386824132.0, + "step": 4668 + }, + { + "epoch": 1.262574364521363, + "grad_norm": 1.9183844327926636, + "learning_rate": 1.7763547318439064e-05, + "loss": 2.0084, + "mean_token_accuracy": 0.5489486455917358, + "num_tokens": 2387348311.0, + "step": 4669 + }, + { + "epoch": 1.2628447809626826, + "grad_norm": 1.7885408401489258, + "learning_rate": 1.7762507162028994e-05, + "loss": 2.0606, + "mean_token_accuracy": 0.5421655178070068, + "num_tokens": 2387872453.0, + "step": 4670 + }, + { + "epoch": 1.2631151974040022, + "grad_norm": 17.22848129272461, + "learning_rate": 1.776146679812532e-05, + "loss": 1.7332, + "mean_token_accuracy": 0.6035634875297546, + "num_tokens": 2388348159.0, + "step": 4671 + }, + { + "epoch": 1.2633856138453217, + "grad_norm": 1.3895955085754395, + "learning_rate": 1.7760426226759954e-05, + "loss": 1.9749, + "mean_token_accuracy": 0.5482447147369385, + "num_tokens": 2388872412.0, + "step": 4672 + }, + { + "epoch": 1.2636560302866413, + "grad_norm": 1.5842071771621704, + "learning_rate": 1.7759385447964833e-05, + "loss": 1.9834, + "mean_token_accuracy": 0.5380282402038574, + "num_tokens": 2389396611.0, + "step": 4673 + }, + { + "epoch": 1.263926446727961, + "grad_norm": 1.1851861476898193, + "learning_rate": 1.7758344461771893e-05, + "loss": 1.9913, + "mean_token_accuracy": 0.5358935594558716, + "num_tokens": 2389920890.0, + "step": 4674 + }, + { + "epoch": 1.2641968631692806, + "grad_norm": 1.2188057899475098, + "learning_rate": 1.7757303268213066e-05, + "loss": 2.0126, + "mean_token_accuracy": 0.5278946161270142, + "num_tokens": 2390445127.0, + "step": 4675 + }, + { + "epoch": 1.2644672796106002, + "grad_norm": 1.3949198722839355, + "learning_rate": 1.7756261867320306e-05, + "loss": 2.0371, + "mean_token_accuracy": 0.5606876015663147, + "num_tokens": 2390969334.0, + "step": 4676 + }, + { + "epoch": 1.2647376960519199, + "grad_norm": 1.2940531969070435, + "learning_rate": 1.775522025912556e-05, + "loss": 2.0058, + "mean_token_accuracy": 0.5628547668457031, + "num_tokens": 2391463478.0, + "step": 4677 + }, + { + "epoch": 1.2650081124932395, + "grad_norm": 1.3355681896209717, + "learning_rate": 1.775417844366079e-05, + "loss": 2.0778, + "mean_token_accuracy": 0.5331701040267944, + "num_tokens": 2391987659.0, + "step": 4678 + }, + { + "epoch": 1.2652785289345592, + "grad_norm": 1.3766506910324097, + "learning_rate": 1.7753136420957963e-05, + "loss": 2.0634, + "mean_token_accuracy": 0.5323007702827454, + "num_tokens": 2392511837.0, + "step": 4679 + }, + { + "epoch": 1.2655489453758788, + "grad_norm": 1.4728202819824219, + "learning_rate": 1.7752094191049053e-05, + "loss": 2.1257, + "mean_token_accuracy": 0.5458230972290039, + "num_tokens": 2393036009.0, + "step": 4680 + }, + { + "epoch": 1.2658193618171985, + "grad_norm": 0.7927582859992981, + "learning_rate": 1.7751051753966035e-05, + "loss": 1.161, + "mean_token_accuracy": 0.6839281320571899, + "num_tokens": 2393542650.0, + "step": 4681 + }, + { + "epoch": 1.266089778258518, + "grad_norm": 2.2238545417785645, + "learning_rate": 1.7750009109740897e-05, + "loss": 2.126, + "mean_token_accuracy": 0.5247921943664551, + "num_tokens": 2394017425.0, + "step": 4682 + }, + { + "epoch": 1.2663601946998377, + "grad_norm": 1.8786836862564087, + "learning_rate": 1.7748966258405628e-05, + "loss": 1.9184, + "mean_token_accuracy": 0.557225227355957, + "num_tokens": 2394541547.0, + "step": 4683 + }, + { + "epoch": 1.2666306111411574, + "grad_norm": 1.4014647006988525, + "learning_rate": 1.7747923199992227e-05, + "loss": 1.9553, + "mean_token_accuracy": 0.5543360114097595, + "num_tokens": 2395065632.0, + "step": 4684 + }, + { + "epoch": 1.266901027582477, + "grad_norm": 1.9227205514907837, + "learning_rate": 1.7746879934532698e-05, + "loss": 1.9064, + "mean_token_accuracy": 0.5533777475357056, + "num_tokens": 2395530559.0, + "step": 4685 + }, + { + "epoch": 1.2671714440237967, + "grad_norm": 1.656639814376831, + "learning_rate": 1.7745836462059048e-05, + "loss": 2.1529, + "mean_token_accuracy": 0.5308263301849365, + "num_tokens": 2396054799.0, + "step": 4686 + }, + { + "epoch": 1.2674418604651163, + "grad_norm": 1.6237038373947144, + "learning_rate": 1.7744792782603303e-05, + "loss": 2.0196, + "mean_token_accuracy": 0.5410213470458984, + "num_tokens": 2396578996.0, + "step": 4687 + }, + { + "epoch": 1.267712276906436, + "grad_norm": 1.6707046031951904, + "learning_rate": 1.7743748896197476e-05, + "loss": 2.0849, + "mean_token_accuracy": 0.5204009413719177, + "num_tokens": 2397103267.0, + "step": 4688 + }, + { + "epoch": 1.2679826933477556, + "grad_norm": 1.4643316268920898, + "learning_rate": 1.7742704802873602e-05, + "loss": 1.9274, + "mean_token_accuracy": 0.5630131959915161, + "num_tokens": 2397618188.0, + "step": 4689 + }, + { + "epoch": 1.2682531097890752, + "grad_norm": 1.3267149925231934, + "learning_rate": 1.7741660502663717e-05, + "loss": 2.0405, + "mean_token_accuracy": 0.534767746925354, + "num_tokens": 2398142383.0, + "step": 4690 + }, + { + "epoch": 1.2685235262303949, + "grad_norm": 1.4683637619018555, + "learning_rate": 1.774061599559986e-05, + "loss": 1.9685, + "mean_token_accuracy": 0.5525557994842529, + "num_tokens": 2398666662.0, + "step": 4691 + }, + { + "epoch": 1.2687939426717145, + "grad_norm": 1.5198311805725098, + "learning_rate": 1.7739571281714084e-05, + "loss": 2.0524, + "mean_token_accuracy": 0.5394909381866455, + "num_tokens": 2399190846.0, + "step": 4692 + }, + { + "epoch": 1.2690643591130342, + "grad_norm": 1.3462450504302979, + "learning_rate": 1.7738526361038437e-05, + "loss": 2.1457, + "mean_token_accuracy": 0.523409366607666, + "num_tokens": 2399714938.0, + "step": 4693 + }, + { + "epoch": 1.2693347755543538, + "grad_norm": 1.2396485805511475, + "learning_rate": 1.7737481233604987e-05, + "loss": 2.0291, + "mean_token_accuracy": 0.5452040433883667, + "num_tokens": 2400239203.0, + "step": 4694 + }, + { + "epoch": 1.2696051919956735, + "grad_norm": 1.26007080078125, + "learning_rate": 1.7736435899445802e-05, + "loss": 2.0619, + "mean_token_accuracy": 0.5396113991737366, + "num_tokens": 2400763434.0, + "step": 4695 + }, + { + "epoch": 1.269875608436993, + "grad_norm": 1.306890606880188, + "learning_rate": 1.773539035859295e-05, + "loss": 2.0648, + "mean_token_accuracy": 0.5360788106918335, + "num_tokens": 2401287713.0, + "step": 4696 + }, + { + "epoch": 1.2701460248783125, + "grad_norm": 1.373748540878296, + "learning_rate": 1.773434461107851e-05, + "loss": 2.0668, + "mean_token_accuracy": 0.5355650186538696, + "num_tokens": 2401811934.0, + "step": 4697 + }, + { + "epoch": 1.2704164413196322, + "grad_norm": 1.6342419385910034, + "learning_rate": 1.773329865693458e-05, + "loss": 2.1462, + "mean_token_accuracy": 0.5273525714874268, + "num_tokens": 2402336216.0, + "step": 4698 + }, + { + "epoch": 1.2706868577609518, + "grad_norm": 1.509851336479187, + "learning_rate": 1.7732252496193244e-05, + "loss": 2.0388, + "mean_token_accuracy": 0.543563961982727, + "num_tokens": 2402843279.0, + "step": 4699 + }, + { + "epoch": 1.2709572742022714, + "grad_norm": 1.4428189992904663, + "learning_rate": 1.77312061288866e-05, + "loss": 1.9919, + "mean_token_accuracy": 0.5502579212188721, + "num_tokens": 2403367410.0, + "step": 4700 + }, + { + "epoch": 1.271227690643591, + "grad_norm": 0.6089760661125183, + "learning_rate": 1.773015955504676e-05, + "loss": 1.1533, + "mean_token_accuracy": 0.7017143964767456, + "num_tokens": 2403891682.0, + "step": 4701 + }, + { + "epoch": 1.2714981070849107, + "grad_norm": 1.6178393363952637, + "learning_rate": 1.7729112774705833e-05, + "loss": 2.0133, + "mean_token_accuracy": 0.5287567973136902, + "num_tokens": 2404415961.0, + "step": 4702 + }, + { + "epoch": 1.2717685235262304, + "grad_norm": 1.448656439781189, + "learning_rate": 1.7728065787895937e-05, + "loss": 2.0968, + "mean_token_accuracy": 0.5406985282897949, + "num_tokens": 2404940227.0, + "step": 4703 + }, + { + "epoch": 1.27203893996755, + "grad_norm": 1.2926424741744995, + "learning_rate": 1.7727018594649195e-05, + "loss": 1.961, + "mean_token_accuracy": 0.5718903541564941, + "num_tokens": 2405464353.0, + "step": 4704 + }, + { + "epoch": 1.2723093564088697, + "grad_norm": 1.541682481765747, + "learning_rate": 1.772597119499774e-05, + "loss": 2.0166, + "mean_token_accuracy": 0.549522876739502, + "num_tokens": 2405929688.0, + "step": 4705 + }, + { + "epoch": 1.2725797728501893, + "grad_norm": 1.2996761798858643, + "learning_rate": 1.7724923588973707e-05, + "loss": 2.0678, + "mean_token_accuracy": 0.5359668731689453, + "num_tokens": 2406453908.0, + "step": 4706 + }, + { + "epoch": 1.272850189291509, + "grad_norm": 1.4131956100463867, + "learning_rate": 1.7723875776609245e-05, + "loss": 1.9536, + "mean_token_accuracy": 0.5335902571678162, + "num_tokens": 2406978134.0, + "step": 4707 + }, + { + "epoch": 1.2731206057328286, + "grad_norm": 1.3485996723175049, + "learning_rate": 1.77228277579365e-05, + "loss": 2.1503, + "mean_token_accuracy": 0.5282558798789978, + "num_tokens": 2407502407.0, + "step": 4708 + }, + { + "epoch": 1.2733910221741482, + "grad_norm": 1.3538615703582764, + "learning_rate": 1.7721779532987628e-05, + "loss": 2.1619, + "mean_token_accuracy": 0.5141135454177856, + "num_tokens": 2408026624.0, + "step": 4709 + }, + { + "epoch": 1.2736614386154679, + "grad_norm": 1.6285369396209717, + "learning_rate": 1.7720731101794793e-05, + "loss": 2.0846, + "mean_token_accuracy": 0.5456728339195251, + "num_tokens": 2408550874.0, + "step": 4710 + }, + { + "epoch": 1.2739318550567875, + "grad_norm": 1.6485426425933838, + "learning_rate": 1.7719682464390164e-05, + "loss": 1.9801, + "mean_token_accuracy": 0.5527878999710083, + "num_tokens": 2409075054.0, + "step": 4711 + }, + { + "epoch": 1.2742022714981072, + "grad_norm": 1.6072115898132324, + "learning_rate": 1.771863362080591e-05, + "loss": 1.9559, + "mean_token_accuracy": 0.5702950358390808, + "num_tokens": 2409599322.0, + "step": 4712 + }, + { + "epoch": 1.2744726879394266, + "grad_norm": 1.1349151134490967, + "learning_rate": 1.7717584571074225e-05, + "loss": 1.9256, + "mean_token_accuracy": 0.5577443242073059, + "num_tokens": 2410123522.0, + "step": 4713 + }, + { + "epoch": 1.2747431043807462, + "grad_norm": 1.4838002920150757, + "learning_rate": 1.7716535315227287e-05, + "loss": 2.1085, + "mean_token_accuracy": 0.5272976756095886, + "num_tokens": 2410647770.0, + "step": 4714 + }, + { + "epoch": 1.2750135208220659, + "grad_norm": 1.233656883239746, + "learning_rate": 1.7715485853297297e-05, + "loss": 2.0733, + "mean_token_accuracy": 0.5448183417320251, + "num_tokens": 2411171934.0, + "step": 4715 + }, + { + "epoch": 1.2752839372633855, + "grad_norm": 1.3927809000015259, + "learning_rate": 1.771443618531645e-05, + "loss": 2.0326, + "mean_token_accuracy": 0.5308258533477783, + "num_tokens": 2411696195.0, + "step": 4716 + }, + { + "epoch": 1.2755543537047052, + "grad_norm": 1.111379623413086, + "learning_rate": 1.7713386311316955e-05, + "loss": 1.9939, + "mean_token_accuracy": 0.5443510413169861, + "num_tokens": 2412220403.0, + "step": 4717 + }, + { + "epoch": 1.2758247701460248, + "grad_norm": 1.2533938884735107, + "learning_rate": 1.7712336231331025e-05, + "loss": 2.0537, + "mean_token_accuracy": 0.5506841540336609, + "num_tokens": 2412713682.0, + "step": 4718 + }, + { + "epoch": 1.2760951865873444, + "grad_norm": 1.2131316661834717, + "learning_rate": 1.771128594539088e-05, + "loss": 2.0251, + "mean_token_accuracy": 0.549591064453125, + "num_tokens": 2413237942.0, + "step": 4719 + }, + { + "epoch": 1.276365603028664, + "grad_norm": 1.2819546461105347, + "learning_rate": 1.7710235453528743e-05, + "loss": 2.1284, + "mean_token_accuracy": 0.5191435813903809, + "num_tokens": 2413762195.0, + "step": 4720 + }, + { + "epoch": 1.2766360194699837, + "grad_norm": 0.9595822691917419, + "learning_rate": 1.770918475577685e-05, + "loss": 1.1245, + "mean_token_accuracy": 0.708066463470459, + "num_tokens": 2414248833.0, + "step": 4721 + }, + { + "epoch": 1.2769064359113034, + "grad_norm": 2.0856099128723145, + "learning_rate": 1.770813385216744e-05, + "loss": 2.081, + "mean_token_accuracy": 0.5583502054214478, + "num_tokens": 2414708185.0, + "step": 4722 + }, + { + "epoch": 1.277176852352623, + "grad_norm": 1.5820858478546143, + "learning_rate": 1.770708274273276e-05, + "loss": 1.957, + "mean_token_accuracy": 0.5744572877883911, + "num_tokens": 2415205244.0, + "step": 4723 + }, + { + "epoch": 1.2774472687939427, + "grad_norm": 1.213975191116333, + "learning_rate": 1.7706031427505048e-05, + "loss": 2.0185, + "mean_token_accuracy": 0.5480387210845947, + "num_tokens": 2415721647.0, + "step": 4724 + }, + { + "epoch": 1.2777176852352623, + "grad_norm": 1.5494176149368286, + "learning_rate": 1.7704979906516578e-05, + "loss": 1.9487, + "mean_token_accuracy": 0.5687854886054993, + "num_tokens": 2416245810.0, + "step": 4725 + }, + { + "epoch": 1.277988101676582, + "grad_norm": 1.323056697845459, + "learning_rate": 1.77039281797996e-05, + "loss": 2.0721, + "mean_token_accuracy": 0.5424289107322693, + "num_tokens": 2416750394.0, + "step": 4726 + }, + { + "epoch": 1.2782585181179016, + "grad_norm": 1.1246289014816284, + "learning_rate": 1.7702876247386395e-05, + "loss": 2.0057, + "mean_token_accuracy": 0.5419687628746033, + "num_tokens": 2417274498.0, + "step": 4727 + }, + { + "epoch": 1.2785289345592212, + "grad_norm": 1.4070316553115845, + "learning_rate": 1.7701824109309232e-05, + "loss": 1.9303, + "mean_token_accuracy": 0.5708545446395874, + "num_tokens": 2417798641.0, + "step": 4728 + }, + { + "epoch": 1.2787993510005409, + "grad_norm": 1.2661280632019043, + "learning_rate": 1.7700771765600398e-05, + "loss": 2.1737, + "mean_token_accuracy": 0.5208339095115662, + "num_tokens": 2418322908.0, + "step": 4729 + }, + { + "epoch": 1.2790697674418605, + "grad_norm": 1.3092739582061768, + "learning_rate": 1.7699719216292185e-05, + "loss": 1.9852, + "mean_token_accuracy": 0.5493489503860474, + "num_tokens": 2418815983.0, + "step": 4730 + }, + { + "epoch": 1.2793401838831802, + "grad_norm": 1.1936311721801758, + "learning_rate": 1.769866646141688e-05, + "loss": 2.1343, + "mean_token_accuracy": 0.5339073538780212, + "num_tokens": 2419340126.0, + "step": 4731 + }, + { + "epoch": 1.2796106003244998, + "grad_norm": 1.188632845878601, + "learning_rate": 1.769761350100679e-05, + "loss": 2.0923, + "mean_token_accuracy": 0.5371673703193665, + "num_tokens": 2419864327.0, + "step": 4732 + }, + { + "epoch": 1.2798810167658194, + "grad_norm": 1.6537165641784668, + "learning_rate": 1.7696560335094216e-05, + "loss": 2.08, + "mean_token_accuracy": 0.5324182510375977, + "num_tokens": 2420388575.0, + "step": 4733 + }, + { + "epoch": 1.280151433207139, + "grad_norm": 1.308835744857788, + "learning_rate": 1.7695506963711485e-05, + "loss": 1.9126, + "mean_token_accuracy": 0.5587738156318665, + "num_tokens": 2420912736.0, + "step": 4734 + }, + { + "epoch": 1.2804218496484587, + "grad_norm": 1.1542954444885254, + "learning_rate": 1.7694453386890904e-05, + "loss": 2.1277, + "mean_token_accuracy": 0.5224484205245972, + "num_tokens": 2421437007.0, + "step": 4735 + }, + { + "epoch": 1.2806922660897784, + "grad_norm": 1.4345601797103882, + "learning_rate": 1.769339960466481e-05, + "loss": 1.9441, + "mean_token_accuracy": 0.5537549257278442, + "num_tokens": 2421961003.0, + "step": 4736 + }, + { + "epoch": 1.280962682531098, + "grad_norm": 1.3845864534378052, + "learning_rate": 1.7692345617065528e-05, + "loss": 1.9645, + "mean_token_accuracy": 0.5632320642471313, + "num_tokens": 2422471872.0, + "step": 4737 + }, + { + "epoch": 1.2812330989724177, + "grad_norm": 1.3639882802963257, + "learning_rate": 1.7691291424125405e-05, + "loss": 2.0604, + "mean_token_accuracy": 0.5524234771728516, + "num_tokens": 2422935404.0, + "step": 4738 + }, + { + "epoch": 1.281503515413737, + "grad_norm": 1.6115546226501465, + "learning_rate": 1.7690237025876782e-05, + "loss": 2.0979, + "mean_token_accuracy": 0.5341070890426636, + "num_tokens": 2423459672.0, + "step": 4739 + }, + { + "epoch": 1.2817739318550567, + "grad_norm": 1.4163925647735596, + "learning_rate": 1.7689182422352014e-05, + "loss": 2.0645, + "mean_token_accuracy": 0.5422203540802002, + "num_tokens": 2423983807.0, + "step": 4740 + }, + { + "epoch": 1.2820443482963764, + "grad_norm": 0.6878334283828735, + "learning_rate": 1.7688127613583455e-05, + "loss": 1.2135, + "mean_token_accuracy": 0.6894477605819702, + "num_tokens": 2424468508.0, + "step": 4741 + }, + { + "epoch": 1.282314764737696, + "grad_norm": 1.8348774909973145, + "learning_rate": 1.7687072599603475e-05, + "loss": 2.0715, + "mean_token_accuracy": 0.5309659242630005, + "num_tokens": 2424992781.0, + "step": 4742 + }, + { + "epoch": 1.2825851811790157, + "grad_norm": 1.716579556465149, + "learning_rate": 1.768601738044444e-05, + "loss": 2.0767, + "mean_token_accuracy": 0.5360937118530273, + "num_tokens": 2425512460.0, + "step": 4743 + }, + { + "epoch": 1.2828555976203353, + "grad_norm": 1.3429844379425049, + "learning_rate": 1.768496195613873e-05, + "loss": 2.0805, + "mean_token_accuracy": 0.5326191782951355, + "num_tokens": 2426036703.0, + "step": 4744 + }, + { + "epoch": 1.283126014061655, + "grad_norm": 1.5667014122009277, + "learning_rate": 1.768390632671873e-05, + "loss": 2.14, + "mean_token_accuracy": 0.5352566242218018, + "num_tokens": 2426560981.0, + "step": 4745 + }, + { + "epoch": 1.2833964305029746, + "grad_norm": 1.4533709287643433, + "learning_rate": 1.7682850492216825e-05, + "loss": 1.9171, + "mean_token_accuracy": 0.5539118051528931, + "num_tokens": 2427076263.0, + "step": 4746 + }, + { + "epoch": 1.2836668469442942, + "grad_norm": 1.6150449514389038, + "learning_rate": 1.7681794452665416e-05, + "loss": 2.1116, + "mean_token_accuracy": 0.5357566475868225, + "num_tokens": 2427600467.0, + "step": 4747 + }, + { + "epoch": 1.2839372633856139, + "grad_norm": 1.4232532978057861, + "learning_rate": 1.7680738208096905e-05, + "loss": 2.0968, + "mean_token_accuracy": 0.5345154404640198, + "num_tokens": 2428124668.0, + "step": 4748 + }, + { + "epoch": 1.2842076798269335, + "grad_norm": 1.4073448181152344, + "learning_rate": 1.7679681758543694e-05, + "loss": 2.0001, + "mean_token_accuracy": 0.5454404950141907, + "num_tokens": 2428648888.0, + "step": 4749 + }, + { + "epoch": 1.2844780962682532, + "grad_norm": 1.5822087526321411, + "learning_rate": 1.7678625104038206e-05, + "loss": 2.0693, + "mean_token_accuracy": 0.5442275404930115, + "num_tokens": 2429152506.0, + "step": 4750 + }, + { + "epoch": 1.2847485127095728, + "grad_norm": 1.6840379238128662, + "learning_rate": 1.767756824461286e-05, + "loss": 2.1219, + "mean_token_accuracy": 0.5394988656044006, + "num_tokens": 2429623583.0, + "step": 4751 + }, + { + "epoch": 1.2850189291508924, + "grad_norm": 1.5137519836425781, + "learning_rate": 1.7676511180300083e-05, + "loss": 2.0084, + "mean_token_accuracy": 0.5250999331474304, + "num_tokens": 2430147811.0, + "step": 4752 + }, + { + "epoch": 1.285289345592212, + "grad_norm": 1.3578826189041138, + "learning_rate": 1.7675453911132306e-05, + "loss": 2.0569, + "mean_token_accuracy": 0.5567405223846436, + "num_tokens": 2430632960.0, + "step": 4753 + }, + { + "epoch": 1.2855597620335315, + "grad_norm": 1.3224709033966064, + "learning_rate": 1.7674396437141975e-05, + "loss": 1.9927, + "mean_token_accuracy": 0.5518571734428406, + "num_tokens": 2431157139.0, + "step": 4754 + }, + { + "epoch": 1.2858301784748511, + "grad_norm": 1.2677053213119507, + "learning_rate": 1.7673338758361526e-05, + "loss": 1.9456, + "mean_token_accuracy": 0.5432493686676025, + "num_tokens": 2431681182.0, + "step": 4755 + }, + { + "epoch": 1.2861005949161708, + "grad_norm": 1.4240642786026, + "learning_rate": 1.7672280874823426e-05, + "loss": 2.0356, + "mean_token_accuracy": 0.5330284833908081, + "num_tokens": 2432205406.0, + "step": 4756 + }, + { + "epoch": 1.2863710113574904, + "grad_norm": 1.4525090456008911, + "learning_rate": 1.7671222786560126e-05, + "loss": 2.0477, + "mean_token_accuracy": 0.547167181968689, + "num_tokens": 2432729679.0, + "step": 4757 + }, + { + "epoch": 1.28664142779881, + "grad_norm": 1.4773213863372803, + "learning_rate": 1.767016449360409e-05, + "loss": 1.9796, + "mean_token_accuracy": 0.5563034415245056, + "num_tokens": 2433210304.0, + "step": 4758 + }, + { + "epoch": 1.2869118442401297, + "grad_norm": 1.0985244512557983, + "learning_rate": 1.7669105995987794e-05, + "loss": 2.0539, + "mean_token_accuracy": 0.5372049808502197, + "num_tokens": 2433734553.0, + "step": 4759 + }, + { + "epoch": 1.2871822606814494, + "grad_norm": 1.3373850584030151, + "learning_rate": 1.766804729374371e-05, + "loss": 2.043, + "mean_token_accuracy": 0.5408514738082886, + "num_tokens": 2434258808.0, + "step": 4760 + }, + { + "epoch": 1.287452677122769, + "grad_norm": 0.729849636554718, + "learning_rate": 1.7666988386904328e-05, + "loss": 1.2255, + "mean_token_accuracy": 0.6775733232498169, + "num_tokens": 2434783072.0, + "step": 4761 + }, + { + "epoch": 1.2877230935640886, + "grad_norm": 1.6657664775848389, + "learning_rate": 1.7665929275502137e-05, + "loss": 2.0315, + "mean_token_accuracy": 0.5445611476898193, + "num_tokens": 2435284353.0, + "step": 4762 + }, + { + "epoch": 1.2879935100054083, + "grad_norm": 1.6031843423843384, + "learning_rate": 1.7664869959569632e-05, + "loss": 2.0971, + "mean_token_accuracy": 0.5248252153396606, + "num_tokens": 2435808598.0, + "step": 4763 + }, + { + "epoch": 1.288263926446728, + "grad_norm": 1.4018816947937012, + "learning_rate": 1.7663810439139318e-05, + "loss": 2.0296, + "mean_token_accuracy": 0.5377523303031921, + "num_tokens": 2436332812.0, + "step": 4764 + }, + { + "epoch": 1.2885343428880476, + "grad_norm": 1.3220571279525757, + "learning_rate": 1.7662750714243697e-05, + "loss": 1.9242, + "mean_token_accuracy": 0.5503876805305481, + "num_tokens": 2436857021.0, + "step": 4765 + }, + { + "epoch": 1.2888047593293672, + "grad_norm": 1.4528928995132446, + "learning_rate": 1.7661690784915295e-05, + "loss": 2.0344, + "mean_token_accuracy": 0.5549107789993286, + "num_tokens": 2437381308.0, + "step": 4766 + }, + { + "epoch": 1.2890751757706869, + "grad_norm": 1.7921385765075684, + "learning_rate": 1.7660630651186628e-05, + "loss": 2.0276, + "mean_token_accuracy": 0.5593496561050415, + "num_tokens": 2437842279.0, + "step": 4767 + }, + { + "epoch": 1.2893455922120065, + "grad_norm": 1.4598766565322876, + "learning_rate": 1.7659570313090226e-05, + "loss": 1.9731, + "mean_token_accuracy": 0.5537591576576233, + "num_tokens": 2438366375.0, + "step": 4768 + }, + { + "epoch": 1.2896160086533262, + "grad_norm": 1.321247935295105, + "learning_rate": 1.765850977065862e-05, + "loss": 2.0771, + "mean_token_accuracy": 0.5484252572059631, + "num_tokens": 2438890643.0, + "step": 4769 + }, + { + "epoch": 1.2898864250946458, + "grad_norm": 1.4266598224639893, + "learning_rate": 1.7657449023924353e-05, + "loss": 2.0361, + "mean_token_accuracy": 0.5335312485694885, + "num_tokens": 2439371687.0, + "step": 4770 + }, + { + "epoch": 1.2901568415359654, + "grad_norm": 1.400164008140564, + "learning_rate": 1.7656388072919972e-05, + "loss": 1.9498, + "mean_token_accuracy": 0.5711861252784729, + "num_tokens": 2439895895.0, + "step": 4771 + }, + { + "epoch": 1.290427257977285, + "grad_norm": 1.46565842628479, + "learning_rate": 1.765532691767803e-05, + "loss": 2.0763, + "mean_token_accuracy": 0.537755012512207, + "num_tokens": 2440420100.0, + "step": 4772 + }, + { + "epoch": 1.2906976744186047, + "grad_norm": 1.9237124919891357, + "learning_rate": 1.765426555823109e-05, + "loss": 2.0175, + "mean_token_accuracy": 0.5465096235275269, + "num_tokens": 2440944208.0, + "step": 4773 + }, + { + "epoch": 1.2909680908599244, + "grad_norm": 1.4809271097183228, + "learning_rate": 1.7653203994611708e-05, + "loss": 2.0863, + "mean_token_accuracy": 0.5624680519104004, + "num_tokens": 2441404880.0, + "step": 4774 + }, + { + "epoch": 1.291238507301244, + "grad_norm": 1.7305134534835815, + "learning_rate": 1.7652142226852464e-05, + "loss": 2.0528, + "mean_token_accuracy": 0.5607687830924988, + "num_tokens": 2441901541.0, + "step": 4775 + }, + { + "epoch": 1.2915089237425637, + "grad_norm": 1.3076345920562744, + "learning_rate": 1.7651080254985933e-05, + "loss": 1.9331, + "mean_token_accuracy": 0.5586947202682495, + "num_tokens": 2442406665.0, + "step": 4776 + }, + { + "epoch": 1.2917793401838833, + "grad_norm": 1.0343188047409058, + "learning_rate": 1.7650018079044703e-05, + "loss": 1.9269, + "mean_token_accuracy": 0.5601668357849121, + "num_tokens": 2442930881.0, + "step": 4777 + }, + { + "epoch": 1.292049756625203, + "grad_norm": 1.890662670135498, + "learning_rate": 1.764895569906136e-05, + "loss": 2.1845, + "mean_token_accuracy": 0.5401132106781006, + "num_tokens": 2443396935.0, + "step": 4778 + }, + { + "epoch": 1.2923201730665226, + "grad_norm": 1.3417456150054932, + "learning_rate": 1.7647893115068503e-05, + "loss": 2.1156, + "mean_token_accuracy": 0.5386483669281006, + "num_tokens": 2443921134.0, + "step": 4779 + }, + { + "epoch": 1.292590589507842, + "grad_norm": 1.3558759689331055, + "learning_rate": 1.7646830327098734e-05, + "loss": 1.9382, + "mean_token_accuracy": 0.537535548210144, + "num_tokens": 2444445280.0, + "step": 4780 + }, + { + "epoch": 1.2928610059491616, + "grad_norm": 0.8236724138259888, + "learning_rate": 1.7645767335184662e-05, + "loss": 1.2654, + "mean_token_accuracy": 0.6608777046203613, + "num_tokens": 2444929805.0, + "step": 4781 + }, + { + "epoch": 1.2931314223904813, + "grad_norm": 2.4716339111328125, + "learning_rate": 1.7644704139358907e-05, + "loss": 1.898, + "mean_token_accuracy": 0.5392470359802246, + "num_tokens": 2445453857.0, + "step": 4782 + }, + { + "epoch": 1.293401838831801, + "grad_norm": 2.261927604675293, + "learning_rate": 1.764364073965409e-05, + "loss": 2.0131, + "mean_token_accuracy": 0.5867160558700562, + "num_tokens": 2445913837.0, + "step": 4783 + }, + { + "epoch": 1.2936722552731206, + "grad_norm": 1.5215200185775757, + "learning_rate": 1.7642577136102833e-05, + "loss": 2.0281, + "mean_token_accuracy": 0.5519987344741821, + "num_tokens": 2446438021.0, + "step": 4784 + }, + { + "epoch": 1.2939426717144402, + "grad_norm": 2.4725046157836914, + "learning_rate": 1.7641513328737775e-05, + "loss": 2.0397, + "mean_token_accuracy": 0.5513885021209717, + "num_tokens": 2446928906.0, + "step": 4785 + }, + { + "epoch": 1.2942130881557599, + "grad_norm": 2.4180140495300293, + "learning_rate": 1.764044931759156e-05, + "loss": 2.1293, + "mean_token_accuracy": 0.5250964164733887, + "num_tokens": 2447452885.0, + "step": 4786 + }, + { + "epoch": 1.2944835045970795, + "grad_norm": 1.7744804620742798, + "learning_rate": 1.763938510269683e-05, + "loss": 2.1206, + "mean_token_accuracy": 0.5302643775939941, + "num_tokens": 2447977084.0, + "step": 4787 + }, + { + "epoch": 1.2947539210383991, + "grad_norm": 1.6846545934677124, + "learning_rate": 1.763832068408624e-05, + "loss": 2.0778, + "mean_token_accuracy": 0.5146735310554504, + "num_tokens": 2448501236.0, + "step": 4788 + }, + { + "epoch": 1.2950243374797188, + "grad_norm": 1.6192634105682373, + "learning_rate": 1.763725606179245e-05, + "loss": 2.1373, + "mean_token_accuracy": 0.511945366859436, + "num_tokens": 2449025398.0, + "step": 4789 + }, + { + "epoch": 1.2952947539210384, + "grad_norm": 1.5381934642791748, + "learning_rate": 1.7636191235848126e-05, + "loss": 1.9482, + "mean_token_accuracy": 0.5695314407348633, + "num_tokens": 2449549659.0, + "step": 4790 + }, + { + "epoch": 1.295565170362358, + "grad_norm": 1.8615745306015015, + "learning_rate": 1.7635126206285937e-05, + "loss": 2.0126, + "mean_token_accuracy": 0.5465253591537476, + "num_tokens": 2450073760.0, + "step": 4791 + }, + { + "epoch": 1.2958355868036777, + "grad_norm": 1.5728070735931396, + "learning_rate": 1.7634060973138564e-05, + "loss": 2.0702, + "mean_token_accuracy": 0.5285540819168091, + "num_tokens": 2450535155.0, + "step": 4792 + }, + { + "epoch": 1.2961060032449974, + "grad_norm": 1.612617015838623, + "learning_rate": 1.763299553643869e-05, + "loss": 1.9823, + "mean_token_accuracy": 0.5618802309036255, + "num_tokens": 2451004330.0, + "step": 4793 + }, + { + "epoch": 1.296376419686317, + "grad_norm": 1.4942245483398438, + "learning_rate": 1.7631929896219008e-05, + "loss": 2.0513, + "mean_token_accuracy": 0.5567830204963684, + "num_tokens": 2451528489.0, + "step": 4794 + }, + { + "epoch": 1.2966468361276364, + "grad_norm": 1.4112578630447388, + "learning_rate": 1.7630864052512213e-05, + "loss": 2.1315, + "mean_token_accuracy": 0.5301719903945923, + "num_tokens": 2452052657.0, + "step": 4795 + }, + { + "epoch": 1.296917252568956, + "grad_norm": 1.3496968746185303, + "learning_rate": 1.7629798005351012e-05, + "loss": 1.9979, + "mean_token_accuracy": 0.542316198348999, + "num_tokens": 2452576934.0, + "step": 4796 + }, + { + "epoch": 1.2971876690102757, + "grad_norm": 1.669338345527649, + "learning_rate": 1.7628731754768106e-05, + "loss": 2.0768, + "mean_token_accuracy": 0.5392572283744812, + "num_tokens": 2453101113.0, + "step": 4797 + }, + { + "epoch": 1.2974580854515954, + "grad_norm": 1.284266471862793, + "learning_rate": 1.7627665300796217e-05, + "loss": 1.8942, + "mean_token_accuracy": 0.5610518455505371, + "num_tokens": 2453615753.0, + "step": 4798 + }, + { + "epoch": 1.297728501892915, + "grad_norm": 1.374779462814331, + "learning_rate": 1.762659864346807e-05, + "loss": 1.8959, + "mean_token_accuracy": 0.5622227191925049, + "num_tokens": 2454139989.0, + "step": 4799 + }, + { + "epoch": 1.2979989183342346, + "grad_norm": 1.680561900138855, + "learning_rate": 1.7625531782816382e-05, + "loss": 2.0584, + "mean_token_accuracy": 0.5323445796966553, + "num_tokens": 2454664242.0, + "step": 4800 + }, + { + "epoch": 1.2982693347755543, + "grad_norm": 0.6348028779029846, + "learning_rate": 1.76244647188739e-05, + "loss": 1.1398, + "mean_token_accuracy": 0.6986052989959717, + "num_tokens": 2455188426.0, + "step": 4801 + }, + { + "epoch": 1.298539751216874, + "grad_norm": 2.193272590637207, + "learning_rate": 1.762339745167336e-05, + "loss": 1.9955, + "mean_token_accuracy": 0.5525067448616028, + "num_tokens": 2455712645.0, + "step": 4802 + }, + { + "epoch": 1.2988101676581936, + "grad_norm": 2.3132545948028564, + "learning_rate": 1.7622329981247505e-05, + "loss": 2.0004, + "mean_token_accuracy": 0.5483641624450684, + "num_tokens": 2456236922.0, + "step": 4803 + }, + { + "epoch": 1.2990805840995132, + "grad_norm": 1.5170502662658691, + "learning_rate": 1.762126230762909e-05, + "loss": 2.0133, + "mean_token_accuracy": 0.5541352033615112, + "num_tokens": 2456742389.0, + "step": 4804 + }, + { + "epoch": 1.2993510005408329, + "grad_norm": 2.466672897338867, + "learning_rate": 1.762019443085088e-05, + "loss": 2.097, + "mean_token_accuracy": 0.5491008758544922, + "num_tokens": 2457266449.0, + "step": 4805 + }, + { + "epoch": 1.2996214169821525, + "grad_norm": 2.8639607429504395, + "learning_rate": 1.761912635094563e-05, + "loss": 2.0781, + "mean_token_accuracy": 0.5416481494903564, + "num_tokens": 2457790713.0, + "step": 4806 + }, + { + "epoch": 1.2998918334234721, + "grad_norm": 2.185854196548462, + "learning_rate": 1.7618058067946125e-05, + "loss": 2.0568, + "mean_token_accuracy": 0.5403463244438171, + "num_tokens": 2458314840.0, + "step": 4807 + }, + { + "epoch": 1.3001622498647918, + "grad_norm": 2.256824254989624, + "learning_rate": 1.761698958188513e-05, + "loss": 2.0728, + "mean_token_accuracy": 0.5485399961471558, + "num_tokens": 2458783096.0, + "step": 4808 + }, + { + "epoch": 1.3004326663061114, + "grad_norm": 2.1264455318450928, + "learning_rate": 1.761592089279544e-05, + "loss": 2.058, + "mean_token_accuracy": 0.5359783172607422, + "num_tokens": 2459307164.0, + "step": 4809 + }, + { + "epoch": 1.300703082747431, + "grad_norm": 1.6436914205551147, + "learning_rate": 1.7614852000709836e-05, + "loss": 2.105, + "mean_token_accuracy": 0.5304353833198547, + "num_tokens": 2459831308.0, + "step": 4810 + }, + { + "epoch": 1.3009734991887507, + "grad_norm": 1.5691213607788086, + "learning_rate": 1.7613782905661124e-05, + "loss": 2.0007, + "mean_token_accuracy": 0.5425588488578796, + "num_tokens": 2460355480.0, + "step": 4811 + }, + { + "epoch": 1.3012439156300704, + "grad_norm": 1.7022438049316406, + "learning_rate": 1.7612713607682098e-05, + "loss": 2.0057, + "mean_token_accuracy": 0.5494626760482788, + "num_tokens": 2460879722.0, + "step": 4812 + }, + { + "epoch": 1.30151433207139, + "grad_norm": 1.4615427255630493, + "learning_rate": 1.761164410680558e-05, + "loss": 2.0634, + "mean_token_accuracy": 0.5421744585037231, + "num_tokens": 2461392742.0, + "step": 4813 + }, + { + "epoch": 1.3017847485127096, + "grad_norm": 1.5943974256515503, + "learning_rate": 1.7610574403064368e-05, + "loss": 1.9701, + "mean_token_accuracy": 0.5461406707763672, + "num_tokens": 2461916966.0, + "step": 4814 + }, + { + "epoch": 1.3020551649540293, + "grad_norm": 1.4183244705200195, + "learning_rate": 1.7609504496491295e-05, + "loss": 1.978, + "mean_token_accuracy": 0.5645722150802612, + "num_tokens": 2462398188.0, + "step": 4815 + }, + { + "epoch": 1.302325581395349, + "grad_norm": 1.3320807218551636, + "learning_rate": 1.7608434387119192e-05, + "loss": 1.9838, + "mean_token_accuracy": 0.5640490055084229, + "num_tokens": 2462911253.0, + "step": 4816 + }, + { + "epoch": 1.3025959978366686, + "grad_norm": 1.3909218311309814, + "learning_rate": 1.7607364074980883e-05, + "loss": 2.1139, + "mean_token_accuracy": 0.5427287220954895, + "num_tokens": 2463435462.0, + "step": 4817 + }, + { + "epoch": 1.3028664142779882, + "grad_norm": 1.4029710292816162, + "learning_rate": 1.7606293560109215e-05, + "loss": 2.0986, + "mean_token_accuracy": 0.524449348449707, + "num_tokens": 2463934710.0, + "step": 4818 + }, + { + "epoch": 1.3031368307193079, + "grad_norm": 1.5302388668060303, + "learning_rate": 1.7605222842537034e-05, + "loss": 2.126, + "mean_token_accuracy": 0.5314812660217285, + "num_tokens": 2464448368.0, + "step": 4819 + }, + { + "epoch": 1.3034072471606275, + "grad_norm": 1.630265235900879, + "learning_rate": 1.7604151922297187e-05, + "loss": 1.9939, + "mean_token_accuracy": 0.5624815225601196, + "num_tokens": 2464972534.0, + "step": 4820 + }, + { + "epoch": 1.303677663601947, + "grad_norm": 0.9476298093795776, + "learning_rate": 1.7603080799422543e-05, + "loss": 1.1768, + "mean_token_accuracy": 0.6818224191665649, + "num_tokens": 2465496725.0, + "step": 4821 + }, + { + "epoch": 1.3039480800432666, + "grad_norm": 2.6586904525756836, + "learning_rate": 1.760200947394596e-05, + "loss": 2.1109, + "mean_token_accuracy": 0.542133092880249, + "num_tokens": 2465982770.0, + "step": 4822 + }, + { + "epoch": 1.3042184964845862, + "grad_norm": 2.001849889755249, + "learning_rate": 1.760093794590031e-05, + "loss": 2.0207, + "mean_token_accuracy": 0.5451302528381348, + "num_tokens": 2466490781.0, + "step": 4823 + }, + { + "epoch": 1.3044889129259059, + "grad_norm": 1.3601115942001343, + "learning_rate": 1.759986621531847e-05, + "loss": 2.0592, + "mean_token_accuracy": 0.5454621315002441, + "num_tokens": 2467000286.0, + "step": 4824 + }, + { + "epoch": 1.3047593293672255, + "grad_norm": 1.7744784355163574, + "learning_rate": 1.7598794282233328e-05, + "loss": 2.0743, + "mean_token_accuracy": 0.5476393699645996, + "num_tokens": 2467524437.0, + "step": 4825 + }, + { + "epoch": 1.3050297458085451, + "grad_norm": 1.8743691444396973, + "learning_rate": 1.759772214667777e-05, + "loss": 2.0217, + "mean_token_accuracy": 0.5353246927261353, + "num_tokens": 2468048629.0, + "step": 4826 + }, + { + "epoch": 1.3053001622498648, + "grad_norm": 1.2216228246688843, + "learning_rate": 1.7596649808684697e-05, + "loss": 1.9845, + "mean_token_accuracy": 0.5561045408248901, + "num_tokens": 2468568581.0, + "step": 4827 + }, + { + "epoch": 1.3055705786911844, + "grad_norm": 1.7640960216522217, + "learning_rate": 1.7595577268287005e-05, + "loss": 1.9981, + "mean_token_accuracy": 0.5463734865188599, + "num_tokens": 2469092839.0, + "step": 4828 + }, + { + "epoch": 1.305840995132504, + "grad_norm": 1.7730884552001953, + "learning_rate": 1.7594504525517607e-05, + "loss": 2.1688, + "mean_token_accuracy": 0.5182303190231323, + "num_tokens": 2469611692.0, + "step": 4829 + }, + { + "epoch": 1.3061114115738237, + "grad_norm": 1.2113248109817505, + "learning_rate": 1.759343158040942e-05, + "loss": 1.9036, + "mean_token_accuracy": 0.5472379922866821, + "num_tokens": 2470135826.0, + "step": 4830 + }, + { + "epoch": 1.3063818280151434, + "grad_norm": 1.7479239702224731, + "learning_rate": 1.7592358432995358e-05, + "loss": 2.0502, + "mean_token_accuracy": 0.5322799682617188, + "num_tokens": 2470659885.0, + "step": 4831 + }, + { + "epoch": 1.306652244456463, + "grad_norm": 1.7714794874191284, + "learning_rate": 1.7591285083308355e-05, + "loss": 2.0458, + "mean_token_accuracy": 0.5397706031799316, + "num_tokens": 2471184117.0, + "step": 4832 + }, + { + "epoch": 1.3069226608977826, + "grad_norm": 1.3881548643112183, + "learning_rate": 1.7590211531381342e-05, + "loss": 2.0723, + "mean_token_accuracy": 0.5457940101623535, + "num_tokens": 2471690828.0, + "step": 4833 + }, + { + "epoch": 1.3071930773391023, + "grad_norm": 1.3903728723526, + "learning_rate": 1.7589137777247253e-05, + "loss": 2.0541, + "mean_token_accuracy": 0.5412148237228394, + "num_tokens": 2472215104.0, + "step": 4834 + }, + { + "epoch": 1.307463493780422, + "grad_norm": 1.552134394645691, + "learning_rate": 1.7588063820939044e-05, + "loss": 2.1161, + "mean_token_accuracy": 0.542771577835083, + "num_tokens": 2472674962.0, + "step": 4835 + }, + { + "epoch": 1.3077339102217413, + "grad_norm": 1.301537036895752, + "learning_rate": 1.758698966248966e-05, + "loss": 1.8707, + "mean_token_accuracy": 0.5601710677146912, + "num_tokens": 2473198247.0, + "step": 4836 + }, + { + "epoch": 1.308004326663061, + "grad_norm": 1.3834574222564697, + "learning_rate": 1.7585915301932068e-05, + "loss": 2.0799, + "mean_token_accuracy": 0.5263025760650635, + "num_tokens": 2473698712.0, + "step": 4837 + }, + { + "epoch": 1.3082747431043806, + "grad_norm": 1.3505339622497559, + "learning_rate": 1.7584840739299224e-05, + "loss": 1.9, + "mean_token_accuracy": 0.5485104322433472, + "num_tokens": 2474222859.0, + "step": 4838 + }, + { + "epoch": 1.3085451595457003, + "grad_norm": 1.5134453773498535, + "learning_rate": 1.7583765974624102e-05, + "loss": 2.0399, + "mean_token_accuracy": 0.5345584750175476, + "num_tokens": 2474711222.0, + "step": 4839 + }, + { + "epoch": 1.30881557598702, + "grad_norm": 1.6217583417892456, + "learning_rate": 1.7582691007939676e-05, + "loss": 2.2766, + "mean_token_accuracy": 0.49324336647987366, + "num_tokens": 2475235401.0, + "step": 4840 + }, + { + "epoch": 1.3090859924283396, + "grad_norm": 0.6693461537361145, + "learning_rate": 1.7581615839278935e-05, + "loss": 1.1239, + "mean_token_accuracy": 0.6958361864089966, + "num_tokens": 2475759679.0, + "step": 4841 + }, + { + "epoch": 1.3093564088696592, + "grad_norm": 2.929454803466797, + "learning_rate": 1.758054046867486e-05, + "loss": 2.1309, + "mean_token_accuracy": 0.525360107421875, + "num_tokens": 2476283950.0, + "step": 4842 + }, + { + "epoch": 1.3096268253109788, + "grad_norm": 2.0300207138061523, + "learning_rate": 1.7579464896160453e-05, + "loss": 2.0002, + "mean_token_accuracy": 0.5471115112304688, + "num_tokens": 2476804593.0, + "step": 4843 + }, + { + "epoch": 1.3098972417522985, + "grad_norm": 1.6085405349731445, + "learning_rate": 1.757838912176872e-05, + "loss": 2.0887, + "mean_token_accuracy": 0.5455636382102966, + "num_tokens": 2477285094.0, + "step": 4844 + }, + { + "epoch": 1.3101676581936181, + "grad_norm": 1.7732959985733032, + "learning_rate": 1.757731314553265e-05, + "loss": 2.0326, + "mean_token_accuracy": 0.5364277958869934, + "num_tokens": 2477809300.0, + "step": 4845 + }, + { + "epoch": 1.3104380746349378, + "grad_norm": 1.503300428390503, + "learning_rate": 1.757623696748528e-05, + "loss": 2.0467, + "mean_token_accuracy": 0.5310791730880737, + "num_tokens": 2478333469.0, + "step": 4846 + }, + { + "epoch": 1.3107084910762574, + "grad_norm": 1.7523974180221558, + "learning_rate": 1.7575160587659622e-05, + "loss": 2.0058, + "mean_token_accuracy": 0.5568664073944092, + "num_tokens": 2478857643.0, + "step": 4847 + }, + { + "epoch": 1.310978907517577, + "grad_norm": 1.4690287113189697, + "learning_rate": 1.7574084006088694e-05, + "loss": 2.1766, + "mean_token_accuracy": 0.530127227306366, + "num_tokens": 2479381823.0, + "step": 4848 + }, + { + "epoch": 1.3112493239588967, + "grad_norm": 1.3285908699035645, + "learning_rate": 1.7573007222805537e-05, + "loss": 1.99, + "mean_token_accuracy": 0.5457006096839905, + "num_tokens": 2479906107.0, + "step": 4849 + }, + { + "epoch": 1.3115197404002163, + "grad_norm": 1.735101580619812, + "learning_rate": 1.757193023784319e-05, + "loss": 2.0683, + "mean_token_accuracy": 0.5438293218612671, + "num_tokens": 2480430282.0, + "step": 4850 + }, + { + "epoch": 1.311790156841536, + "grad_norm": 1.3051586151123047, + "learning_rate": 1.7570853051234698e-05, + "loss": 1.9192, + "mean_token_accuracy": 0.5636597275733948, + "num_tokens": 2480954506.0, + "step": 4851 + }, + { + "epoch": 1.3120605732828556, + "grad_norm": 1.8698303699493408, + "learning_rate": 1.7569775663013112e-05, + "loss": 2.1, + "mean_token_accuracy": 0.5522912740707397, + "num_tokens": 2481426398.0, + "step": 4852 + }, + { + "epoch": 1.3123309897241753, + "grad_norm": 1.535359501838684, + "learning_rate": 1.7568698073211485e-05, + "loss": 2.145, + "mean_token_accuracy": 0.532943069934845, + "num_tokens": 2481950677.0, + "step": 4853 + }, + { + "epoch": 1.312601406165495, + "grad_norm": 1.397147536277771, + "learning_rate": 1.7567620281862884e-05, + "loss": 1.936, + "mean_token_accuracy": 0.5592443943023682, + "num_tokens": 2482474789.0, + "step": 4854 + }, + { + "epoch": 1.3128718226068146, + "grad_norm": 1.7352468967437744, + "learning_rate": 1.7566542289000378e-05, + "loss": 2.1186, + "mean_token_accuracy": 0.5379599332809448, + "num_tokens": 2482998969.0, + "step": 4855 + }, + { + "epoch": 1.3131422390481342, + "grad_norm": 1.5116156339645386, + "learning_rate": 1.7565464094657044e-05, + "loss": 1.9494, + "mean_token_accuracy": 0.5682985782623291, + "num_tokens": 2483523123.0, + "step": 4856 + }, + { + "epoch": 1.3134126554894539, + "grad_norm": 1.7595901489257812, + "learning_rate": 1.7564385698865963e-05, + "loss": 1.8977, + "mean_token_accuracy": 0.602428138256073, + "num_tokens": 2484047344.0, + "step": 4857 + }, + { + "epoch": 1.3136830719307735, + "grad_norm": 1.5776102542877197, + "learning_rate": 1.756330710166023e-05, + "loss": 1.9951, + "mean_token_accuracy": 0.5394470691680908, + "num_tokens": 2484571505.0, + "step": 4858 + }, + { + "epoch": 1.3139534883720931, + "grad_norm": 1.5834364891052246, + "learning_rate": 1.7562228303072928e-05, + "loss": 1.9898, + "mean_token_accuracy": 0.5532965660095215, + "num_tokens": 2485095644.0, + "step": 4859 + }, + { + "epoch": 1.3142239048134128, + "grad_norm": 1.4116204977035522, + "learning_rate": 1.7561149303137165e-05, + "loss": 1.9781, + "mean_token_accuracy": 0.5494928359985352, + "num_tokens": 2485555029.0, + "step": 4860 + }, + { + "epoch": 1.3144943212547324, + "grad_norm": 0.652296245098114, + "learning_rate": 1.7560070101886045e-05, + "loss": 1.1247, + "mean_token_accuracy": 0.7081120014190674, + "num_tokens": 2486079097.0, + "step": 4861 + }, + { + "epoch": 1.3147647376960518, + "grad_norm": 2.2206509113311768, + "learning_rate": 1.7558990699352684e-05, + "loss": 2.0627, + "mean_token_accuracy": 0.5416131615638733, + "num_tokens": 2486603289.0, + "step": 4862 + }, + { + "epoch": 1.3150351541373715, + "grad_norm": 1.9270390272140503, + "learning_rate": 1.7557911095570202e-05, + "loss": 2.2048, + "mean_token_accuracy": 0.5170606374740601, + "num_tokens": 2487062871.0, + "step": 4863 + }, + { + "epoch": 1.3153055705786911, + "grad_norm": 1.3537733554840088, + "learning_rate": 1.7556831290571718e-05, + "loss": 2.084, + "mean_token_accuracy": 0.5284150242805481, + "num_tokens": 2487587070.0, + "step": 4864 + }, + { + "epoch": 1.3155759870200108, + "grad_norm": 1.4552253484725952, + "learning_rate": 1.7555751284390365e-05, + "loss": 2.0174, + "mean_token_accuracy": 0.5354666709899902, + "num_tokens": 2488111160.0, + "step": 4865 + }, + { + "epoch": 1.3158464034613304, + "grad_norm": 1.631115198135376, + "learning_rate": 1.7554671077059287e-05, + "loss": 1.9812, + "mean_token_accuracy": 0.5395016670227051, + "num_tokens": 2488635261.0, + "step": 4866 + }, + { + "epoch": 1.31611681990265, + "grad_norm": 1.7814847230911255, + "learning_rate": 1.7553590668611627e-05, + "loss": 1.9603, + "mean_token_accuracy": 0.5592041015625, + "num_tokens": 2489159540.0, + "step": 4867 + }, + { + "epoch": 1.3163872363439697, + "grad_norm": 1.5615074634552002, + "learning_rate": 1.755251005908053e-05, + "loss": 2.0928, + "mean_token_accuracy": 0.5522782206535339, + "num_tokens": 2489661871.0, + "step": 4868 + }, + { + "epoch": 1.3166576527852893, + "grad_norm": 1.64228355884552, + "learning_rate": 1.7551429248499154e-05, + "loss": 1.8855, + "mean_token_accuracy": 0.5595228672027588, + "num_tokens": 2490186041.0, + "step": 4869 + }, + { + "epoch": 1.316928069226609, + "grad_norm": 2.072061777114868, + "learning_rate": 1.7550348236900663e-05, + "loss": 1.9582, + "mean_token_accuracy": 0.5536239147186279, + "num_tokens": 2490710270.0, + "step": 4870 + }, + { + "epoch": 1.3171984856679286, + "grad_norm": 1.5445773601531982, + "learning_rate": 1.7549267024318222e-05, + "loss": 1.9584, + "mean_token_accuracy": 0.5598617196083069, + "num_tokens": 2491234511.0, + "step": 4871 + }, + { + "epoch": 1.3174689021092483, + "grad_norm": 1.5986902713775635, + "learning_rate": 1.754818561078501e-05, + "loss": 2.1286, + "mean_token_accuracy": 0.5190972089767456, + "num_tokens": 2491758790.0, + "step": 4872 + }, + { + "epoch": 1.317739318550568, + "grad_norm": 1.7090518474578857, + "learning_rate": 1.7547103996334212e-05, + "loss": 2.0332, + "mean_token_accuracy": 0.5475299954414368, + "num_tokens": 2492282963.0, + "step": 4873 + }, + { + "epoch": 1.3180097349918876, + "grad_norm": 1.5236225128173828, + "learning_rate": 1.7546022180999002e-05, + "loss": 1.992, + "mean_token_accuracy": 0.5532733798027039, + "num_tokens": 2492807153.0, + "step": 4874 + }, + { + "epoch": 1.3182801514332072, + "grad_norm": 1.3997042179107666, + "learning_rate": 1.7544940164812586e-05, + "loss": 1.9884, + "mean_token_accuracy": 0.5547584295272827, + "num_tokens": 2493296199.0, + "step": 4875 + }, + { + "epoch": 1.3185505678745268, + "grad_norm": 1.501395583152771, + "learning_rate": 1.7543857947808156e-05, + "loss": 2.0288, + "mean_token_accuracy": 0.534686803817749, + "num_tokens": 2493820452.0, + "step": 4876 + }, + { + "epoch": 1.3188209843158463, + "grad_norm": 1.6336612701416016, + "learning_rate": 1.754277553001892e-05, + "loss": 1.9521, + "mean_token_accuracy": 0.5655786991119385, + "num_tokens": 2494344737.0, + "step": 4877 + }, + { + "epoch": 1.319091400757166, + "grad_norm": 1.27016019821167, + "learning_rate": 1.754169291147809e-05, + "loss": 2.0331, + "mean_token_accuracy": 0.5406823754310608, + "num_tokens": 2494868908.0, + "step": 4878 + }, + { + "epoch": 1.3193618171984856, + "grad_norm": 1.3721282482147217, + "learning_rate": 1.7540610092218883e-05, + "loss": 1.9109, + "mean_token_accuracy": 0.5632239580154419, + "num_tokens": 2495376047.0, + "step": 4879 + }, + { + "epoch": 1.3196322336398052, + "grad_norm": 1.4836276769638062, + "learning_rate": 1.7539527072274524e-05, + "loss": 2.0545, + "mean_token_accuracy": 0.5383539199829102, + "num_tokens": 2495900256.0, + "step": 4880 + }, + { + "epoch": 1.3199026500811248, + "grad_norm": 0.7770049571990967, + "learning_rate": 1.753844385167824e-05, + "loss": 1.1996, + "mean_token_accuracy": 0.6901633739471436, + "num_tokens": 2496424459.0, + "step": 4881 + }, + { + "epoch": 1.3201730665224445, + "grad_norm": 1.8882962465286255, + "learning_rate": 1.753736043046328e-05, + "loss": 2.0527, + "mean_token_accuracy": 0.5254178047180176, + "num_tokens": 2496948724.0, + "step": 4882 + }, + { + "epoch": 1.3204434829637641, + "grad_norm": 1.5579197406768799, + "learning_rate": 1.7536276808662868e-05, + "loss": 1.9957, + "mean_token_accuracy": 0.5716718435287476, + "num_tokens": 2497472979.0, + "step": 4883 + }, + { + "epoch": 1.3207138994050838, + "grad_norm": 1.4413715600967407, + "learning_rate": 1.7535192986310262e-05, + "loss": 2.0592, + "mean_token_accuracy": 0.5415877103805542, + "num_tokens": 2497959791.0, + "step": 4884 + }, + { + "epoch": 1.3209843158464034, + "grad_norm": 1.7118252515792847, + "learning_rate": 1.753410896343872e-05, + "loss": 1.908, + "mean_token_accuracy": 0.5741571187973022, + "num_tokens": 2498484056.0, + "step": 4885 + }, + { + "epoch": 1.321254732287723, + "grad_norm": 1.5240496397018433, + "learning_rate": 1.7533024740081497e-05, + "loss": 2.0543, + "mean_token_accuracy": 0.5456692576408386, + "num_tokens": 2498936667.0, + "step": 4886 + }, + { + "epoch": 1.3215251487290427, + "grad_norm": 1.5800695419311523, + "learning_rate": 1.7531940316271864e-05, + "loss": 1.963, + "mean_token_accuracy": 0.5496904850006104, + "num_tokens": 2499460838.0, + "step": 4887 + }, + { + "epoch": 1.3217955651703623, + "grad_norm": 1.7197067737579346, + "learning_rate": 1.7530855692043094e-05, + "loss": 1.977, + "mean_token_accuracy": 0.5621824264526367, + "num_tokens": 2499984194.0, + "step": 4888 + }, + { + "epoch": 1.322065981611682, + "grad_norm": 1.3059133291244507, + "learning_rate": 1.752977086742846e-05, + "loss": 1.9529, + "mean_token_accuracy": 0.5371871590614319, + "num_tokens": 2500508441.0, + "step": 4889 + }, + { + "epoch": 1.3223363980530016, + "grad_norm": 1.570267677307129, + "learning_rate": 1.752868584246126e-05, + "loss": 1.9705, + "mean_token_accuracy": 0.5583512783050537, + "num_tokens": 2501003578.0, + "step": 4890 + }, + { + "epoch": 1.3226068144943213, + "grad_norm": 1.5278929471969604, + "learning_rate": 1.7527600617174773e-05, + "loss": 2.1291, + "mean_token_accuracy": 0.534473717212677, + "num_tokens": 2501527848.0, + "step": 4891 + }, + { + "epoch": 1.322877230935641, + "grad_norm": 1.4633169174194336, + "learning_rate": 1.752651519160231e-05, + "loss": 1.9937, + "mean_token_accuracy": 0.5594152212142944, + "num_tokens": 2502047226.0, + "step": 4892 + }, + { + "epoch": 1.3231476473769606, + "grad_norm": 1.3588085174560547, + "learning_rate": 1.7525429565777157e-05, + "loss": 2.0446, + "mean_token_accuracy": 0.5321169495582581, + "num_tokens": 2502571396.0, + "step": 4893 + }, + { + "epoch": 1.3234180638182802, + "grad_norm": 1.5619605779647827, + "learning_rate": 1.752434373973264e-05, + "loss": 1.799, + "mean_token_accuracy": 0.5826956629753113, + "num_tokens": 2503078485.0, + "step": 4894 + }, + { + "epoch": 1.3236884802595998, + "grad_norm": 1.59880793094635, + "learning_rate": 1.7523257713502074e-05, + "loss": 1.9341, + "mean_token_accuracy": 0.561991274356842, + "num_tokens": 2503602529.0, + "step": 4895 + }, + { + "epoch": 1.3239588967009195, + "grad_norm": 2.2254722118377686, + "learning_rate": 1.7522171487118772e-05, + "loss": 1.8627, + "mean_token_accuracy": 0.5795630216598511, + "num_tokens": 2504126694.0, + "step": 4896 + }, + { + "epoch": 1.3242293131422391, + "grad_norm": 1.6779366731643677, + "learning_rate": 1.7521085060616073e-05, + "loss": 1.9337, + "mean_token_accuracy": 0.5717435479164124, + "num_tokens": 2504643247.0, + "step": 4897 + }, + { + "epoch": 1.3244997295835588, + "grad_norm": 1.4812058210372925, + "learning_rate": 1.7519998434027305e-05, + "loss": 1.9921, + "mean_token_accuracy": 0.5532727837562561, + "num_tokens": 2505154874.0, + "step": 4898 + }, + { + "epoch": 1.3247701460248784, + "grad_norm": 1.5195122957229614, + "learning_rate": 1.751891160738581e-05, + "loss": 1.9662, + "mean_token_accuracy": 0.5516510009765625, + "num_tokens": 2505673553.0, + "step": 4899 + }, + { + "epoch": 1.325040562466198, + "grad_norm": 1.4735568761825562, + "learning_rate": 1.7517824580724937e-05, + "loss": 2.0175, + "mean_token_accuracy": 0.5511135458946228, + "num_tokens": 2506197801.0, + "step": 4900 + }, + { + "epoch": 1.3253109789075177, + "grad_norm": 0.690053403377533, + "learning_rate": 1.751673735407804e-05, + "loss": 1.0799, + "mean_token_accuracy": 0.7111856937408447, + "num_tokens": 2506697403.0, + "step": 4901 + }, + { + "epoch": 1.3255813953488373, + "grad_norm": 2.4455511569976807, + "learning_rate": 1.7515649927478475e-05, + "loss": 2.1211, + "mean_token_accuracy": 0.5406408309936523, + "num_tokens": 2507212739.0, + "step": 4902 + }, + { + "epoch": 1.3258518117901568, + "grad_norm": 1.9117882251739502, + "learning_rate": 1.7514562300959608e-05, + "loss": 2.072, + "mean_token_accuracy": 0.5310617089271545, + "num_tokens": 2507736843.0, + "step": 4903 + }, + { + "epoch": 1.3261222282314764, + "grad_norm": 1.400511622428894, + "learning_rate": 1.751347447455481e-05, + "loss": 1.9566, + "mean_token_accuracy": 0.5460039377212524, + "num_tokens": 2508261096.0, + "step": 4904 + }, + { + "epoch": 1.326392644672796, + "grad_norm": 1.4138410091400146, + "learning_rate": 1.7512386448297463e-05, + "loss": 2.1199, + "mean_token_accuracy": 0.5258291959762573, + "num_tokens": 2508785338.0, + "step": 4905 + }, + { + "epoch": 1.3266630611141157, + "grad_norm": 1.388967514038086, + "learning_rate": 1.7511298222220948e-05, + "loss": 1.9522, + "mean_token_accuracy": 0.5735567808151245, + "num_tokens": 2509309474.0, + "step": 4906 + }, + { + "epoch": 1.3269334775554353, + "grad_norm": 1.640437126159668, + "learning_rate": 1.7510209796358654e-05, + "loss": 2.2252, + "mean_token_accuracy": 0.5110163688659668, + "num_tokens": 2509827149.0, + "step": 4907 + }, + { + "epoch": 1.327203893996755, + "grad_norm": 1.50986647605896, + "learning_rate": 1.750912117074398e-05, + "loss": 1.8675, + "mean_token_accuracy": 0.5547894239425659, + "num_tokens": 2510351261.0, + "step": 4908 + }, + { + "epoch": 1.3274743104380746, + "grad_norm": 1.633939266204834, + "learning_rate": 1.7508032345410325e-05, + "loss": 2.0033, + "mean_token_accuracy": 0.5471770763397217, + "num_tokens": 2510875481.0, + "step": 4909 + }, + { + "epoch": 1.3277447268793943, + "grad_norm": 1.8263182640075684, + "learning_rate": 1.7506943320391098e-05, + "loss": 2.1141, + "mean_token_accuracy": 0.5362635850906372, + "num_tokens": 2511399740.0, + "step": 4910 + }, + { + "epoch": 1.328015143320714, + "grad_norm": 1.304989218711853, + "learning_rate": 1.7505854095719716e-05, + "loss": 2.1541, + "mean_token_accuracy": 0.5170522332191467, + "num_tokens": 2511923859.0, + "step": 4911 + }, + { + "epoch": 1.3282855597620336, + "grad_norm": 1.5098658800125122, + "learning_rate": 1.7504764671429597e-05, + "loss": 1.9949, + "mean_token_accuracy": 0.5449013710021973, + "num_tokens": 2512448057.0, + "step": 4912 + }, + { + "epoch": 1.3285559762033532, + "grad_norm": 1.5140676498413086, + "learning_rate": 1.750367504755417e-05, + "loss": 1.9299, + "mean_token_accuracy": 0.5655965805053711, + "num_tokens": 2512920998.0, + "step": 4913 + }, + { + "epoch": 1.3288263926446728, + "grad_norm": 1.3388618230819702, + "learning_rate": 1.7502585224126862e-05, + "loss": 2.027, + "mean_token_accuracy": 0.5390554070472717, + "num_tokens": 2513445192.0, + "step": 4914 + }, + { + "epoch": 1.3290968090859925, + "grad_norm": 1.4931743144989014, + "learning_rate": 1.750149520118112e-05, + "loss": 1.9981, + "mean_token_accuracy": 0.5586786270141602, + "num_tokens": 2513908298.0, + "step": 4915 + }, + { + "epoch": 1.3293672255273121, + "grad_norm": 1.1052695512771606, + "learning_rate": 1.7500404978750386e-05, + "loss": 1.9545, + "mean_token_accuracy": 0.5643424987792969, + "num_tokens": 2514382618.0, + "step": 4916 + }, + { + "epoch": 1.3296376419686318, + "grad_norm": 1.7583694458007812, + "learning_rate": 1.749931455686811e-05, + "loss": 2.0257, + "mean_token_accuracy": 0.5541200637817383, + "num_tokens": 2514906812.0, + "step": 4917 + }, + { + "epoch": 1.3299080584099512, + "grad_norm": 1.483730435371399, + "learning_rate": 1.7498223935567752e-05, + "loss": 2.0277, + "mean_token_accuracy": 0.549447774887085, + "num_tokens": 2515431084.0, + "step": 4918 + }, + { + "epoch": 1.3301784748512708, + "grad_norm": 1.3107355833053589, + "learning_rate": 1.749713311488277e-05, + "loss": 2.0557, + "mean_token_accuracy": 0.5256011486053467, + "num_tokens": 2515955271.0, + "step": 4919 + }, + { + "epoch": 1.3304488912925905, + "grad_norm": 1.707115650177002, + "learning_rate": 1.7496042094846638e-05, + "loss": 2.1193, + "mean_token_accuracy": 0.5357591509819031, + "num_tokens": 2516479432.0, + "step": 4920 + }, + { + "epoch": 1.3307193077339101, + "grad_norm": 0.5748270750045776, + "learning_rate": 1.7494950875492832e-05, + "loss": 1.1272, + "mean_token_accuracy": 0.7074249982833862, + "num_tokens": 2517003625.0, + "step": 4921 + }, + { + "epoch": 1.3309897241752298, + "grad_norm": 1.80390202999115, + "learning_rate": 1.7493859456854833e-05, + "loss": 2.0606, + "mean_token_accuracy": 0.5794695615768433, + "num_tokens": 2517453770.0, + "step": 4922 + }, + { + "epoch": 1.3312601406165494, + "grad_norm": 1.9826126098632812, + "learning_rate": 1.7492767838966127e-05, + "loss": 2.0764, + "mean_token_accuracy": 0.5543839931488037, + "num_tokens": 2517978048.0, + "step": 4923 + }, + { + "epoch": 1.331530557057869, + "grad_norm": 1.4059150218963623, + "learning_rate": 1.749167602186021e-05, + "loss": 1.9156, + "mean_token_accuracy": 0.5551820993423462, + "num_tokens": 2518502310.0, + "step": 4924 + }, + { + "epoch": 1.3318009734991887, + "grad_norm": 1.6982405185699463, + "learning_rate": 1.7490584005570584e-05, + "loss": 2.1476, + "mean_token_accuracy": 0.5406014919281006, + "num_tokens": 2518985450.0, + "step": 4925 + }, + { + "epoch": 1.3320713899405083, + "grad_norm": 1.5657418966293335, + "learning_rate": 1.7489491790130752e-05, + "loss": 1.9955, + "mean_token_accuracy": 0.555267333984375, + "num_tokens": 2519497736.0, + "step": 4926 + }, + { + "epoch": 1.332341806381828, + "grad_norm": 1.736926794052124, + "learning_rate": 1.7488399375574227e-05, + "loss": 1.9026, + "mean_token_accuracy": 0.5703638195991516, + "num_tokens": 2520021973.0, + "step": 4927 + }, + { + "epoch": 1.3326122228231476, + "grad_norm": 1.6463701725006104, + "learning_rate": 1.748730676193453e-05, + "loss": 2.0619, + "mean_token_accuracy": 0.5520510673522949, + "num_tokens": 2520546151.0, + "step": 4928 + }, + { + "epoch": 1.3328826392644673, + "grad_norm": 1.6417970657348633, + "learning_rate": 1.7486213949245183e-05, + "loss": 1.8514, + "mean_token_accuracy": 0.549458920955658, + "num_tokens": 2521025171.0, + "step": 4929 + }, + { + "epoch": 1.333153055705787, + "grad_norm": 1.9658340215682983, + "learning_rate": 1.7485120937539713e-05, + "loss": 1.9209, + "mean_token_accuracy": 0.5731545090675354, + "num_tokens": 2521499900.0, + "step": 4930 + }, + { + "epoch": 1.3334234721471065, + "grad_norm": 1.448602557182312, + "learning_rate": 1.748402772685167e-05, + "loss": 1.9686, + "mean_token_accuracy": 0.5511891841888428, + "num_tokens": 2522024137.0, + "step": 4931 + }, + { + "epoch": 1.3336938885884262, + "grad_norm": 1.876949429512024, + "learning_rate": 1.748293431721458e-05, + "loss": 2.1034, + "mean_token_accuracy": 0.5191514492034912, + "num_tokens": 2522491535.0, + "step": 4932 + }, + { + "epoch": 1.3339643050297458, + "grad_norm": 1.9465596675872803, + "learning_rate": 1.7481840708662003e-05, + "loss": 1.8981, + "mean_token_accuracy": 0.5635442733764648, + "num_tokens": 2523015688.0, + "step": 4933 + }, + { + "epoch": 1.3342347214710655, + "grad_norm": 1.2437585592269897, + "learning_rate": 1.748074690122749e-05, + "loss": 2.0813, + "mean_token_accuracy": 0.5446652173995972, + "num_tokens": 2523539708.0, + "step": 4934 + }, + { + "epoch": 1.3345051379123851, + "grad_norm": 1.6647032499313354, + "learning_rate": 1.7479652894944607e-05, + "loss": 2.0384, + "mean_token_accuracy": 0.5437349081039429, + "num_tokens": 2524063953.0, + "step": 4935 + }, + { + "epoch": 1.3347755543537048, + "grad_norm": 1.7971527576446533, + "learning_rate": 1.7478558689846913e-05, + "loss": 1.9145, + "mean_token_accuracy": 0.5712662935256958, + "num_tokens": 2524551546.0, + "step": 4936 + }, + { + "epoch": 1.3350459707950244, + "grad_norm": 1.2341923713684082, + "learning_rate": 1.7477464285967987e-05, + "loss": 1.8984, + "mean_token_accuracy": 0.5601431131362915, + "num_tokens": 2525075732.0, + "step": 4937 + }, + { + "epoch": 1.335316387236344, + "grad_norm": 1.7647515535354614, + "learning_rate": 1.747636968334141e-05, + "loss": 2.0279, + "mean_token_accuracy": 0.5364463329315186, + "num_tokens": 2525599962.0, + "step": 4938 + }, + { + "epoch": 1.3355868036776637, + "grad_norm": 1.670804500579834, + "learning_rate": 1.7475274882000762e-05, + "loss": 2.1077, + "mean_token_accuracy": 0.5345771908760071, + "num_tokens": 2526124100.0, + "step": 4939 + }, + { + "epoch": 1.3358572201189833, + "grad_norm": 1.3474845886230469, + "learning_rate": 1.747417988197964e-05, + "loss": 1.9429, + "mean_token_accuracy": 0.5737753510475159, + "num_tokens": 2526648173.0, + "step": 4940 + }, + { + "epoch": 1.336127636560303, + "grad_norm": 0.681380569934845, + "learning_rate": 1.7473084683311646e-05, + "loss": 1.0832, + "mean_token_accuracy": 0.708467960357666, + "num_tokens": 2527172415.0, + "step": 4941 + }, + { + "epoch": 1.3363980530016226, + "grad_norm": 2.8335249423980713, + "learning_rate": 1.747198928603037e-05, + "loss": 1.9561, + "mean_token_accuracy": 0.5552101135253906, + "num_tokens": 2527660672.0, + "step": 4942 + }, + { + "epoch": 1.3366684694429423, + "grad_norm": 2.358412981033325, + "learning_rate": 1.747089369016943e-05, + "loss": 1.9377, + "mean_token_accuracy": 0.5430810451507568, + "num_tokens": 2528142614.0, + "step": 4943 + }, + { + "epoch": 1.3369388858842617, + "grad_norm": 1.4883548021316528, + "learning_rate": 1.7469797895762443e-05, + "loss": 2.0309, + "mean_token_accuracy": 0.561995267868042, + "num_tokens": 2528617464.0, + "step": 4944 + }, + { + "epoch": 1.3372093023255813, + "grad_norm": 1.935278058052063, + "learning_rate": 1.7468701902843034e-05, + "loss": 1.9377, + "mean_token_accuracy": 0.5601698756217957, + "num_tokens": 2529141584.0, + "step": 4945 + }, + { + "epoch": 1.337479718766901, + "grad_norm": 2.3343772888183594, + "learning_rate": 1.7467605711444825e-05, + "loss": 1.9957, + "mean_token_accuracy": 0.5586506128311157, + "num_tokens": 2529640672.0, + "step": 4946 + }, + { + "epoch": 1.3377501352082206, + "grad_norm": 2.089803695678711, + "learning_rate": 1.746650932160145e-05, + "loss": 2.1012, + "mean_token_accuracy": 0.5388859510421753, + "num_tokens": 2530160911.0, + "step": 4947 + }, + { + "epoch": 1.3380205516495403, + "grad_norm": 1.3754042387008667, + "learning_rate": 1.7465412733346557e-05, + "loss": 2.0676, + "mean_token_accuracy": 0.5391139984130859, + "num_tokens": 2530685063.0, + "step": 4948 + }, + { + "epoch": 1.33829096809086, + "grad_norm": 1.7531355619430542, + "learning_rate": 1.7464315946713783e-05, + "loss": 2.0132, + "mean_token_accuracy": 0.5334245562553406, + "num_tokens": 2531193417.0, + "step": 4949 + }, + { + "epoch": 1.3385613845321795, + "grad_norm": 2.3858723640441895, + "learning_rate": 1.746321896173679e-05, + "loss": 2.0887, + "mean_token_accuracy": 0.5400469303131104, + "num_tokens": 2531717656.0, + "step": 4950 + }, + { + "epoch": 1.3388318009734992, + "grad_norm": 1.6632970571517944, + "learning_rate": 1.746212177844923e-05, + "loss": 1.957, + "mean_token_accuracy": 0.558141827583313, + "num_tokens": 2532225139.0, + "step": 4951 + }, + { + "epoch": 1.3391022174148188, + "grad_norm": 1.7264076471328735, + "learning_rate": 1.746102439688477e-05, + "loss": 1.8285, + "mean_token_accuracy": 0.5665947794914246, + "num_tokens": 2532749347.0, + "step": 4952 + }, + { + "epoch": 1.3393726338561385, + "grad_norm": 2.2960758209228516, + "learning_rate": 1.7459926817077078e-05, + "loss": 1.9953, + "mean_token_accuracy": 0.566717267036438, + "num_tokens": 2533273496.0, + "step": 4953 + }, + { + "epoch": 1.3396430502974581, + "grad_norm": 1.6879713535308838, + "learning_rate": 1.7458829039059836e-05, + "loss": 1.9763, + "mean_token_accuracy": 0.5473943948745728, + "num_tokens": 2533797589.0, + "step": 4954 + }, + { + "epoch": 1.3399134667387778, + "grad_norm": 1.3951698541641235, + "learning_rate": 1.7457731062866726e-05, + "loss": 1.9609, + "mean_token_accuracy": 0.5406251549720764, + "num_tokens": 2534321863.0, + "step": 4955 + }, + { + "epoch": 1.3401838831800974, + "grad_norm": 1.628720760345459, + "learning_rate": 1.7456632888531435e-05, + "loss": 2.034, + "mean_token_accuracy": 0.5248085260391235, + "num_tokens": 2534846107.0, + "step": 4956 + }, + { + "epoch": 1.340454299621417, + "grad_norm": 1.5931271314620972, + "learning_rate": 1.7455534516087663e-05, + "loss": 2.0885, + "mean_token_accuracy": 0.5363514423370361, + "num_tokens": 2535325960.0, + "step": 4957 + }, + { + "epoch": 1.3407247160627367, + "grad_norm": 1.5876165628433228, + "learning_rate": 1.7454435945569102e-05, + "loss": 2.0817, + "mean_token_accuracy": 0.5295777320861816, + "num_tokens": 2535850119.0, + "step": 4958 + }, + { + "epoch": 1.340995132504056, + "grad_norm": 1.592183232307434, + "learning_rate": 1.7453337177009465e-05, + "loss": 2.0417, + "mean_token_accuracy": 0.539673924446106, + "num_tokens": 2536374378.0, + "step": 4959 + }, + { + "epoch": 1.3412655489453758, + "grad_norm": 1.4035683870315552, + "learning_rate": 1.7452238210442472e-05, + "loss": 2.1209, + "mean_token_accuracy": 0.5151688456535339, + "num_tokens": 2536898435.0, + "step": 4960 + }, + { + "epoch": 1.3415359653866954, + "grad_norm": 0.6568558812141418, + "learning_rate": 1.745113904590183e-05, + "loss": 1.1387, + "mean_token_accuracy": 0.6870698928833008, + "num_tokens": 2537415642.0, + "step": 4961 + }, + { + "epoch": 1.341806381828015, + "grad_norm": 2.454550266265869, + "learning_rate": 1.745003968342127e-05, + "loss": 1.9963, + "mean_token_accuracy": 0.5410627722740173, + "num_tokens": 2537880864.0, + "step": 4962 + }, + { + "epoch": 1.3420767982693347, + "grad_norm": 1.9588749408721924, + "learning_rate": 1.7448940123034532e-05, + "loss": 2.1145, + "mean_token_accuracy": 0.5504863858222961, + "num_tokens": 2538365049.0, + "step": 4963 + }, + { + "epoch": 1.3423472147106543, + "grad_norm": 1.7491750717163086, + "learning_rate": 1.7447840364775337e-05, + "loss": 2.1216, + "mean_token_accuracy": 0.5432203412055969, + "num_tokens": 2538889325.0, + "step": 4964 + }, + { + "epoch": 1.342617631151974, + "grad_norm": 1.571528434753418, + "learning_rate": 1.7446740408677442e-05, + "loss": 2.034, + "mean_token_accuracy": 0.5391647815704346, + "num_tokens": 2539413604.0, + "step": 4965 + }, + { + "epoch": 1.3428880475932936, + "grad_norm": 1.682259440422058, + "learning_rate": 1.7445640254774592e-05, + "loss": 1.9173, + "mean_token_accuracy": 0.5592532753944397, + "num_tokens": 2539937844.0, + "step": 4966 + }, + { + "epoch": 1.3431584640346133, + "grad_norm": 1.6911379098892212, + "learning_rate": 1.744453990310054e-05, + "loss": 2.0727, + "mean_token_accuracy": 0.5287520885467529, + "num_tokens": 2540462118.0, + "step": 4967 + }, + { + "epoch": 1.343428880475933, + "grad_norm": 1.18901526927948, + "learning_rate": 1.7443439353689056e-05, + "loss": 1.8729, + "mean_token_accuracy": 0.5847688317298889, + "num_tokens": 2540986132.0, + "step": 4968 + }, + { + "epoch": 1.3436992969172525, + "grad_norm": 1.5549001693725586, + "learning_rate": 1.74423386065739e-05, + "loss": 1.9457, + "mean_token_accuracy": 0.563720703125, + "num_tokens": 2541467570.0, + "step": 4969 + }, + { + "epoch": 1.3439697133585722, + "grad_norm": 1.238033413887024, + "learning_rate": 1.7441237661788857e-05, + "loss": 1.969, + "mean_token_accuracy": 0.5539443492889404, + "num_tokens": 2541894967.0, + "step": 4970 + }, + { + "epoch": 1.3442401297998918, + "grad_norm": 1.7692372798919678, + "learning_rate": 1.7440136519367695e-05, + "loss": 2.0278, + "mean_token_accuracy": 0.5403362512588501, + "num_tokens": 2542419181.0, + "step": 4971 + }, + { + "epoch": 1.3445105462412115, + "grad_norm": 1.8221948146820068, + "learning_rate": 1.7439035179344203e-05, + "loss": 2.0472, + "mean_token_accuracy": 0.5466769933700562, + "num_tokens": 2542943373.0, + "step": 4972 + }, + { + "epoch": 1.3447809626825311, + "grad_norm": 1.3120173215866089, + "learning_rate": 1.743793364175218e-05, + "loss": 2.0633, + "mean_token_accuracy": 0.5421625971794128, + "num_tokens": 2543467595.0, + "step": 4973 + }, + { + "epoch": 1.3450513791238508, + "grad_norm": 1.3326548337936401, + "learning_rate": 1.743683190662542e-05, + "loss": 2.0684, + "mean_token_accuracy": 0.5315398573875427, + "num_tokens": 2543991793.0, + "step": 4974 + }, + { + "epoch": 1.3453217955651704, + "grad_norm": 1.2441884279251099, + "learning_rate": 1.7435729973997723e-05, + "loss": 1.9087, + "mean_token_accuracy": 0.5616757869720459, + "num_tokens": 2544515976.0, + "step": 4975 + }, + { + "epoch": 1.34559221200649, + "grad_norm": 1.1567332744598389, + "learning_rate": 1.7434627843902908e-05, + "loss": 2.0653, + "mean_token_accuracy": 0.5506328344345093, + "num_tokens": 2544980339.0, + "step": 4976 + }, + { + "epoch": 1.3458626284478097, + "grad_norm": 1.2525734901428223, + "learning_rate": 1.7433525516374788e-05, + "loss": 2.0181, + "mean_token_accuracy": 0.5615054368972778, + "num_tokens": 2545504521.0, + "step": 4977 + }, + { + "epoch": 1.3461330448891293, + "grad_norm": 1.2393501996994019, + "learning_rate": 1.7432422991447185e-05, + "loss": 1.9306, + "mean_token_accuracy": 0.5532852411270142, + "num_tokens": 2546028795.0, + "step": 4978 + }, + { + "epoch": 1.346403461330449, + "grad_norm": 1.3001971244812012, + "learning_rate": 1.7431320269153926e-05, + "loss": 1.9928, + "mean_token_accuracy": 0.5565764904022217, + "num_tokens": 2546552712.0, + "step": 4979 + }, + { + "epoch": 1.3466738777717686, + "grad_norm": 1.283370852470398, + "learning_rate": 1.743021734952885e-05, + "loss": 1.9682, + "mean_token_accuracy": 0.5451153516769409, + "num_tokens": 2547076993.0, + "step": 4980 + }, + { + "epoch": 1.3469442942130883, + "grad_norm": 0.8508799076080322, + "learning_rate": 1.74291142326058e-05, + "loss": 1.2519, + "mean_token_accuracy": 0.6717885136604309, + "num_tokens": 2547591386.0, + "step": 4981 + }, + { + "epoch": 1.347214710654408, + "grad_norm": 1.623579740524292, + "learning_rate": 1.742801091841861e-05, + "loss": 2.0149, + "mean_token_accuracy": 0.5583646893501282, + "num_tokens": 2548115565.0, + "step": 4982 + }, + { + "epoch": 1.3474851270957275, + "grad_norm": 1.2973928451538086, + "learning_rate": 1.7426907407001147e-05, + "loss": 1.9612, + "mean_token_accuracy": 0.5693615078926086, + "num_tokens": 2548639644.0, + "step": 4983 + }, + { + "epoch": 1.3477555435370472, + "grad_norm": 1.2647209167480469, + "learning_rate": 1.7425803698387264e-05, + "loss": 1.9622, + "mean_token_accuracy": 0.5591353178024292, + "num_tokens": 2549128370.0, + "step": 4984 + }, + { + "epoch": 1.3480259599783666, + "grad_norm": 1.6074351072311401, + "learning_rate": 1.7424699792610823e-05, + "loss": 2.0926, + "mean_token_accuracy": 0.5493360757827759, + "num_tokens": 2549652648.0, + "step": 4985 + }, + { + "epoch": 1.3482963764196862, + "grad_norm": 1.5638588666915894, + "learning_rate": 1.7423595689705698e-05, + "loss": 2.0858, + "mean_token_accuracy": 0.5407059192657471, + "num_tokens": 2550159750.0, + "step": 4986 + }, + { + "epoch": 1.348566792861006, + "grad_norm": 1.8617568016052246, + "learning_rate": 1.7422491389705775e-05, + "loss": 2.0373, + "mean_token_accuracy": 0.566076397895813, + "num_tokens": 2550684029.0, + "step": 4987 + }, + { + "epoch": 1.3488372093023255, + "grad_norm": 1.4158546924591064, + "learning_rate": 1.7421386892644923e-05, + "loss": 2.1, + "mean_token_accuracy": 0.5399751663208008, + "num_tokens": 2551145579.0, + "step": 4988 + }, + { + "epoch": 1.3491076257436452, + "grad_norm": 1.5690134763717651, + "learning_rate": 1.7420282198557036e-05, + "loss": 2.1418, + "mean_token_accuracy": 0.5272774696350098, + "num_tokens": 2551643112.0, + "step": 4989 + }, + { + "epoch": 1.3493780421849648, + "grad_norm": 1.6740291118621826, + "learning_rate": 1.741917730747601e-05, + "loss": 2.0488, + "mean_token_accuracy": 0.5411903262138367, + "num_tokens": 2552167318.0, + "step": 4990 + }, + { + "epoch": 1.3496484586262845, + "grad_norm": 1.4733628034591675, + "learning_rate": 1.741807221943575e-05, + "loss": 2.0382, + "mean_token_accuracy": 0.5435714721679688, + "num_tokens": 2552662286.0, + "step": 4991 + }, + { + "epoch": 1.349918875067604, + "grad_norm": 1.4221537113189697, + "learning_rate": 1.7416966934470156e-05, + "loss": 1.8927, + "mean_token_accuracy": 0.5413861274719238, + "num_tokens": 2553186501.0, + "step": 4992 + }, + { + "epoch": 1.3501892915089238, + "grad_norm": 1.9062082767486572, + "learning_rate": 1.7415861452613147e-05, + "loss": 1.9834, + "mean_token_accuracy": 0.5591051578521729, + "num_tokens": 2553692136.0, + "step": 4993 + }, + { + "epoch": 1.3504597079502434, + "grad_norm": 1.1939380168914795, + "learning_rate": 1.741475577389864e-05, + "loss": 2.0276, + "mean_token_accuracy": 0.5422539710998535, + "num_tokens": 2554216272.0, + "step": 4994 + }, + { + "epoch": 1.350730124391563, + "grad_norm": 1.8636542558670044, + "learning_rate": 1.741364989836057e-05, + "loss": 1.9505, + "mean_token_accuracy": 0.5689613223075867, + "num_tokens": 2554647442.0, + "step": 4995 + }, + { + "epoch": 1.3510005408328827, + "grad_norm": 1.2304221391677856, + "learning_rate": 1.741254382603285e-05, + "loss": 1.8675, + "mean_token_accuracy": 0.5686643123626709, + "num_tokens": 2555171714.0, + "step": 4996 + }, + { + "epoch": 1.3512709572742023, + "grad_norm": 1.4651373624801636, + "learning_rate": 1.741143755694943e-05, + "loss": 2.0543, + "mean_token_accuracy": 0.5435194969177246, + "num_tokens": 2555695758.0, + "step": 4997 + }, + { + "epoch": 1.351541373715522, + "grad_norm": 1.6921253204345703, + "learning_rate": 1.741033109114425e-05, + "loss": 2.1006, + "mean_token_accuracy": 0.5400957465171814, + "num_tokens": 2556165538.0, + "step": 4998 + }, + { + "epoch": 1.3518117901568416, + "grad_norm": 1.133552074432373, + "learning_rate": 1.740922442865126e-05, + "loss": 2.0207, + "mean_token_accuracy": 0.5642849206924438, + "num_tokens": 2556689753.0, + "step": 4999 + }, + { + "epoch": 1.3520822065981613, + "grad_norm": 1.4726752042770386, + "learning_rate": 1.7408117569504418e-05, + "loss": 2.0417, + "mean_token_accuracy": 0.5389357805252075, + "num_tokens": 2557213946.0, + "step": 5000 + }, + { + "epoch": 1.3523526230394807, + "grad_norm": 0.7903743386268616, + "learning_rate": 1.7407010513737685e-05, + "loss": 1.1569, + "mean_token_accuracy": 0.6865098476409912, + "num_tokens": 2557738117.0, + "step": 5001 + }, + { + "epoch": 1.3526230394808003, + "grad_norm": 1.8707449436187744, + "learning_rate": 1.7405903261385023e-05, + "loss": 2.0368, + "mean_token_accuracy": 0.5402424335479736, + "num_tokens": 2558262289.0, + "step": 5002 + }, + { + "epoch": 1.35289345592212, + "grad_norm": 1.3775049448013306, + "learning_rate": 1.7404795812480416e-05, + "loss": 2.0007, + "mean_token_accuracy": 0.5480746030807495, + "num_tokens": 2558786557.0, + "step": 5003 + }, + { + "epoch": 1.3531638723634396, + "grad_norm": 1.2022298574447632, + "learning_rate": 1.7403688167057836e-05, + "loss": 2.0514, + "mean_token_accuracy": 0.5380784869194031, + "num_tokens": 2559310823.0, + "step": 5004 + }, + { + "epoch": 1.3534342888047592, + "grad_norm": 1.2598820924758911, + "learning_rate": 1.740258032515127e-05, + "loss": 2.1852, + "mean_token_accuracy": 0.5219277143478394, + "num_tokens": 2559777391.0, + "step": 5005 + }, + { + "epoch": 1.3537047052460789, + "grad_norm": 1.3092893362045288, + "learning_rate": 1.7401472286794713e-05, + "loss": 1.9827, + "mean_token_accuracy": 0.5500802993774414, + "num_tokens": 2560301579.0, + "step": 5006 + }, + { + "epoch": 1.3539751216873985, + "grad_norm": 1.2289798259735107, + "learning_rate": 1.740036405202216e-05, + "loss": 2.025, + "mean_token_accuracy": 0.5353416800498962, + "num_tokens": 2560825796.0, + "step": 5007 + }, + { + "epoch": 1.3542455381287182, + "grad_norm": 1.348983883857727, + "learning_rate": 1.7399255620867613e-05, + "loss": 2.0125, + "mean_token_accuracy": 0.5412309169769287, + "num_tokens": 2561350045.0, + "step": 5008 + }, + { + "epoch": 1.3545159545700378, + "grad_norm": 1.1131889820098877, + "learning_rate": 1.739814699336509e-05, + "loss": 1.8493, + "mean_token_accuracy": 0.5636811256408691, + "num_tokens": 2561874296.0, + "step": 5009 + }, + { + "epoch": 1.3547863710113575, + "grad_norm": 1.5677567720413208, + "learning_rate": 1.7397038169548593e-05, + "loss": 2.0789, + "mean_token_accuracy": 0.5249647498130798, + "num_tokens": 2562398441.0, + "step": 5010 + }, + { + "epoch": 1.355056787452677, + "grad_norm": 1.4632996320724487, + "learning_rate": 1.739592914945216e-05, + "loss": 2.1207, + "mean_token_accuracy": 0.5311435461044312, + "num_tokens": 2562922712.0, + "step": 5011 + }, + { + "epoch": 1.3553272038939967, + "grad_norm": 1.1882209777832031, + "learning_rate": 1.739481993310981e-05, + "loss": 1.9559, + "mean_token_accuracy": 0.555939793586731, + "num_tokens": 2563446926.0, + "step": 5012 + }, + { + "epoch": 1.3555976203353164, + "grad_norm": 1.4281816482543945, + "learning_rate": 1.7393710520555577e-05, + "loss": 2.0218, + "mean_token_accuracy": 0.5508534908294678, + "num_tokens": 2563964836.0, + "step": 5013 + }, + { + "epoch": 1.355868036776636, + "grad_norm": 1.1620286703109741, + "learning_rate": 1.7392600911823504e-05, + "loss": 1.8526, + "mean_token_accuracy": 0.5655941963195801, + "num_tokens": 2564489007.0, + "step": 5014 + }, + { + "epoch": 1.3561384532179557, + "grad_norm": 1.3037456274032593, + "learning_rate": 1.7391491106947633e-05, + "loss": 2.0267, + "mean_token_accuracy": 0.5351701974868774, + "num_tokens": 2565013148.0, + "step": 5015 + }, + { + "epoch": 1.3564088696592753, + "grad_norm": 1.2978575229644775, + "learning_rate": 1.7390381105962023e-05, + "loss": 1.945, + "mean_token_accuracy": 0.5583740472793579, + "num_tokens": 2565537395.0, + "step": 5016 + }, + { + "epoch": 1.356679286100595, + "grad_norm": 1.094035029411316, + "learning_rate": 1.7389270908900723e-05, + "loss": 1.9883, + "mean_token_accuracy": 0.5398435592651367, + "num_tokens": 2566061611.0, + "step": 5017 + }, + { + "epoch": 1.3569497025419146, + "grad_norm": 1.213266372680664, + "learning_rate": 1.7388160515797804e-05, + "loss": 1.9858, + "mean_token_accuracy": 0.542339026927948, + "num_tokens": 2566585635.0, + "step": 5018 + }, + { + "epoch": 1.3572201189832342, + "grad_norm": 1.3065274953842163, + "learning_rate": 1.7387049926687335e-05, + "loss": 2.074, + "mean_token_accuracy": 0.5458545684814453, + "num_tokens": 2567109917.0, + "step": 5019 + }, + { + "epoch": 1.357490535424554, + "grad_norm": 1.3018484115600586, + "learning_rate": 1.738593914160339e-05, + "loss": 2.0491, + "mean_token_accuracy": 0.5591626763343811, + "num_tokens": 2567592744.0, + "step": 5020 + }, + { + "epoch": 1.3577609518658735, + "grad_norm": 0.6773998737335205, + "learning_rate": 1.7384828160580052e-05, + "loss": 1.1347, + "mean_token_accuracy": 0.698890209197998, + "num_tokens": 2568116756.0, + "step": 5021 + }, + { + "epoch": 1.3580313683071932, + "grad_norm": 2.118263006210327, + "learning_rate": 1.7383716983651414e-05, + "loss": 2.0297, + "mean_token_accuracy": 0.539855420589447, + "num_tokens": 2568641024.0, + "step": 5022 + }, + { + "epoch": 1.3583017847485128, + "grad_norm": 1.3444862365722656, + "learning_rate": 1.7382605610851558e-05, + "loss": 1.9879, + "mean_token_accuracy": 0.5339993834495544, + "num_tokens": 2569165292.0, + "step": 5023 + }, + { + "epoch": 1.3585722011898325, + "grad_norm": 1.5127143859863281, + "learning_rate": 1.7381494042214597e-05, + "loss": 1.9752, + "mean_token_accuracy": 0.5481358766555786, + "num_tokens": 2569689465.0, + "step": 5024 + }, + { + "epoch": 1.358842617631152, + "grad_norm": 1.4262502193450928, + "learning_rate": 1.738038227777463e-05, + "loss": 2.0856, + "mean_token_accuracy": 0.5471370220184326, + "num_tokens": 2570213667.0, + "step": 5025 + }, + { + "epoch": 1.3591130340724715, + "grad_norm": 1.812287449836731, + "learning_rate": 1.7379270317565774e-05, + "loss": 1.6365, + "mean_token_accuracy": 0.6170958280563354, + "num_tokens": 2570737829.0, + "step": 5026 + }, + { + "epoch": 1.3593834505137912, + "grad_norm": 1.8245359659194946, + "learning_rate": 1.737815816162214e-05, + "loss": 2.0301, + "mean_token_accuracy": 0.5563479065895081, + "num_tokens": 2571261996.0, + "step": 5027 + }, + { + "epoch": 1.3596538669551108, + "grad_norm": 1.6121892929077148, + "learning_rate": 1.737704580997786e-05, + "loss": 2.042, + "mean_token_accuracy": 0.5279216170310974, + "num_tokens": 2571786065.0, + "step": 5028 + }, + { + "epoch": 1.3599242833964305, + "grad_norm": 1.0906398296356201, + "learning_rate": 1.7375933262667063e-05, + "loss": 2.0585, + "mean_token_accuracy": 0.5290613174438477, + "num_tokens": 2572310328.0, + "step": 5029 + }, + { + "epoch": 1.36019469983775, + "grad_norm": 1.497130036354065, + "learning_rate": 1.7374820519723878e-05, + "loss": 2.0489, + "mean_token_accuracy": 0.539331316947937, + "num_tokens": 2572834398.0, + "step": 5030 + }, + { + "epoch": 1.3604651162790697, + "grad_norm": 1.4677355289459229, + "learning_rate": 1.7373707581182453e-05, + "loss": 1.9833, + "mean_token_accuracy": 0.5460641384124756, + "num_tokens": 2573358645.0, + "step": 5031 + }, + { + "epoch": 1.3607355327203894, + "grad_norm": 1.4831047058105469, + "learning_rate": 1.737259444707694e-05, + "loss": 1.9189, + "mean_token_accuracy": 0.5693652033805847, + "num_tokens": 2573882725.0, + "step": 5032 + }, + { + "epoch": 1.361005949161709, + "grad_norm": 1.3335586786270142, + "learning_rate": 1.7371481117441483e-05, + "loss": 1.9937, + "mean_token_accuracy": 0.5414513349533081, + "num_tokens": 2574406935.0, + "step": 5033 + }, + { + "epoch": 1.3612763656030287, + "grad_norm": 1.576741337776184, + "learning_rate": 1.737036759231025e-05, + "loss": 1.9939, + "mean_token_accuracy": 0.543835461139679, + "num_tokens": 2574890593.0, + "step": 5034 + }, + { + "epoch": 1.3615467820443483, + "grad_norm": 2.8999783992767334, + "learning_rate": 1.736925387171741e-05, + "loss": 1.9396, + "mean_token_accuracy": 0.5816271305084229, + "num_tokens": 2575414871.0, + "step": 5035 + }, + { + "epoch": 1.361817198485668, + "grad_norm": 4.850346088409424, + "learning_rate": 1.7368139955697124e-05, + "loss": 1.9258, + "mean_token_accuracy": 0.5661320686340332, + "num_tokens": 2575939106.0, + "step": 5036 + }, + { + "epoch": 1.3620876149269876, + "grad_norm": 2.238370180130005, + "learning_rate": 1.736702584428358e-05, + "loss": 1.9777, + "mean_token_accuracy": 0.561696469783783, + "num_tokens": 2576463280.0, + "step": 5037 + }, + { + "epoch": 1.3623580313683072, + "grad_norm": 1.8310906887054443, + "learning_rate": 1.7365911537510956e-05, + "loss": 1.9045, + "mean_token_accuracy": 0.5670619010925293, + "num_tokens": 2576912102.0, + "step": 5038 + }, + { + "epoch": 1.3626284478096269, + "grad_norm": 1.5714582204818726, + "learning_rate": 1.7364797035413447e-05, + "loss": 2.0618, + "mean_token_accuracy": 0.5407388210296631, + "num_tokens": 2577436360.0, + "step": 5039 + }, + { + "epoch": 1.3628988642509465, + "grad_norm": 1.8701339960098267, + "learning_rate": 1.7363682338025244e-05, + "loss": 2.0866, + "mean_token_accuracy": 0.5477234721183777, + "num_tokens": 2577948723.0, + "step": 5040 + }, + { + "epoch": 1.3631692806922662, + "grad_norm": 0.6620078682899475, + "learning_rate": 1.7362567445380557e-05, + "loss": 1.1197, + "mean_token_accuracy": 0.6868652105331421, + "num_tokens": 2578472983.0, + "step": 5041 + }, + { + "epoch": 1.3634396971335856, + "grad_norm": 2.178895950317383, + "learning_rate": 1.736145235751359e-05, + "loss": 2.0826, + "mean_token_accuracy": 0.541749119758606, + "num_tokens": 2578997255.0, + "step": 5042 + }, + { + "epoch": 1.3637101135749052, + "grad_norm": 1.579623818397522, + "learning_rate": 1.7360337074458555e-05, + "loss": 1.9213, + "mean_token_accuracy": 0.5590760707855225, + "num_tokens": 2579521486.0, + "step": 5043 + }, + { + "epoch": 1.3639805300162249, + "grad_norm": 1.56797194480896, + "learning_rate": 1.7359221596249675e-05, + "loss": 2.0855, + "mean_token_accuracy": 0.5366601943969727, + "num_tokens": 2580044522.0, + "step": 5044 + }, + { + "epoch": 1.3642509464575445, + "grad_norm": 1.587427020072937, + "learning_rate": 1.7358105922921176e-05, + "loss": 1.9585, + "mean_token_accuracy": 0.5424273610115051, + "num_tokens": 2580546846.0, + "step": 5045 + }, + { + "epoch": 1.3645213628988642, + "grad_norm": 1.4947428703308105, + "learning_rate": 1.7356990054507288e-05, + "loss": 2.0206, + "mean_token_accuracy": 0.5458498597145081, + "num_tokens": 2581071079.0, + "step": 5046 + }, + { + "epoch": 1.3647917793401838, + "grad_norm": 1.5426310300827026, + "learning_rate": 1.735587399104225e-05, + "loss": 2.0012, + "mean_token_accuracy": 0.5430874824523926, + "num_tokens": 2581580288.0, + "step": 5047 + }, + { + "epoch": 1.3650621957815035, + "grad_norm": 1.4918321371078491, + "learning_rate": 1.7354757732560304e-05, + "loss": 2.0834, + "mean_token_accuracy": 0.5334030389785767, + "num_tokens": 2582104478.0, + "step": 5048 + }, + { + "epoch": 1.365332612222823, + "grad_norm": 1.45299232006073, + "learning_rate": 1.7353641279095708e-05, + "loss": 2.1182, + "mean_token_accuracy": 0.5429590344429016, + "num_tokens": 2582594224.0, + "step": 5049 + }, + { + "epoch": 1.3656030286641427, + "grad_norm": 1.4943397045135498, + "learning_rate": 1.7352524630682707e-05, + "loss": 2.0349, + "mean_token_accuracy": 0.5584729313850403, + "num_tokens": 2583118373.0, + "step": 5050 + }, + { + "epoch": 1.3658734451054624, + "grad_norm": 1.23873770236969, + "learning_rate": 1.7351407787355577e-05, + "loss": 2.0975, + "mean_token_accuracy": 0.5395997762680054, + "num_tokens": 2583641395.0, + "step": 5051 + }, + { + "epoch": 1.366143861546782, + "grad_norm": 1.3328163623809814, + "learning_rate": 1.735029074914857e-05, + "loss": 2.0106, + "mean_token_accuracy": 0.5625768899917603, + "num_tokens": 2584096483.0, + "step": 5052 + }, + { + "epoch": 1.3664142779881017, + "grad_norm": 1.3551148176193237, + "learning_rate": 1.734917351609597e-05, + "loss": 1.9776, + "mean_token_accuracy": 0.5365749597549438, + "num_tokens": 2584620674.0, + "step": 5053 + }, + { + "epoch": 1.3666846944294213, + "grad_norm": 1.1737381219863892, + "learning_rate": 1.7348056088232058e-05, + "loss": 2.0727, + "mean_token_accuracy": 0.5119069814682007, + "num_tokens": 2585144902.0, + "step": 5054 + }, + { + "epoch": 1.366955110870741, + "grad_norm": 1.1483832597732544, + "learning_rate": 1.7346938465591113e-05, + "loss": 1.8654, + "mean_token_accuracy": 0.5706989765167236, + "num_tokens": 2585669122.0, + "step": 5055 + }, + { + "epoch": 1.3672255273120606, + "grad_norm": 1.2310525178909302, + "learning_rate": 1.7345820648207434e-05, + "loss": 1.9495, + "mean_token_accuracy": 0.5529788136482239, + "num_tokens": 2586193309.0, + "step": 5056 + }, + { + "epoch": 1.3674959437533802, + "grad_norm": 1.2462153434753418, + "learning_rate": 1.7344702636115315e-05, + "loss": 1.9955, + "mean_token_accuracy": 0.5410804748535156, + "num_tokens": 2586717519.0, + "step": 5057 + }, + { + "epoch": 1.3677663601946999, + "grad_norm": 1.3087303638458252, + "learning_rate": 1.7343584429349055e-05, + "loss": 1.9996, + "mean_token_accuracy": 0.5462269186973572, + "num_tokens": 2587241799.0, + "step": 5058 + }, + { + "epoch": 1.3680367766360195, + "grad_norm": 1.2454400062561035, + "learning_rate": 1.7342466027942976e-05, + "loss": 2.0298, + "mean_token_accuracy": 0.5419814586639404, + "num_tokens": 2587746059.0, + "step": 5059 + }, + { + "epoch": 1.3683071930773392, + "grad_norm": 1.3237406015396118, + "learning_rate": 1.7341347431931385e-05, + "loss": 1.9877, + "mean_token_accuracy": 0.530280351638794, + "num_tokens": 2588270202.0, + "step": 5060 + }, + { + "epoch": 1.3685776095186588, + "grad_norm": 0.7056185007095337, + "learning_rate": 1.7340228641348608e-05, + "loss": 1.1693, + "mean_token_accuracy": 0.6907209157943726, + "num_tokens": 2588794377.0, + "step": 5061 + }, + { + "epoch": 1.3688480259599785, + "grad_norm": 2.330122947692871, + "learning_rate": 1.7339109656228967e-05, + "loss": 2.0946, + "mean_token_accuracy": 0.5246222019195557, + "num_tokens": 2589318651.0, + "step": 5062 + }, + { + "epoch": 1.369118442401298, + "grad_norm": 1.6698764562606812, + "learning_rate": 1.7337990476606803e-05, + "loss": 1.9417, + "mean_token_accuracy": 0.5579730272293091, + "num_tokens": 2589842925.0, + "step": 5063 + }, + { + "epoch": 1.3693888588426177, + "grad_norm": 1.5513203144073486, + "learning_rate": 1.733687110251645e-05, + "loss": 2.0782, + "mean_token_accuracy": 0.5289382934570312, + "num_tokens": 2590367165.0, + "step": 5064 + }, + { + "epoch": 1.3696592752839374, + "grad_norm": 2.086993455886841, + "learning_rate": 1.7335751533992257e-05, + "loss": 1.9975, + "mean_token_accuracy": 0.5603768825531006, + "num_tokens": 2590835205.0, + "step": 5065 + }, + { + "epoch": 1.369929691725257, + "grad_norm": 1.3521945476531982, + "learning_rate": 1.7334631771068573e-05, + "loss": 2.0037, + "mean_token_accuracy": 0.5540899038314819, + "num_tokens": 2591359464.0, + "step": 5066 + }, + { + "epoch": 1.3702001081665764, + "grad_norm": 1.7220100164413452, + "learning_rate": 1.733351181377976e-05, + "loss": 2.1173, + "mean_token_accuracy": 0.5350637435913086, + "num_tokens": 2591836289.0, + "step": 5067 + }, + { + "epoch": 1.370470524607896, + "grad_norm": 1.852205753326416, + "learning_rate": 1.733239166216018e-05, + "loss": 2.1481, + "mean_token_accuracy": 0.5396651029586792, + "num_tokens": 2592338365.0, + "step": 5068 + }, + { + "epoch": 1.3707409410492157, + "grad_norm": 1.4290562868118286, + "learning_rate": 1.7331271316244198e-05, + "loss": 1.8403, + "mean_token_accuracy": 0.6070876717567444, + "num_tokens": 2592855109.0, + "step": 5069 + }, + { + "epoch": 1.3710113574905354, + "grad_norm": 1.2780598402023315, + "learning_rate": 1.7330150776066195e-05, + "loss": 2.0151, + "mean_token_accuracy": 0.5458747148513794, + "num_tokens": 2593350930.0, + "step": 5070 + }, + { + "epoch": 1.371281773931855, + "grad_norm": 1.3321855068206787, + "learning_rate": 1.7329030041660548e-05, + "loss": 2.0842, + "mean_token_accuracy": 0.5395044684410095, + "num_tokens": 2593875142.0, + "step": 5071 + }, + { + "epoch": 1.3715521903731747, + "grad_norm": 1.1484075784683228, + "learning_rate": 1.7327909113061648e-05, + "loss": 1.9535, + "mean_token_accuracy": 0.5733497142791748, + "num_tokens": 2594304180.0, + "step": 5072 + }, + { + "epoch": 1.3718226068144943, + "grad_norm": 1.205263376235962, + "learning_rate": 1.7326787990303887e-05, + "loss": 1.9611, + "mean_token_accuracy": 0.5645825862884521, + "num_tokens": 2594781260.0, + "step": 5073 + }, + { + "epoch": 1.372093023255814, + "grad_norm": 1.3217452764511108, + "learning_rate": 1.732566667342167e-05, + "loss": 1.9676, + "mean_token_accuracy": 0.5508328080177307, + "num_tokens": 2595305529.0, + "step": 5074 + }, + { + "epoch": 1.3723634396971336, + "grad_norm": 1.261202335357666, + "learning_rate": 1.7324545162449393e-05, + "loss": 1.9712, + "mean_token_accuracy": 0.5527539253234863, + "num_tokens": 2595829739.0, + "step": 5075 + }, + { + "epoch": 1.3726338561384532, + "grad_norm": 1.0614383220672607, + "learning_rate": 1.7323423457421468e-05, + "loss": 1.7367, + "mean_token_accuracy": 0.6148059368133545, + "num_tokens": 2596353966.0, + "step": 5076 + }, + { + "epoch": 1.3729042725797729, + "grad_norm": 1.4070848226547241, + "learning_rate": 1.732230155837232e-05, + "loss": 1.9469, + "mean_token_accuracy": 0.5699936151504517, + "num_tokens": 2596851658.0, + "step": 5077 + }, + { + "epoch": 1.3731746890210925, + "grad_norm": 1.3083302974700928, + "learning_rate": 1.7321179465336363e-05, + "loss": 2.0269, + "mean_token_accuracy": 0.5291922092437744, + "num_tokens": 2597375899.0, + "step": 5078 + }, + { + "epoch": 1.3734451054624122, + "grad_norm": 1.4046653509140015, + "learning_rate": 1.7320057178348035e-05, + "loss": 2.0371, + "mean_token_accuracy": 0.5557693243026733, + "num_tokens": 2597899981.0, + "step": 5079 + }, + { + "epoch": 1.3737155219037318, + "grad_norm": 1.6697427034378052, + "learning_rate": 1.7318934697441763e-05, + "loss": 1.674, + "mean_token_accuracy": 0.6170767545700073, + "num_tokens": 2598424180.0, + "step": 5080 + }, + { + "epoch": 1.3739859383450514, + "grad_norm": 0.6324762105941772, + "learning_rate": 1.7317812022651994e-05, + "loss": 1.214, + "mean_token_accuracy": 0.6804065704345703, + "num_tokens": 2598948395.0, + "step": 5081 + }, + { + "epoch": 1.374256354786371, + "grad_norm": 2.1453871726989746, + "learning_rate": 1.731668915401317e-05, + "loss": 1.9203, + "mean_token_accuracy": 0.5670968294143677, + "num_tokens": 2599467328.0, + "step": 5082 + }, + { + "epoch": 1.3745267712276905, + "grad_norm": 2.117210626602173, + "learning_rate": 1.731556609155975e-05, + "loss": 2.0515, + "mean_token_accuracy": 0.5469251275062561, + "num_tokens": 2599991596.0, + "step": 5083 + }, + { + "epoch": 1.3747971876690102, + "grad_norm": 1.6533242464065552, + "learning_rate": 1.731444283532619e-05, + "loss": 2.0889, + "mean_token_accuracy": 0.5424855351448059, + "num_tokens": 2600515877.0, + "step": 5084 + }, + { + "epoch": 1.3750676041103298, + "grad_norm": 1.3902759552001953, + "learning_rate": 1.731331938534695e-05, + "loss": 1.8568, + "mean_token_accuracy": 0.5704597234725952, + "num_tokens": 2600984983.0, + "step": 5085 + }, + { + "epoch": 1.3753380205516494, + "grad_norm": 1.4161981344223022, + "learning_rate": 1.731219574165651e-05, + "loss": 1.7984, + "mean_token_accuracy": 0.5802364945411682, + "num_tokens": 2601509107.0, + "step": 5086 + }, + { + "epoch": 1.375608436992969, + "grad_norm": 1.6011797189712524, + "learning_rate": 1.7311071904289337e-05, + "loss": 2.0889, + "mean_token_accuracy": 0.5355685353279114, + "num_tokens": 2602033333.0, + "step": 5087 + }, + { + "epoch": 1.3758788534342887, + "grad_norm": 1.5180100202560425, + "learning_rate": 1.730994787327992e-05, + "loss": 2.097, + "mean_token_accuracy": 0.5354644656181335, + "num_tokens": 2602557563.0, + "step": 5088 + }, + { + "epoch": 1.3761492698756084, + "grad_norm": 1.4355965852737427, + "learning_rate": 1.7308823648662746e-05, + "loss": 2.0759, + "mean_token_accuracy": 0.5288656949996948, + "num_tokens": 2603070893.0, + "step": 5089 + }, + { + "epoch": 1.376419686316928, + "grad_norm": 1.3190544843673706, + "learning_rate": 1.7307699230472313e-05, + "loss": 2.1128, + "mean_token_accuracy": 0.5337517261505127, + "num_tokens": 2603595116.0, + "step": 5090 + }, + { + "epoch": 1.3766901027582477, + "grad_norm": 1.600860595703125, + "learning_rate": 1.7306574618743114e-05, + "loss": 1.9748, + "mean_token_accuracy": 0.5492884516716003, + "num_tokens": 2604119322.0, + "step": 5091 + }, + { + "epoch": 1.3769605191995673, + "grad_norm": 1.5445014238357544, + "learning_rate": 1.730544981350966e-05, + "loss": 2.0885, + "mean_token_accuracy": 0.5477603673934937, + "num_tokens": 2604643499.0, + "step": 5092 + }, + { + "epoch": 1.377230935640887, + "grad_norm": 1.3135268688201904, + "learning_rate": 1.7304324814806463e-05, + "loss": 2.1225, + "mean_token_accuracy": 0.5389961004257202, + "num_tokens": 2605131425.0, + "step": 5093 + }, + { + "epoch": 1.3775013520822066, + "grad_norm": 1.4892756938934326, + "learning_rate": 1.730319962266804e-05, + "loss": 1.9103, + "mean_token_accuracy": 0.5680386424064636, + "num_tokens": 2605655507.0, + "step": 5094 + }, + { + "epoch": 1.3777717685235262, + "grad_norm": 1.484893798828125, + "learning_rate": 1.7302074237128914e-05, + "loss": 2.0952, + "mean_token_accuracy": 0.5351777076721191, + "num_tokens": 2606179729.0, + "step": 5095 + }, + { + "epoch": 1.3780421849648459, + "grad_norm": 1.3775516748428345, + "learning_rate": 1.730094865822362e-05, + "loss": 2.1088, + "mean_token_accuracy": 0.5415634512901306, + "num_tokens": 2606668637.0, + "step": 5096 + }, + { + "epoch": 1.3783126014061655, + "grad_norm": 1.3581321239471436, + "learning_rate": 1.729982288598669e-05, + "loss": 1.9978, + "mean_token_accuracy": 0.5536658763885498, + "num_tokens": 2607140337.0, + "step": 5097 + }, + { + "epoch": 1.3785830178474852, + "grad_norm": 1.362284541130066, + "learning_rate": 1.729869692045267e-05, + "loss": 2.0103, + "mean_token_accuracy": 0.5545284748077393, + "num_tokens": 2607628879.0, + "step": 5098 + }, + { + "epoch": 1.3788534342888048, + "grad_norm": 1.5611332654953003, + "learning_rate": 1.7297570761656102e-05, + "loss": 2.0568, + "mean_token_accuracy": 0.541786789894104, + "num_tokens": 2608153049.0, + "step": 5099 + }, + { + "epoch": 1.3791238507301244, + "grad_norm": 1.2206144332885742, + "learning_rate": 1.7296444409631547e-05, + "loss": 1.952, + "mean_token_accuracy": 0.5623746514320374, + "num_tokens": 2608677186.0, + "step": 5100 + }, + { + "epoch": 1.379394267171444, + "grad_norm": 0.7804016470909119, + "learning_rate": 1.7295317864413555e-05, + "loss": 1.1459, + "mean_token_accuracy": 0.6826323866844177, + "num_tokens": 2609181729.0, + "step": 5101 + }, + { + "epoch": 1.3796646836127637, + "grad_norm": 2.202195644378662, + "learning_rate": 1.72941911260367e-05, + "loss": 1.9985, + "mean_token_accuracy": 0.5746241807937622, + "num_tokens": 2609643390.0, + "step": 5102 + }, + { + "epoch": 1.3799351000540834, + "grad_norm": 1.5610212087631226, + "learning_rate": 1.7293064194535557e-05, + "loss": 1.9198, + "mean_token_accuracy": 0.5616257190704346, + "num_tokens": 2610167551.0, + "step": 5103 + }, + { + "epoch": 1.380205516495403, + "grad_norm": 1.1965525150299072, + "learning_rate": 1.7291937069944692e-05, + "loss": 2.0854, + "mean_token_accuracy": 0.5395860075950623, + "num_tokens": 2610647850.0, + "step": 5104 + }, + { + "epoch": 1.3804759329367227, + "grad_norm": 1.5401113033294678, + "learning_rate": 1.7290809752298694e-05, + "loss": 2.0216, + "mean_token_accuracy": 0.5401334762573242, + "num_tokens": 2611172102.0, + "step": 5105 + }, + { + "epoch": 1.3807463493780423, + "grad_norm": 1.3784306049346924, + "learning_rate": 1.7289682241632153e-05, + "loss": 2.099, + "mean_token_accuracy": 0.5181728601455688, + "num_tokens": 2611696380.0, + "step": 5106 + }, + { + "epoch": 1.381016765819362, + "grad_norm": 1.251950979232788, + "learning_rate": 1.7288554537979664e-05, + "loss": 1.9212, + "mean_token_accuracy": 0.5475156903266907, + "num_tokens": 2612167095.0, + "step": 5107 + }, + { + "epoch": 1.3812871822606814, + "grad_norm": 1.2996203899383545, + "learning_rate": 1.7287426641375832e-05, + "loss": 1.9781, + "mean_token_accuracy": 0.5602744221687317, + "num_tokens": 2612691363.0, + "step": 5108 + }, + { + "epoch": 1.381557598702001, + "grad_norm": 1.3288673162460327, + "learning_rate": 1.7286298551855255e-05, + "loss": 1.9757, + "mean_token_accuracy": 0.556127667427063, + "num_tokens": 2613215471.0, + "step": 5109 + }, + { + "epoch": 1.3818280151433207, + "grad_norm": 1.365355134010315, + "learning_rate": 1.7285170269452558e-05, + "loss": 1.9256, + "mean_token_accuracy": 0.5766284465789795, + "num_tokens": 2613710285.0, + "step": 5110 + }, + { + "epoch": 1.3820984315846403, + "grad_norm": 1.600954532623291, + "learning_rate": 1.728404179420235e-05, + "loss": 2.1053, + "mean_token_accuracy": 0.5315424203872681, + "num_tokens": 2614229707.0, + "step": 5111 + }, + { + "epoch": 1.38236884802596, + "grad_norm": 1.494405746459961, + "learning_rate": 1.7282913126139258e-05, + "loss": 1.9494, + "mean_token_accuracy": 0.541426956653595, + "num_tokens": 2614753965.0, + "step": 5112 + }, + { + "epoch": 1.3826392644672796, + "grad_norm": 1.1575942039489746, + "learning_rate": 1.728178426529792e-05, + "loss": 2.0945, + "mean_token_accuracy": 0.5386977195739746, + "num_tokens": 2615266825.0, + "step": 5113 + }, + { + "epoch": 1.3829096809085992, + "grad_norm": 1.3978936672210693, + "learning_rate": 1.7280655211712964e-05, + "loss": 1.9561, + "mean_token_accuracy": 0.558417797088623, + "num_tokens": 2615767818.0, + "step": 5114 + }, + { + "epoch": 1.3831800973499189, + "grad_norm": 1.2269337177276611, + "learning_rate": 1.727952596541903e-05, + "loss": 2.0326, + "mean_token_accuracy": 0.5490029454231262, + "num_tokens": 2616256918.0, + "step": 5115 + }, + { + "epoch": 1.3834505137912385, + "grad_norm": 1.465737223625183, + "learning_rate": 1.727839652645078e-05, + "loss": 2.0276, + "mean_token_accuracy": 0.5449993014335632, + "num_tokens": 2616781107.0, + "step": 5116 + }, + { + "epoch": 1.3837209302325582, + "grad_norm": 1.7352662086486816, + "learning_rate": 1.727726689484286e-05, + "loss": 1.9789, + "mean_token_accuracy": 0.5648703575134277, + "num_tokens": 2617305263.0, + "step": 5117 + }, + { + "epoch": 1.3839913466738778, + "grad_norm": 1.2503396272659302, + "learning_rate": 1.7276137070629927e-05, + "loss": 2.0606, + "mean_token_accuracy": 0.5337902903556824, + "num_tokens": 2617829365.0, + "step": 5118 + }, + { + "epoch": 1.3842617631151974, + "grad_norm": 1.4701480865478516, + "learning_rate": 1.7275007053846656e-05, + "loss": 1.983, + "mean_token_accuracy": 0.5571414828300476, + "num_tokens": 2618301965.0, + "step": 5119 + }, + { + "epoch": 1.384532179556517, + "grad_norm": 1.4256224632263184, + "learning_rate": 1.7273876844527717e-05, + "loss": 2.1316, + "mean_token_accuracy": 0.504104733467102, + "num_tokens": 2618791566.0, + "step": 5120 + }, + { + "epoch": 1.3848025959978367, + "grad_norm": 0.62373286485672, + "learning_rate": 1.727274644270778e-05, + "loss": 1.2246, + "mean_token_accuracy": 0.6670860648155212, + "num_tokens": 2619315834.0, + "step": 5121 + }, + { + "epoch": 1.3850730124391564, + "grad_norm": 2.810283660888672, + "learning_rate": 1.727161584842154e-05, + "loss": 2.0635, + "mean_token_accuracy": 0.5488122701644897, + "num_tokens": 2619839986.0, + "step": 5122 + }, + { + "epoch": 1.385343428880476, + "grad_norm": 1.8872331380844116, + "learning_rate": 1.7270485061703683e-05, + "loss": 2.0652, + "mean_token_accuracy": 0.5389935970306396, + "num_tokens": 2620364194.0, + "step": 5123 + }, + { + "epoch": 1.3856138453217954, + "grad_norm": 1.9085142612457275, + "learning_rate": 1.72693540825889e-05, + "loss": 2.0124, + "mean_token_accuracy": 0.5626856088638306, + "num_tokens": 2620888481.0, + "step": 5124 + }, + { + "epoch": 1.385884261763115, + "grad_norm": 1.593847632408142, + "learning_rate": 1.72682229111119e-05, + "loss": 1.9578, + "mean_token_accuracy": 0.5421900153160095, + "num_tokens": 2621412758.0, + "step": 5125 + }, + { + "epoch": 1.3861546782044347, + "grad_norm": 1.3871723413467407, + "learning_rate": 1.7267091547307386e-05, + "loss": 2.0271, + "mean_token_accuracy": 0.5465254783630371, + "num_tokens": 2621937022.0, + "step": 5126 + }, + { + "epoch": 1.3864250946457544, + "grad_norm": 1.8532466888427734, + "learning_rate": 1.726595999121007e-05, + "loss": 2.0334, + "mean_token_accuracy": 0.5623964071273804, + "num_tokens": 2622461299.0, + "step": 5127 + }, + { + "epoch": 1.386695511087074, + "grad_norm": 1.5394436120986938, + "learning_rate": 1.7264828242854677e-05, + "loss": 2.0758, + "mean_token_accuracy": 0.5429856777191162, + "num_tokens": 2622907692.0, + "step": 5128 + }, + { + "epoch": 1.3869659275283936, + "grad_norm": 1.4069722890853882, + "learning_rate": 1.726369630227593e-05, + "loss": 2.0901, + "mean_token_accuracy": 0.5385890007019043, + "num_tokens": 2623374135.0, + "step": 5129 + }, + { + "epoch": 1.3872363439697133, + "grad_norm": 1.3069179058074951, + "learning_rate": 1.7262564169508563e-05, + "loss": 2.0701, + "mean_token_accuracy": 0.5163558125495911, + "num_tokens": 2623898378.0, + "step": 5130 + }, + { + "epoch": 1.387506760411033, + "grad_norm": 1.5208101272583008, + "learning_rate": 1.7261431844587304e-05, + "loss": 2.0728, + "mean_token_accuracy": 0.5447516441345215, + "num_tokens": 2624422594.0, + "step": 5131 + }, + { + "epoch": 1.3877771768523526, + "grad_norm": 1.513311505317688, + "learning_rate": 1.7260299327546908e-05, + "loss": 1.8833, + "mean_token_accuracy": 0.5912922620773315, + "num_tokens": 2624946784.0, + "step": 5132 + }, + { + "epoch": 1.3880475932936722, + "grad_norm": 1.4127566814422607, + "learning_rate": 1.725916661842212e-05, + "loss": 1.8332, + "mean_token_accuracy": 0.5626073479652405, + "num_tokens": 2625470992.0, + "step": 5133 + }, + { + "epoch": 1.3883180097349919, + "grad_norm": 1.616169810295105, + "learning_rate": 1.7258033717247686e-05, + "loss": 2.0652, + "mean_token_accuracy": 0.5253136157989502, + "num_tokens": 2625991487.0, + "step": 5134 + }, + { + "epoch": 1.3885884261763115, + "grad_norm": 1.6029714345932007, + "learning_rate": 1.7256900624058375e-05, + "loss": 1.9689, + "mean_token_accuracy": 0.5560745596885681, + "num_tokens": 2626513344.0, + "step": 5135 + }, + { + "epoch": 1.3888588426176312, + "grad_norm": 1.3293356895446777, + "learning_rate": 1.725576733888896e-05, + "loss": 1.9653, + "mean_token_accuracy": 0.5434044003486633, + "num_tokens": 2627037593.0, + "step": 5136 + }, + { + "epoch": 1.3891292590589508, + "grad_norm": 1.3204131126403809, + "learning_rate": 1.7254633861774196e-05, + "loss": 2.0008, + "mean_token_accuracy": 0.5553334951400757, + "num_tokens": 2627516260.0, + "step": 5137 + }, + { + "epoch": 1.3893996755002704, + "grad_norm": 1.308821678161621, + "learning_rate": 1.7253500192748876e-05, + "loss": 1.9984, + "mean_token_accuracy": 0.5628319382667542, + "num_tokens": 2628026389.0, + "step": 5138 + }, + { + "epoch": 1.38967009194159, + "grad_norm": 1.3200889825820923, + "learning_rate": 1.725236633184778e-05, + "loss": 1.9302, + "mean_token_accuracy": 0.5544701814651489, + "num_tokens": 2628476271.0, + "step": 5139 + }, + { + "epoch": 1.3899405083829097, + "grad_norm": 1.1440235376358032, + "learning_rate": 1.7251232279105697e-05, + "loss": 1.6644, + "mean_token_accuracy": 0.5812032222747803, + "num_tokens": 2628987828.0, + "step": 5140 + }, + { + "epoch": 1.3902109248242294, + "grad_norm": 0.706315279006958, + "learning_rate": 1.7250098034557427e-05, + "loss": 1.1264, + "mean_token_accuracy": 0.6891970038414001, + "num_tokens": 2629512002.0, + "step": 5141 + }, + { + "epoch": 1.390481341265549, + "grad_norm": 2.2691004276275635, + "learning_rate": 1.7248963598237767e-05, + "loss": 2.1227, + "mean_token_accuracy": 0.5147948265075684, + "num_tokens": 2630036184.0, + "step": 5142 + }, + { + "epoch": 1.3907517577068687, + "grad_norm": 1.6474499702453613, + "learning_rate": 1.7247828970181527e-05, + "loss": 1.752, + "mean_token_accuracy": 0.5768718719482422, + "num_tokens": 2630560399.0, + "step": 5143 + }, + { + "epoch": 1.3910221741481883, + "grad_norm": 1.5296194553375244, + "learning_rate": 1.724669415042352e-05, + "loss": 1.9216, + "mean_token_accuracy": 0.5498871803283691, + "num_tokens": 2631084650.0, + "step": 5144 + }, + { + "epoch": 1.391292590589508, + "grad_norm": 1.8037201166152954, + "learning_rate": 1.7245559138998565e-05, + "loss": 1.9979, + "mean_token_accuracy": 0.5428589582443237, + "num_tokens": 2631608920.0, + "step": 5145 + }, + { + "epoch": 1.3915630070308276, + "grad_norm": 42.9239387512207, + "learning_rate": 1.7244423935941493e-05, + "loss": 1.7392, + "mean_token_accuracy": 0.6006199717521667, + "num_tokens": 2632133142.0, + "step": 5146 + }, + { + "epoch": 1.3918334234721472, + "grad_norm": 1.833353877067566, + "learning_rate": 1.7243288541287123e-05, + "loss": 1.9242, + "mean_token_accuracy": 0.5694236159324646, + "num_tokens": 2632609676.0, + "step": 5147 + }, + { + "epoch": 1.3921038399134669, + "grad_norm": 1.6818101406097412, + "learning_rate": 1.7242152955070307e-05, + "loss": 2.1185, + "mean_token_accuracy": 0.5225892066955566, + "num_tokens": 2633133908.0, + "step": 5148 + }, + { + "epoch": 1.3923742563547863, + "grad_norm": 1.4164628982543945, + "learning_rate": 1.7241017177325883e-05, + "loss": 1.9717, + "mean_token_accuracy": 0.5459965467453003, + "num_tokens": 2633658136.0, + "step": 5149 + }, + { + "epoch": 1.392644672796106, + "grad_norm": 1.4537310600280762, + "learning_rate": 1.7239881208088692e-05, + "loss": 2.0015, + "mean_token_accuracy": 0.5481759309768677, + "num_tokens": 2634182417.0, + "step": 5150 + }, + { + "epoch": 1.3929150892374256, + "grad_norm": 1.4662796258926392, + "learning_rate": 1.72387450473936e-05, + "loss": 2.0639, + "mean_token_accuracy": 0.545677661895752, + "num_tokens": 2634706582.0, + "step": 5151 + }, + { + "epoch": 1.3931855056787452, + "grad_norm": 1.5167436599731445, + "learning_rate": 1.723760869527546e-05, + "loss": 1.908, + "mean_token_accuracy": 0.5625910758972168, + "num_tokens": 2635230839.0, + "step": 5152 + }, + { + "epoch": 1.3934559221200649, + "grad_norm": 1.4958724975585938, + "learning_rate": 1.723647215176914e-05, + "loss": 2.1169, + "mean_token_accuracy": 0.5360875129699707, + "num_tokens": 2635755091.0, + "step": 5153 + }, + { + "epoch": 1.3937263385613845, + "grad_norm": 1.2415529489517212, + "learning_rate": 1.7235335416909518e-05, + "loss": 2.1117, + "mean_token_accuracy": 0.5099272131919861, + "num_tokens": 2636279308.0, + "step": 5154 + }, + { + "epoch": 1.3939967550027041, + "grad_norm": 1.5077296495437622, + "learning_rate": 1.723419849073147e-05, + "loss": 1.9444, + "mean_token_accuracy": 0.5641509294509888, + "num_tokens": 2636803574.0, + "step": 5155 + }, + { + "epoch": 1.3942671714440238, + "grad_norm": 1.522081971168518, + "learning_rate": 1.7233061373269875e-05, + "loss": 2.0196, + "mean_token_accuracy": 0.5496494770050049, + "num_tokens": 2637327742.0, + "step": 5156 + }, + { + "epoch": 1.3945375878853434, + "grad_norm": 1.924957513809204, + "learning_rate": 1.7231924064559627e-05, + "loss": 1.8659, + "mean_token_accuracy": 0.5445535182952881, + "num_tokens": 2637827843.0, + "step": 5157 + }, + { + "epoch": 1.394808004326663, + "grad_norm": 2.1495048999786377, + "learning_rate": 1.723078656463562e-05, + "loss": 1.9716, + "mean_token_accuracy": 0.5868309140205383, + "num_tokens": 2638289030.0, + "step": 5158 + }, + { + "epoch": 1.3950784207679827, + "grad_norm": 1.9184597730636597, + "learning_rate": 1.7229648873532762e-05, + "loss": 2.0801, + "mean_token_accuracy": 0.5394379496574402, + "num_tokens": 2638813259.0, + "step": 5159 + }, + { + "epoch": 1.3953488372093024, + "grad_norm": 2.0179622173309326, + "learning_rate": 1.7228510991285955e-05, + "loss": 1.9994, + "mean_token_accuracy": 0.5638850331306458, + "num_tokens": 2639293852.0, + "step": 5160 + }, + { + "epoch": 1.395619253650622, + "grad_norm": 0.7706872224807739, + "learning_rate": 1.7227372917930114e-05, + "loss": 1.2067, + "mean_token_accuracy": 0.6664530038833618, + "num_tokens": 2639817933.0, + "step": 5161 + }, + { + "epoch": 1.3958896700919416, + "grad_norm": 2.864077568054199, + "learning_rate": 1.7226234653500157e-05, + "loss": 2.1141, + "mean_token_accuracy": 0.5252162218093872, + "num_tokens": 2640342085.0, + "step": 5162 + }, + { + "epoch": 1.3961600865332613, + "grad_norm": 2.6883552074432373, + "learning_rate": 1.7225096198031012e-05, + "loss": 2.1091, + "mean_token_accuracy": 0.5591278076171875, + "num_tokens": 2640866352.0, + "step": 5163 + }, + { + "epoch": 1.396430502974581, + "grad_norm": 1.8661803007125854, + "learning_rate": 1.7223957551557608e-05, + "loss": 2.1861, + "mean_token_accuracy": 0.5172016620635986, + "num_tokens": 2641341243.0, + "step": 5164 + }, + { + "epoch": 1.3967009194159004, + "grad_norm": 2.1543636322021484, + "learning_rate": 1.7222818714114884e-05, + "loss": 2.1262, + "mean_token_accuracy": 0.5537842512130737, + "num_tokens": 2641801240.0, + "step": 5165 + }, + { + "epoch": 1.39697133585722, + "grad_norm": 1.7765908241271973, + "learning_rate": 1.7221679685737783e-05, + "loss": 2.0966, + "mean_token_accuracy": 0.5306445360183716, + "num_tokens": 2642325463.0, + "step": 5166 + }, + { + "epoch": 1.3972417522985396, + "grad_norm": 1.929215908050537, + "learning_rate": 1.722054046646125e-05, + "loss": 1.9994, + "mean_token_accuracy": 0.5548301935195923, + "num_tokens": 2642849727.0, + "step": 5167 + }, + { + "epoch": 1.3975121687398593, + "grad_norm": 1.611027479171753, + "learning_rate": 1.7219401056320243e-05, + "loss": 1.9226, + "mean_token_accuracy": 0.559377133846283, + "num_tokens": 2643373996.0, + "step": 5168 + }, + { + "epoch": 1.397782585181179, + "grad_norm": 1.6015633344650269, + "learning_rate": 1.721826145534972e-05, + "loss": 1.8997, + "mean_token_accuracy": 0.5550759434700012, + "num_tokens": 2643811179.0, + "step": 5169 + }, + { + "epoch": 1.3980530016224986, + "grad_norm": 1.966338872909546, + "learning_rate": 1.721712166358465e-05, + "loss": 2.0367, + "mean_token_accuracy": 0.5549575090408325, + "num_tokens": 2644335440.0, + "step": 5170 + }, + { + "epoch": 1.3983234180638182, + "grad_norm": 1.6416996717453003, + "learning_rate": 1.7215981681060003e-05, + "loss": 1.9256, + "mean_token_accuracy": 0.5344660878181458, + "num_tokens": 2644859567.0, + "step": 5171 + }, + { + "epoch": 1.3985938345051379, + "grad_norm": 2.0019142627716064, + "learning_rate": 1.721484150781076e-05, + "loss": 2.0225, + "mean_token_accuracy": 0.5582845211029053, + "num_tokens": 2645383612.0, + "step": 5172 + }, + { + "epoch": 1.3988642509464575, + "grad_norm": 2.3696365356445312, + "learning_rate": 1.72137011438719e-05, + "loss": 2.069, + "mean_token_accuracy": 0.5521470308303833, + "num_tokens": 2645870391.0, + "step": 5173 + }, + { + "epoch": 1.3991346673877771, + "grad_norm": 1.534858226776123, + "learning_rate": 1.7212560589278418e-05, + "loss": 2.0607, + "mean_token_accuracy": 0.540841281414032, + "num_tokens": 2646394489.0, + "step": 5174 + }, + { + "epoch": 1.3994050838290968, + "grad_norm": 1.2686582803726196, + "learning_rate": 1.7211419844065308e-05, + "loss": 1.8232, + "mean_token_accuracy": 0.5908459424972534, + "num_tokens": 2646918550.0, + "step": 5175 + }, + { + "epoch": 1.3996755002704164, + "grad_norm": 1.7260632514953613, + "learning_rate": 1.721027890826757e-05, + "loss": 1.997, + "mean_token_accuracy": 0.5559175610542297, + "num_tokens": 2647438651.0, + "step": 5176 + }, + { + "epoch": 1.399945916711736, + "grad_norm": 1.7131034135818481, + "learning_rate": 1.720913778192021e-05, + "loss": 2.1569, + "mean_token_accuracy": 0.5385549068450928, + "num_tokens": 2647903721.0, + "step": 5177 + }, + { + "epoch": 1.4002163331530557, + "grad_norm": 1.6141213178634644, + "learning_rate": 1.7207996465058247e-05, + "loss": 2.0865, + "mean_token_accuracy": 0.5362858772277832, + "num_tokens": 2648395258.0, + "step": 5178 + }, + { + "epoch": 1.4004867495943754, + "grad_norm": 1.4995111227035522, + "learning_rate": 1.720685495771669e-05, + "loss": 2.0888, + "mean_token_accuracy": 0.54454505443573, + "num_tokens": 2648917188.0, + "step": 5179 + }, + { + "epoch": 1.400757166035695, + "grad_norm": 1.2446569204330444, + "learning_rate": 1.7205713259930572e-05, + "loss": 1.9384, + "mean_token_accuracy": 0.5542633533477783, + "num_tokens": 2649441343.0, + "step": 5180 + }, + { + "epoch": 1.4010275824770146, + "grad_norm": 0.621030867099762, + "learning_rate": 1.7204571371734922e-05, + "loss": 1.1641, + "mean_token_accuracy": 0.6895797252655029, + "num_tokens": 2649965581.0, + "step": 5181 + }, + { + "epoch": 1.4012979989183343, + "grad_norm": 1.702921748161316, + "learning_rate": 1.7203429293164776e-05, + "loss": 1.9272, + "mean_token_accuracy": 0.5659936666488647, + "num_tokens": 2650489837.0, + "step": 5182 + }, + { + "epoch": 1.401568415359654, + "grad_norm": 1.8807111978530884, + "learning_rate": 1.7202287024255175e-05, + "loss": 2.05, + "mean_token_accuracy": 0.5371993184089661, + "num_tokens": 2651014022.0, + "step": 5183 + }, + { + "epoch": 1.4018388318009736, + "grad_norm": 1.2827893495559692, + "learning_rate": 1.7201144565041172e-05, + "loss": 2.1527, + "mean_token_accuracy": 0.5393380522727966, + "num_tokens": 2651501565.0, + "step": 5184 + }, + { + "epoch": 1.4021092482422932, + "grad_norm": 1.8541196584701538, + "learning_rate": 1.7200001915557812e-05, + "loss": 2.1185, + "mean_token_accuracy": 0.5391297340393066, + "num_tokens": 2652025820.0, + "step": 5185 + }, + { + "epoch": 1.4023796646836129, + "grad_norm": 1.7406704425811768, + "learning_rate": 1.7198859075840166e-05, + "loss": 2.0186, + "mean_token_accuracy": 0.5539910197257996, + "num_tokens": 2652549976.0, + "step": 5186 + }, + { + "epoch": 1.4026500811249325, + "grad_norm": 1.2513468265533447, + "learning_rate": 1.719771604592329e-05, + "loss": 1.9894, + "mean_token_accuracy": 0.5522551536560059, + "num_tokens": 2653005716.0, + "step": 5187 + }, + { + "epoch": 1.4029204975662521, + "grad_norm": 1.6584442853927612, + "learning_rate": 1.719657282584226e-05, + "loss": 2.0874, + "mean_token_accuracy": 0.5373998880386353, + "num_tokens": 2653529980.0, + "step": 5188 + }, + { + "epoch": 1.4031909140075718, + "grad_norm": 1.5061352252960205, + "learning_rate": 1.7195429415632153e-05, + "loss": 1.9221, + "mean_token_accuracy": 0.5503109693527222, + "num_tokens": 2654054174.0, + "step": 5189 + }, + { + "epoch": 1.4034613304488914, + "grad_norm": 1.4271754026412964, + "learning_rate": 1.7194285815328055e-05, + "loss": 2.006, + "mean_token_accuracy": 0.5515874028205872, + "num_tokens": 2654578375.0, + "step": 5190 + }, + { + "epoch": 1.4037317468902109, + "grad_norm": 1.6106953620910645, + "learning_rate": 1.7193142024965047e-05, + "loss": 2.0829, + "mean_token_accuracy": 0.556119978427887, + "num_tokens": 2655072456.0, + "step": 5191 + }, + { + "epoch": 1.4040021633315305, + "grad_norm": 1.3388848304748535, + "learning_rate": 1.719199804457823e-05, + "loss": 1.9265, + "mean_token_accuracy": 0.5577929019927979, + "num_tokens": 2655555198.0, + "step": 5192 + }, + { + "epoch": 1.4042725797728501, + "grad_norm": 1.6692912578582764, + "learning_rate": 1.7190853874202708e-05, + "loss": 2.059, + "mean_token_accuracy": 0.5421401262283325, + "num_tokens": 2656079403.0, + "step": 5193 + }, + { + "epoch": 1.4045429962141698, + "grad_norm": 1.1641148328781128, + "learning_rate": 1.7189709513873575e-05, + "loss": 1.8732, + "mean_token_accuracy": 0.5700050592422485, + "num_tokens": 2656603663.0, + "step": 5194 + }, + { + "epoch": 1.4048134126554894, + "grad_norm": 1.686324119567871, + "learning_rate": 1.7188564963625957e-05, + "loss": 2.0916, + "mean_token_accuracy": 0.5453693866729736, + "num_tokens": 2657127800.0, + "step": 5195 + }, + { + "epoch": 1.405083829096809, + "grad_norm": 1.564271092414856, + "learning_rate": 1.7187420223494965e-05, + "loss": 2.0773, + "mean_token_accuracy": 0.5688326358795166, + "num_tokens": 2657528701.0, + "step": 5196 + }, + { + "epoch": 1.4053542455381287, + "grad_norm": 1.1941794157028198, + "learning_rate": 1.7186275293515724e-05, + "loss": 1.9017, + "mean_token_accuracy": 0.5722191333770752, + "num_tokens": 2658052837.0, + "step": 5197 + }, + { + "epoch": 1.4056246619794484, + "grad_norm": 1.593612551689148, + "learning_rate": 1.7185130173723365e-05, + "loss": 2.0452, + "mean_token_accuracy": 0.572187066078186, + "num_tokens": 2658422953.0, + "step": 5198 + }, + { + "epoch": 1.405895078420768, + "grad_norm": 2.046386957168579, + "learning_rate": 1.7183984864153024e-05, + "loss": 2.0471, + "mean_token_accuracy": 0.5640102624893188, + "num_tokens": 2658905203.0, + "step": 5199 + }, + { + "epoch": 1.4061654948620876, + "grad_norm": 1.3996403217315674, + "learning_rate": 1.7182839364839843e-05, + "loss": 1.9828, + "mean_token_accuracy": 0.5296878218650818, + "num_tokens": 2659429463.0, + "step": 5200 + }, + { + "epoch": 1.4064359113034073, + "grad_norm": 0.6267055869102478, + "learning_rate": 1.7181693675818965e-05, + "loss": 1.0366, + "mean_token_accuracy": 0.7175127267837524, + "num_tokens": 2659953645.0, + "step": 5201 + }, + { + "epoch": 1.406706327744727, + "grad_norm": 2.148069381713867, + "learning_rate": 1.7180547797125544e-05, + "loss": 2.0652, + "mean_token_accuracy": 0.5444844961166382, + "num_tokens": 2660477923.0, + "step": 5202 + }, + { + "epoch": 1.4069767441860466, + "grad_norm": 1.751847743988037, + "learning_rate": 1.7179401728794744e-05, + "loss": 1.9508, + "mean_token_accuracy": 0.5718823671340942, + "num_tokens": 2660989263.0, + "step": 5203 + }, + { + "epoch": 1.4072471606273662, + "grad_norm": 1.2023382186889648, + "learning_rate": 1.7178255470861724e-05, + "loss": 1.9818, + "mean_token_accuracy": 0.5570939779281616, + "num_tokens": 2661513531.0, + "step": 5204 + }, + { + "epoch": 1.4075175770686859, + "grad_norm": 1.5230306386947632, + "learning_rate": 1.717710902336166e-05, + "loss": 1.9176, + "mean_token_accuracy": 0.5635668039321899, + "num_tokens": 2662037585.0, + "step": 5205 + }, + { + "epoch": 1.4077879935100053, + "grad_norm": 1.4169138669967651, + "learning_rate": 1.7175962386329724e-05, + "loss": 2.0145, + "mean_token_accuracy": 0.5361631512641907, + "num_tokens": 2662561868.0, + "step": 5206 + }, + { + "epoch": 1.408058409951325, + "grad_norm": 1.5271961688995361, + "learning_rate": 1.7174815559801098e-05, + "loss": 2.0562, + "mean_token_accuracy": 0.5539105534553528, + "num_tokens": 2663026024.0, + "step": 5207 + }, + { + "epoch": 1.4083288263926446, + "grad_norm": 1.0618654489517212, + "learning_rate": 1.7173668543810972e-05, + "loss": 2.0088, + "mean_token_accuracy": 0.5468868017196655, + "num_tokens": 2663550202.0, + "step": 5208 + }, + { + "epoch": 1.4085992428339642, + "grad_norm": 1.2477707862854004, + "learning_rate": 1.7172521338394544e-05, + "loss": 1.9358, + "mean_token_accuracy": 0.547792911529541, + "num_tokens": 2664040215.0, + "step": 5209 + }, + { + "epoch": 1.4088696592752838, + "grad_norm": 1.1894066333770752, + "learning_rate": 1.7171373943587004e-05, + "loss": 1.8768, + "mean_token_accuracy": 0.5567085146903992, + "num_tokens": 2664564294.0, + "step": 5210 + }, + { + "epoch": 1.4091400757166035, + "grad_norm": 1.5181527137756348, + "learning_rate": 1.7170226359423566e-05, + "loss": 2.1575, + "mean_token_accuracy": 0.5142296552658081, + "num_tokens": 2665088516.0, + "step": 5211 + }, + { + "epoch": 1.4094104921579231, + "grad_norm": 1.3473089933395386, + "learning_rate": 1.7169078585939437e-05, + "loss": 2.0438, + "mean_token_accuracy": 0.5532251596450806, + "num_tokens": 2665612738.0, + "step": 5212 + }, + { + "epoch": 1.4096809085992428, + "grad_norm": 1.4927583932876587, + "learning_rate": 1.716793062316983e-05, + "loss": 2.0282, + "mean_token_accuracy": 0.5414127111434937, + "num_tokens": 2666136867.0, + "step": 5213 + }, + { + "epoch": 1.4099513250405624, + "grad_norm": 1.4112963676452637, + "learning_rate": 1.7166782471149982e-05, + "loss": 2.0824, + "mean_token_accuracy": 0.5447803735733032, + "num_tokens": 2666661045.0, + "step": 5214 + }, + { + "epoch": 1.410221741481882, + "grad_norm": 1.40534508228302, + "learning_rate": 1.7165634129915108e-05, + "loss": 2.1256, + "mean_token_accuracy": 0.5142188668251038, + "num_tokens": 2667185224.0, + "step": 5215 + }, + { + "epoch": 1.4104921579232017, + "grad_norm": 1.263533592224121, + "learning_rate": 1.7164485599500447e-05, + "loss": 2.0144, + "mean_token_accuracy": 0.5521014928817749, + "num_tokens": 2667693262.0, + "step": 5216 + }, + { + "epoch": 1.4107625743645213, + "grad_norm": 1.194168210029602, + "learning_rate": 1.7163336879941235e-05, + "loss": 2.0476, + "mean_token_accuracy": 0.5448112487792969, + "num_tokens": 2668165250.0, + "step": 5217 + }, + { + "epoch": 1.411032990805841, + "grad_norm": 1.382216453552246, + "learning_rate": 1.716218797127273e-05, + "loss": 2.0571, + "mean_token_accuracy": 0.5395063161849976, + "num_tokens": 2668683837.0, + "step": 5218 + }, + { + "epoch": 1.4113034072471606, + "grad_norm": 1.2863469123840332, + "learning_rate": 1.7161038873530172e-05, + "loss": 2.0425, + "mean_token_accuracy": 0.5359971523284912, + "num_tokens": 2669208089.0, + "step": 5219 + }, + { + "epoch": 1.4115738236884803, + "grad_norm": 1.252807378768921, + "learning_rate": 1.7159889586748824e-05, + "loss": 1.9692, + "mean_token_accuracy": 0.5551077127456665, + "num_tokens": 2669732365.0, + "step": 5220 + }, + { + "epoch": 1.4118442401298, + "grad_norm": 0.6753035187721252, + "learning_rate": 1.7158740110963944e-05, + "loss": 1.1736, + "mean_token_accuracy": 0.6771558523178101, + "num_tokens": 2670256617.0, + "step": 5221 + }, + { + "epoch": 1.4121146565711196, + "grad_norm": 1.807808518409729, + "learning_rate": 1.7157590446210813e-05, + "loss": 2.0529, + "mean_token_accuracy": 0.5480819940567017, + "num_tokens": 2670780791.0, + "step": 5222 + }, + { + "epoch": 1.4123850730124392, + "grad_norm": 1.346610188484192, + "learning_rate": 1.7156440592524693e-05, + "loss": 2.0239, + "mean_token_accuracy": 0.5350481867790222, + "num_tokens": 2671305043.0, + "step": 5223 + }, + { + "epoch": 1.4126554894537589, + "grad_norm": 1.4275254011154175, + "learning_rate": 1.7155290549940872e-05, + "loss": 2.1033, + "mean_token_accuracy": 0.5366083979606628, + "num_tokens": 2671829272.0, + "step": 5224 + }, + { + "epoch": 1.4129259058950785, + "grad_norm": 1.1601226329803467, + "learning_rate": 1.7154140318494636e-05, + "loss": 2.0229, + "mean_token_accuracy": 0.5644951462745667, + "num_tokens": 2672289327.0, + "step": 5225 + }, + { + "epoch": 1.4131963223363981, + "grad_norm": 1.2068514823913574, + "learning_rate": 1.715298989822128e-05, + "loss": 1.8664, + "mean_token_accuracy": 0.5811410546302795, + "num_tokens": 2672813358.0, + "step": 5226 + }, + { + "epoch": 1.4134667387777178, + "grad_norm": 1.2396631240844727, + "learning_rate": 1.7151839289156097e-05, + "loss": 1.9395, + "mean_token_accuracy": 0.5685123801231384, + "num_tokens": 2673247192.0, + "step": 5227 + }, + { + "epoch": 1.4137371552190374, + "grad_norm": 1.481613278388977, + "learning_rate": 1.7150688491334392e-05, + "loss": 2.0011, + "mean_token_accuracy": 0.5524661540985107, + "num_tokens": 2673771207.0, + "step": 5228 + }, + { + "epoch": 1.414007571660357, + "grad_norm": 1.3054190874099731, + "learning_rate": 1.7149537504791477e-05, + "loss": 2.0031, + "mean_token_accuracy": 0.5456441640853882, + "num_tokens": 2674295440.0, + "step": 5229 + }, + { + "epoch": 1.4142779881016767, + "grad_norm": 1.5353096723556519, + "learning_rate": 1.7148386329562666e-05, + "loss": 1.9861, + "mean_token_accuracy": 0.5523357391357422, + "num_tokens": 2674819590.0, + "step": 5230 + }, + { + "epoch": 1.4145484045429964, + "grad_norm": 1.194365382194519, + "learning_rate": 1.714723496568328e-05, + "loss": 2.0031, + "mean_token_accuracy": 0.557992696762085, + "num_tokens": 2675343867.0, + "step": 5231 + }, + { + "epoch": 1.4148188209843158, + "grad_norm": 1.2751530408859253, + "learning_rate": 1.714608341318865e-05, + "loss": 2.0013, + "mean_token_accuracy": 0.547463059425354, + "num_tokens": 2675868065.0, + "step": 5232 + }, + { + "epoch": 1.4150892374256354, + "grad_norm": 1.2414523363113403, + "learning_rate": 1.7144931672114105e-05, + "loss": 1.9781, + "mean_token_accuracy": 0.5587921738624573, + "num_tokens": 2676392318.0, + "step": 5233 + }, + { + "epoch": 1.415359653866955, + "grad_norm": 1.4135698080062866, + "learning_rate": 1.7143779742494986e-05, + "loss": 2.0135, + "mean_token_accuracy": 0.5367900729179382, + "num_tokens": 2676916500.0, + "step": 5234 + }, + { + "epoch": 1.4156300703082747, + "grad_norm": 1.248170256614685, + "learning_rate": 1.7142627624366635e-05, + "loss": 2.0048, + "mean_token_accuracy": 0.5502927303314209, + "num_tokens": 2677440676.0, + "step": 5235 + }, + { + "epoch": 1.4159004867495943, + "grad_norm": 1.2052100896835327, + "learning_rate": 1.714147531776441e-05, + "loss": 2.0092, + "mean_token_accuracy": 0.5326017737388611, + "num_tokens": 2677961231.0, + "step": 5236 + }, + { + "epoch": 1.416170903190914, + "grad_norm": 1.4911412000656128, + "learning_rate": 1.7140322822723653e-05, + "loss": 2.0719, + "mean_token_accuracy": 0.5408673286437988, + "num_tokens": 2678431250.0, + "step": 5237 + }, + { + "epoch": 1.4164413196322336, + "grad_norm": 1.5565377473831177, + "learning_rate": 1.713917013927974e-05, + "loss": 2.0178, + "mean_token_accuracy": 0.5400419235229492, + "num_tokens": 2678955446.0, + "step": 5238 + }, + { + "epoch": 1.4167117360735533, + "grad_norm": 1.6411763429641724, + "learning_rate": 1.713801726746803e-05, + "loss": 2.0893, + "mean_token_accuracy": 0.5155582427978516, + "num_tokens": 2679479592.0, + "step": 5239 + }, + { + "epoch": 1.416982152514873, + "grad_norm": 1.8015110492706299, + "learning_rate": 1.71368642073239e-05, + "loss": 2.103, + "mean_token_accuracy": 0.5215965509414673, + "num_tokens": 2680003816.0, + "step": 5240 + }, + { + "epoch": 1.4172525689561926, + "grad_norm": 0.6112045049667358, + "learning_rate": 1.7135710958882733e-05, + "loss": 1.082, + "mean_token_accuracy": 0.7108233571052551, + "num_tokens": 2680528007.0, + "step": 5241 + }, + { + "epoch": 1.4175229853975122, + "grad_norm": 2.6472623348236084, + "learning_rate": 1.7134557522179904e-05, + "loss": 2.0428, + "mean_token_accuracy": 0.5485525727272034, + "num_tokens": 2681052266.0, + "step": 5242 + }, + { + "epoch": 1.4177934018388318, + "grad_norm": 2.5199503898620605, + "learning_rate": 1.7133403897250813e-05, + "loss": 1.9697, + "mean_token_accuracy": 0.5379994511604309, + "num_tokens": 2681576394.0, + "step": 5243 + }, + { + "epoch": 1.4180638182801515, + "grad_norm": 1.5345743894577026, + "learning_rate": 1.713225008413085e-05, + "loss": 1.9989, + "mean_token_accuracy": 0.5337189435958862, + "num_tokens": 2682100581.0, + "step": 5244 + }, + { + "epoch": 1.4183342347214711, + "grad_norm": 1.8754405975341797, + "learning_rate": 1.7131096082855426e-05, + "loss": 2.002, + "mean_token_accuracy": 0.5472785234451294, + "num_tokens": 2682624856.0, + "step": 5245 + }, + { + "epoch": 1.4186046511627908, + "grad_norm": 1.8720701932907104, + "learning_rate": 1.712994189345994e-05, + "loss": 2.0883, + "mean_token_accuracy": 0.540632963180542, + "num_tokens": 2683149042.0, + "step": 5246 + }, + { + "epoch": 1.4188750676041102, + "grad_norm": 1.8847538232803345, + "learning_rate": 1.712878751597981e-05, + "loss": 1.9378, + "mean_token_accuracy": 0.5559067130088806, + "num_tokens": 2683673187.0, + "step": 5247 + }, + { + "epoch": 1.4191454840454298, + "grad_norm": 1.829426646232605, + "learning_rate": 1.712763295045046e-05, + "loss": 1.9973, + "mean_token_accuracy": 0.5358401536941528, + "num_tokens": 2684197414.0, + "step": 5248 + }, + { + "epoch": 1.4194159004867495, + "grad_norm": 2.1551568508148193, + "learning_rate": 1.7126478196907302e-05, + "loss": 1.8532, + "mean_token_accuracy": 0.5697014927864075, + "num_tokens": 2684721657.0, + "step": 5249 + }, + { + "epoch": 1.4196863169280691, + "grad_norm": 1.9908428192138672, + "learning_rate": 1.7125323255385783e-05, + "loss": 2.061, + "mean_token_accuracy": 0.5393725633621216, + "num_tokens": 2685245940.0, + "step": 5250 + }, + { + "epoch": 1.4199567333693888, + "grad_norm": 1.5717676877975464, + "learning_rate": 1.712416812592133e-05, + "loss": 2.0192, + "mean_token_accuracy": 0.5488683581352234, + "num_tokens": 2685770179.0, + "step": 5251 + }, + { + "epoch": 1.4202271498107084, + "grad_norm": 2.593398094177246, + "learning_rate": 1.7123012808549392e-05, + "loss": 2.1045, + "mean_token_accuracy": 0.540575385093689, + "num_tokens": 2686294441.0, + "step": 5252 + }, + { + "epoch": 1.420497566252028, + "grad_norm": 2.03765869140625, + "learning_rate": 1.712185730330541e-05, + "loss": 1.9423, + "mean_token_accuracy": 0.5461143851280212, + "num_tokens": 2686818614.0, + "step": 5253 + }, + { + "epoch": 1.4207679826933477, + "grad_norm": 1.8670066595077515, + "learning_rate": 1.7120701610224846e-05, + "loss": 2.1368, + "mean_token_accuracy": 0.5538981556892395, + "num_tokens": 2687269912.0, + "step": 5254 + }, + { + "epoch": 1.4210383991346673, + "grad_norm": 1.721900463104248, + "learning_rate": 1.7119545729343158e-05, + "loss": 1.9583, + "mean_token_accuracy": 0.5580105185508728, + "num_tokens": 2687794089.0, + "step": 5255 + }, + { + "epoch": 1.421308815575987, + "grad_norm": 1.5018551349639893, + "learning_rate": 1.7118389660695808e-05, + "loss": 1.9847, + "mean_token_accuracy": 0.5611640810966492, + "num_tokens": 2688318308.0, + "step": 5256 + }, + { + "epoch": 1.4215792320173066, + "grad_norm": 1.8256051540374756, + "learning_rate": 1.7117233404318268e-05, + "loss": 2.0267, + "mean_token_accuracy": 0.5398190021514893, + "num_tokens": 2688837940.0, + "step": 5257 + }, + { + "epoch": 1.4218496484586263, + "grad_norm": 1.5298691987991333, + "learning_rate": 1.711607696024602e-05, + "loss": 2.0086, + "mean_token_accuracy": 0.5565600991249084, + "num_tokens": 2689362052.0, + "step": 5258 + }, + { + "epoch": 1.422120064899946, + "grad_norm": 1.4732763767242432, + "learning_rate": 1.7114920328514545e-05, + "loss": 1.7806, + "mean_token_accuracy": 0.5857664346694946, + "num_tokens": 2689886042.0, + "step": 5259 + }, + { + "epoch": 1.4223904813412656, + "grad_norm": 1.4544936418533325, + "learning_rate": 1.7113763509159332e-05, + "loss": 1.9138, + "mean_token_accuracy": 0.5606671571731567, + "num_tokens": 2690410318.0, + "step": 5260 + }, + { + "epoch": 1.4226608977825852, + "grad_norm": 0.6699237823486328, + "learning_rate": 1.7112606502215876e-05, + "loss": 1.1067, + "mean_token_accuracy": 0.700243353843689, + "num_tokens": 2690934476.0, + "step": 5261 + }, + { + "epoch": 1.4229313142239048, + "grad_norm": 1.936118483543396, + "learning_rate": 1.7111449307719674e-05, + "loss": 2.0416, + "mean_token_accuracy": 0.5485675930976868, + "num_tokens": 2691458643.0, + "step": 5262 + }, + { + "epoch": 1.4232017306652245, + "grad_norm": 1.5701653957366943, + "learning_rate": 1.7110291925706238e-05, + "loss": 1.9336, + "mean_token_accuracy": 0.5494484901428223, + "num_tokens": 2691982810.0, + "step": 5263 + }, + { + "epoch": 1.4234721471065441, + "grad_norm": 1.3592267036437988, + "learning_rate": 1.710913435621107e-05, + "loss": 2.0774, + "mean_token_accuracy": 0.5280078649520874, + "num_tokens": 2692507079.0, + "step": 5264 + }, + { + "epoch": 1.4237425635478638, + "grad_norm": 1.6282196044921875, + "learning_rate": 1.71079765992697e-05, + "loss": 2.0127, + "mean_token_accuracy": 0.5500873327255249, + "num_tokens": 2692992926.0, + "step": 5265 + }, + { + "epoch": 1.4240129799891834, + "grad_norm": 1.6113510131835938, + "learning_rate": 1.7106818654917645e-05, + "loss": 1.8648, + "mean_token_accuracy": 0.5614615678787231, + "num_tokens": 2693485314.0, + "step": 5266 + }, + { + "epoch": 1.424283396430503, + "grad_norm": 1.837146520614624, + "learning_rate": 1.7105660523190434e-05, + "loss": 2.0079, + "mean_token_accuracy": 0.5626800060272217, + "num_tokens": 2694009420.0, + "step": 5267 + }, + { + "epoch": 1.4245538128718227, + "grad_norm": 1.4577960968017578, + "learning_rate": 1.7104502204123604e-05, + "loss": 1.9856, + "mean_token_accuracy": 0.5660891532897949, + "num_tokens": 2694523592.0, + "step": 5268 + }, + { + "epoch": 1.4248242293131423, + "grad_norm": 2.0296528339385986, + "learning_rate": 1.710334369775269e-05, + "loss": 2.0236, + "mean_token_accuracy": 0.5581262111663818, + "num_tokens": 2695019924.0, + "step": 5269 + }, + { + "epoch": 1.425094645754462, + "grad_norm": 1.7316083908081055, + "learning_rate": 1.7102185004113245e-05, + "loss": 1.9785, + "mean_token_accuracy": 0.5353928208351135, + "num_tokens": 2695544099.0, + "step": 5270 + }, + { + "epoch": 1.4253650621957816, + "grad_norm": 1.4957393407821655, + "learning_rate": 1.710102612324082e-05, + "loss": 1.9479, + "mean_token_accuracy": 0.547612190246582, + "num_tokens": 2696068378.0, + "step": 5271 + }, + { + "epoch": 1.4256354786371013, + "grad_norm": 1.7432597875595093, + "learning_rate": 1.709986705517097e-05, + "loss": 2.0863, + "mean_token_accuracy": 0.5372316837310791, + "num_tokens": 2696592646.0, + "step": 5272 + }, + { + "epoch": 1.4259058950784207, + "grad_norm": 2.010143756866455, + "learning_rate": 1.709870779993926e-05, + "loss": 1.9664, + "mean_token_accuracy": 0.5494493246078491, + "num_tokens": 2697116930.0, + "step": 5273 + }, + { + "epoch": 1.4261763115197403, + "grad_norm": 1.8018168210983276, + "learning_rate": 1.709754835758126e-05, + "loss": 2.0105, + "mean_token_accuracy": 0.5550464391708374, + "num_tokens": 2697641118.0, + "step": 5274 + }, + { + "epoch": 1.42644672796106, + "grad_norm": 1.8012970685958862, + "learning_rate": 1.7096388728132546e-05, + "loss": 2.063, + "mean_token_accuracy": 0.5443891286849976, + "num_tokens": 2698165317.0, + "step": 5275 + }, + { + "epoch": 1.4267171444023796, + "grad_norm": 1.5628629922866821, + "learning_rate": 1.7095228911628695e-05, + "loss": 2.0241, + "mean_token_accuracy": 0.5396887063980103, + "num_tokens": 2698669117.0, + "step": 5276 + }, + { + "epoch": 1.4269875608436993, + "grad_norm": 1.518322229385376, + "learning_rate": 1.70940689081053e-05, + "loss": 2.0625, + "mean_token_accuracy": 0.5192337036132812, + "num_tokens": 2699193386.0, + "step": 5277 + }, + { + "epoch": 1.427257977285019, + "grad_norm": 1.886398196220398, + "learning_rate": 1.7092908717597943e-05, + "loss": 1.9467, + "mean_token_accuracy": 0.5567395687103271, + "num_tokens": 2699717606.0, + "step": 5278 + }, + { + "epoch": 1.4275283937263386, + "grad_norm": 2.1050825119018555, + "learning_rate": 1.7091748340142234e-05, + "loss": 2.1003, + "mean_token_accuracy": 0.5356454849243164, + "num_tokens": 2700241777.0, + "step": 5279 + }, + { + "epoch": 1.4277988101676582, + "grad_norm": 1.388290286064148, + "learning_rate": 1.7090587775773766e-05, + "loss": 1.9119, + "mean_token_accuracy": 0.5664083361625671, + "num_tokens": 2700734574.0, + "step": 5280 + }, + { + "epoch": 1.4280692266089778, + "grad_norm": 1.5543001890182495, + "learning_rate": 1.7089427024528163e-05, + "loss": 1.1849, + "mean_token_accuracy": 0.6949284076690674, + "num_tokens": 2701210302.0, + "step": 5281 + }, + { + "epoch": 1.4283396430502975, + "grad_norm": 2.3723549842834473, + "learning_rate": 1.708826608644102e-05, + "loss": 2.0401, + "mean_token_accuracy": 0.5372026562690735, + "num_tokens": 2701734567.0, + "step": 5282 + }, + { + "epoch": 1.4286100594916171, + "grad_norm": 2.1097800731658936, + "learning_rate": 1.7087104961547975e-05, + "loss": 2.0091, + "mean_token_accuracy": 0.5538296699523926, + "num_tokens": 2702258735.0, + "step": 5283 + }, + { + "epoch": 1.4288804759329368, + "grad_norm": 1.248408555984497, + "learning_rate": 1.708594364988465e-05, + "loss": 1.9046, + "mean_token_accuracy": 0.5363962054252625, + "num_tokens": 2702783000.0, + "step": 5284 + }, + { + "epoch": 1.4291508923742564, + "grad_norm": 1.533756971359253, + "learning_rate": 1.7084782151486673e-05, + "loss": 2.0137, + "mean_token_accuracy": 0.5460292100906372, + "num_tokens": 2703307098.0, + "step": 5285 + }, + { + "epoch": 1.429421308815576, + "grad_norm": 1.8440673351287842, + "learning_rate": 1.7083620466389688e-05, + "loss": 2.0272, + "mean_token_accuracy": 0.5319271087646484, + "num_tokens": 2703776976.0, + "step": 5286 + }, + { + "epoch": 1.4296917252568957, + "grad_norm": 1.373758316040039, + "learning_rate": 1.7082458594629336e-05, + "loss": 2.0044, + "mean_token_accuracy": 0.5619955658912659, + "num_tokens": 2704301002.0, + "step": 5287 + }, + { + "epoch": 1.4299621416982151, + "grad_norm": 1.399326205253601, + "learning_rate": 1.7081296536241266e-05, + "loss": 1.8573, + "mean_token_accuracy": 0.5731407403945923, + "num_tokens": 2704825247.0, + "step": 5288 + }, + { + "epoch": 1.4302325581395348, + "grad_norm": 1.2851687669754028, + "learning_rate": 1.7080134291261134e-05, + "loss": 2.0858, + "mean_token_accuracy": 0.5575791597366333, + "num_tokens": 2705250466.0, + "step": 5289 + }, + { + "epoch": 1.4305029745808544, + "grad_norm": 1.6136775016784668, + "learning_rate": 1.70789718597246e-05, + "loss": 2.0912, + "mean_token_accuracy": 0.5416634678840637, + "num_tokens": 2705750151.0, + "step": 5290 + }, + { + "epoch": 1.430773391022174, + "grad_norm": 1.3781896829605103, + "learning_rate": 1.707780924166734e-05, + "loss": 2.0113, + "mean_token_accuracy": 0.5471014976501465, + "num_tokens": 2706274337.0, + "step": 5291 + }, + { + "epoch": 1.4310438074634937, + "grad_norm": 1.8761379718780518, + "learning_rate": 1.7076646437125008e-05, + "loss": 1.998, + "mean_token_accuracy": 0.5682231783866882, + "num_tokens": 2706743987.0, + "step": 5292 + }, + { + "epoch": 1.4313142239048133, + "grad_norm": 1.4917658567428589, + "learning_rate": 1.70754834461333e-05, + "loss": 2.0828, + "mean_token_accuracy": 0.5269110202789307, + "num_tokens": 2707268268.0, + "step": 5293 + }, + { + "epoch": 1.431584640346133, + "grad_norm": 1.3293797969818115, + "learning_rate": 1.7074320268727892e-05, + "loss": 2.0517, + "mean_token_accuracy": 0.5490567684173584, + "num_tokens": 2707792446.0, + "step": 5294 + }, + { + "epoch": 1.4318550567874526, + "grad_norm": 1.5660618543624878, + "learning_rate": 1.7073156904944472e-05, + "loss": 2.0428, + "mean_token_accuracy": 0.5477708578109741, + "num_tokens": 2708294947.0, + "step": 5295 + }, + { + "epoch": 1.4321254732287723, + "grad_norm": 1.423337459564209, + "learning_rate": 1.707199335481874e-05, + "loss": 2.0585, + "mean_token_accuracy": 0.5487664341926575, + "num_tokens": 2708805453.0, + "step": 5296 + }, + { + "epoch": 1.432395889670092, + "grad_norm": 1.215409278869629, + "learning_rate": 1.7070829618386397e-05, + "loss": 1.9735, + "mean_token_accuracy": 0.5499254465103149, + "num_tokens": 2709329549.0, + "step": 5297 + }, + { + "epoch": 1.4326663061114115, + "grad_norm": 1.2280519008636475, + "learning_rate": 1.7069665695683143e-05, + "loss": 2.0141, + "mean_token_accuracy": 0.5460723638534546, + "num_tokens": 2709853826.0, + "step": 5298 + }, + { + "epoch": 1.4329367225527312, + "grad_norm": 1.3998479843139648, + "learning_rate": 1.70685015867447e-05, + "loss": 2.0797, + "mean_token_accuracy": 0.5492755770683289, + "num_tokens": 2710368081.0, + "step": 5299 + }, + { + "epoch": 1.4332071389940508, + "grad_norm": 1.088105320930481, + "learning_rate": 1.706733729160678e-05, + "loss": 1.934, + "mean_token_accuracy": 0.5570741891860962, + "num_tokens": 2710888693.0, + "step": 5300 + }, + { + "epoch": 1.4334775554353705, + "grad_norm": 1.7728955745697021, + "learning_rate": 1.706617281030511e-05, + "loss": 1.1956, + "mean_token_accuracy": 0.6856479644775391, + "num_tokens": 2711387838.0, + "step": 5301 + }, + { + "epoch": 1.4337479718766901, + "grad_norm": 1.9614697694778442, + "learning_rate": 1.7065008142875422e-05, + "loss": 2.1253, + "mean_token_accuracy": 0.5192012786865234, + "num_tokens": 2711912074.0, + "step": 5302 + }, + { + "epoch": 1.4340183883180098, + "grad_norm": 1.7247928380966187, + "learning_rate": 1.7063843289353443e-05, + "loss": 2.066, + "mean_token_accuracy": 0.5591802597045898, + "num_tokens": 2712396750.0, + "step": 5303 + }, + { + "epoch": 1.4342888047593294, + "grad_norm": 1.2193158864974976, + "learning_rate": 1.7062678249774924e-05, + "loss": 1.9357, + "mean_token_accuracy": 0.5534819960594177, + "num_tokens": 2712920958.0, + "step": 5304 + }, + { + "epoch": 1.434559221200649, + "grad_norm": 2.5636918544769287, + "learning_rate": 1.7061513024175602e-05, + "loss": 1.736, + "mean_token_accuracy": 0.6053858399391174, + "num_tokens": 2713445220.0, + "step": 5305 + }, + { + "epoch": 1.4348296376419687, + "grad_norm": 2.105320692062378, + "learning_rate": 1.7060347612591238e-05, + "loss": 2.0345, + "mean_token_accuracy": 0.5520402789115906, + "num_tokens": 2713884126.0, + "step": 5306 + }, + { + "epoch": 1.4351000540832883, + "grad_norm": 1.7799631357192993, + "learning_rate": 1.7059182015057584e-05, + "loss": 1.9814, + "mean_token_accuracy": 0.565349817276001, + "num_tokens": 2714408306.0, + "step": 5307 + }, + { + "epoch": 1.435370470524608, + "grad_norm": 1.429175853729248, + "learning_rate": 1.705801623161041e-05, + "loss": 2.1002, + "mean_token_accuracy": 0.5335009098052979, + "num_tokens": 2714932569.0, + "step": 5308 + }, + { + "epoch": 1.4356408869659276, + "grad_norm": 1.3611130714416504, + "learning_rate": 1.7056850262285483e-05, + "loss": 1.953, + "mean_token_accuracy": 0.5561880469322205, + "num_tokens": 2715456774.0, + "step": 5309 + }, + { + "epoch": 1.4359113034072473, + "grad_norm": 1.4022465944290161, + "learning_rate": 1.7055684107118575e-05, + "loss": 2.0246, + "mean_token_accuracy": 0.5413538217544556, + "num_tokens": 2715981048.0, + "step": 5310 + }, + { + "epoch": 1.436181719848567, + "grad_norm": 1.436883807182312, + "learning_rate": 1.7054517766145472e-05, + "loss": 2.1785, + "mean_token_accuracy": 0.49074265360832214, + "num_tokens": 2716505111.0, + "step": 5311 + }, + { + "epoch": 1.4364521362898865, + "grad_norm": 1.1969735622406006, + "learning_rate": 1.7053351239401958e-05, + "loss": 1.6082, + "mean_token_accuracy": 0.5996145606040955, + "num_tokens": 2717029232.0, + "step": 5312 + }, + { + "epoch": 1.4367225527312062, + "grad_norm": 1.2738863229751587, + "learning_rate": 1.7052184526923827e-05, + "loss": 1.843, + "mean_token_accuracy": 0.5867368578910828, + "num_tokens": 2717552344.0, + "step": 5313 + }, + { + "epoch": 1.4369929691725256, + "grad_norm": 1.3468966484069824, + "learning_rate": 1.7051017628746875e-05, + "loss": 1.864, + "mean_token_accuracy": 0.5711339712142944, + "num_tokens": 2718076562.0, + "step": 5314 + }, + { + "epoch": 1.4372633856138453, + "grad_norm": 1.2875808477401733, + "learning_rate": 1.7049850544906907e-05, + "loss": 1.9482, + "mean_token_accuracy": 0.5415083765983582, + "num_tokens": 2718589224.0, + "step": 5315 + }, + { + "epoch": 1.437533802055165, + "grad_norm": 1.2382910251617432, + "learning_rate": 1.7048683275439734e-05, + "loss": 2.1087, + "mean_token_accuracy": 0.5383232831954956, + "num_tokens": 2719113398.0, + "step": 5316 + }, + { + "epoch": 1.4378042184964845, + "grad_norm": 1.3318713903427124, + "learning_rate": 1.704751582038117e-05, + "loss": 2.0345, + "mean_token_accuracy": 0.5420882105827332, + "num_tokens": 2719637507.0, + "step": 5317 + }, + { + "epoch": 1.4380746349378042, + "grad_norm": 1.2558561563491821, + "learning_rate": 1.7046348179767037e-05, + "loss": 1.8991, + "mean_token_accuracy": 0.5644669532775879, + "num_tokens": 2720094076.0, + "step": 5318 + }, + { + "epoch": 1.4383450513791238, + "grad_norm": 1.2807674407958984, + "learning_rate": 1.7045180353633158e-05, + "loss": 2.0341, + "mean_token_accuracy": 0.5534176826477051, + "num_tokens": 2720594043.0, + "step": 5319 + }, + { + "epoch": 1.4386154678204435, + "grad_norm": 1.2443616390228271, + "learning_rate": 1.704401234201537e-05, + "loss": 1.9713, + "mean_token_accuracy": 0.5480611324310303, + "num_tokens": 2721060442.0, + "step": 5320 + }, + { + "epoch": 1.4388858842617631, + "grad_norm": 1.6982827186584473, + "learning_rate": 1.704284414494951e-05, + "loss": 1.189, + "mean_token_accuracy": 0.6808116436004639, + "num_tokens": 2721569174.0, + "step": 5321 + }, + { + "epoch": 1.4391563007030828, + "grad_norm": 1.9408520460128784, + "learning_rate": 1.704167576247142e-05, + "loss": 2.0369, + "mean_token_accuracy": 0.5376276969909668, + "num_tokens": 2722093419.0, + "step": 5322 + }, + { + "epoch": 1.4394267171444024, + "grad_norm": 1.6358027458190918, + "learning_rate": 1.704050719461695e-05, + "loss": 2.0944, + "mean_token_accuracy": 0.5331878066062927, + "num_tokens": 2722617639.0, + "step": 5323 + }, + { + "epoch": 1.439697133585722, + "grad_norm": 1.4698421955108643, + "learning_rate": 1.7039338441421956e-05, + "loss": 2.1706, + "mean_token_accuracy": 0.5398294925689697, + "num_tokens": 2723141912.0, + "step": 5324 + }, + { + "epoch": 1.4399675500270417, + "grad_norm": 1.4962220191955566, + "learning_rate": 1.7038169502922296e-05, + "loss": 1.9481, + "mean_token_accuracy": 0.5395122170448303, + "num_tokens": 2723666089.0, + "step": 5325 + }, + { + "epoch": 1.4402379664683613, + "grad_norm": 1.3895701169967651, + "learning_rate": 1.7037000379153838e-05, + "loss": 2.0153, + "mean_token_accuracy": 0.5439314246177673, + "num_tokens": 2724156575.0, + "step": 5326 + }, + { + "epoch": 1.440508382909681, + "grad_norm": 1.5071619749069214, + "learning_rate": 1.7035831070152456e-05, + "loss": 2.0919, + "mean_token_accuracy": 0.5402047038078308, + "num_tokens": 2724664428.0, + "step": 5327 + }, + { + "epoch": 1.4407787993510006, + "grad_norm": 1.483674168586731, + "learning_rate": 1.7034661575954027e-05, + "loss": 1.9882, + "mean_token_accuracy": 0.557429313659668, + "num_tokens": 2725138009.0, + "step": 5328 + }, + { + "epoch": 1.44104921579232, + "grad_norm": 1.3614468574523926, + "learning_rate": 1.7033491896594435e-05, + "loss": 2.0213, + "mean_token_accuracy": 0.5481319427490234, + "num_tokens": 2725662225.0, + "step": 5329 + }, + { + "epoch": 1.4413196322336397, + "grad_norm": 1.3681560754776, + "learning_rate": 1.703232203210957e-05, + "loss": 1.7569, + "mean_token_accuracy": 0.5847932696342468, + "num_tokens": 2726141895.0, + "step": 5330 + }, + { + "epoch": 1.4415900486749593, + "grad_norm": 1.4331787824630737, + "learning_rate": 1.703115198253532e-05, + "loss": 1.9987, + "mean_token_accuracy": 0.5503950715065002, + "num_tokens": 2726666024.0, + "step": 5331 + }, + { + "epoch": 1.441860465116279, + "grad_norm": 1.7183054685592651, + "learning_rate": 1.702998174790759e-05, + "loss": 2.1292, + "mean_token_accuracy": 0.536089301109314, + "num_tokens": 2727164394.0, + "step": 5332 + }, + { + "epoch": 1.4421308815575986, + "grad_norm": 1.6443150043487549, + "learning_rate": 1.7028811328262295e-05, + "loss": 1.9746, + "mean_token_accuracy": 0.5552559494972229, + "num_tokens": 2727688592.0, + "step": 5333 + }, + { + "epoch": 1.4424012979989183, + "grad_norm": 1.4380091428756714, + "learning_rate": 1.7027640723635333e-05, + "loss": 2.0219, + "mean_token_accuracy": 0.53126060962677, + "num_tokens": 2728212843.0, + "step": 5334 + }, + { + "epoch": 1.442671714440238, + "grad_norm": 1.6105719804763794, + "learning_rate": 1.7026469934062627e-05, + "loss": 2.1142, + "mean_token_accuracy": 0.5649409294128418, + "num_tokens": 2728674590.0, + "step": 5335 + }, + { + "epoch": 1.4429421308815575, + "grad_norm": 1.2634429931640625, + "learning_rate": 1.70252989595801e-05, + "loss": 1.9595, + "mean_token_accuracy": 0.564956784248352, + "num_tokens": 2729163330.0, + "step": 5336 + }, + { + "epoch": 1.4432125473228772, + "grad_norm": 1.4981017112731934, + "learning_rate": 1.7024127800223683e-05, + "loss": 2.0494, + "mean_token_accuracy": 0.5363849401473999, + "num_tokens": 2729687488.0, + "step": 5337 + }, + { + "epoch": 1.4434829637641968, + "grad_norm": 1.2443407773971558, + "learning_rate": 1.7022956456029314e-05, + "loss": 2.1484, + "mean_token_accuracy": 0.5243215560913086, + "num_tokens": 2730211621.0, + "step": 5338 + }, + { + "epoch": 1.4437533802055165, + "grad_norm": 1.3250809907913208, + "learning_rate": 1.7021784927032925e-05, + "loss": 1.9242, + "mean_token_accuracy": 0.564579963684082, + "num_tokens": 2730735748.0, + "step": 5339 + }, + { + "epoch": 1.444023796646836, + "grad_norm": 1.1987744569778442, + "learning_rate": 1.7020613213270465e-05, + "loss": 1.9993, + "mean_token_accuracy": 0.5467283725738525, + "num_tokens": 2731215210.0, + "step": 5340 + }, + { + "epoch": 1.4442942130881558, + "grad_norm": 0.584923505783081, + "learning_rate": 1.7019441314777888e-05, + "loss": 1.1192, + "mean_token_accuracy": 0.7026588916778564, + "num_tokens": 2731739407.0, + "step": 5341 + }, + { + "epoch": 1.4445646295294754, + "grad_norm": 1.872965931892395, + "learning_rate": 1.7018269231591142e-05, + "loss": 2.1771, + "mean_token_accuracy": 0.5237339735031128, + "num_tokens": 2732204581.0, + "step": 5342 + }, + { + "epoch": 1.444835045970795, + "grad_norm": 1.6736526489257812, + "learning_rate": 1.7017096963746206e-05, + "loss": 1.9169, + "mean_token_accuracy": 0.5602480173110962, + "num_tokens": 2732728795.0, + "step": 5343 + }, + { + "epoch": 1.4451054624121147, + "grad_norm": 1.2404069900512695, + "learning_rate": 1.7015924511279038e-05, + "loss": 1.9743, + "mean_token_accuracy": 0.5706448554992676, + "num_tokens": 2733197192.0, + "step": 5344 + }, + { + "epoch": 1.4453758788534343, + "grad_norm": 1.6075491905212402, + "learning_rate": 1.7014751874225613e-05, + "loss": 2.1607, + "mean_token_accuracy": 0.518836259841919, + "num_tokens": 2733721364.0, + "step": 5345 + }, + { + "epoch": 1.445646295294754, + "grad_norm": 1.2640873193740845, + "learning_rate": 1.7013579052621913e-05, + "loss": 1.845, + "mean_token_accuracy": 0.5689295530319214, + "num_tokens": 2734201019.0, + "step": 5346 + }, + { + "epoch": 1.4459167117360736, + "grad_norm": 1.3976362943649292, + "learning_rate": 1.701240604650392e-05, + "loss": 2.0728, + "mean_token_accuracy": 0.5339062213897705, + "num_tokens": 2734725219.0, + "step": 5347 + }, + { + "epoch": 1.4461871281773933, + "grad_norm": 1.242178201675415, + "learning_rate": 1.701123285590763e-05, + "loss": 1.9113, + "mean_token_accuracy": 0.5621514320373535, + "num_tokens": 2735249378.0, + "step": 5348 + }, + { + "epoch": 1.446457544618713, + "grad_norm": 1.153270959854126, + "learning_rate": 1.701005948086904e-05, + "loss": 2.0714, + "mean_token_accuracy": 0.5446813106536865, + "num_tokens": 2735773645.0, + "step": 5349 + }, + { + "epoch": 1.4467279610600325, + "grad_norm": 1.4031157493591309, + "learning_rate": 1.7008885921424153e-05, + "loss": 1.9289, + "mean_token_accuracy": 0.5642889738082886, + "num_tokens": 2736297805.0, + "step": 5350 + }, + { + "epoch": 1.4469983775013522, + "grad_norm": 1.2197136878967285, + "learning_rate": 1.700771217760897e-05, + "loss": 2.0369, + "mean_token_accuracy": 0.5355688333511353, + "num_tokens": 2736822043.0, + "step": 5351 + }, + { + "epoch": 1.4472687939426718, + "grad_norm": 1.744944453239441, + "learning_rate": 1.700653824945951e-05, + "loss": 1.9681, + "mean_token_accuracy": 0.5430208444595337, + "num_tokens": 2737346213.0, + "step": 5352 + }, + { + "epoch": 1.4475392103839915, + "grad_norm": 1.7354713678359985, + "learning_rate": 1.7005364137011797e-05, + "loss": 1.8983, + "mean_token_accuracy": 0.5662168264389038, + "num_tokens": 2737870367.0, + "step": 5353 + }, + { + "epoch": 1.4478096268253111, + "grad_norm": 1.4972550868988037, + "learning_rate": 1.7004189840301846e-05, + "loss": 2.0512, + "mean_token_accuracy": 0.5437266230583191, + "num_tokens": 2738394558.0, + "step": 5354 + }, + { + "epoch": 1.4480800432666305, + "grad_norm": 1.3400027751922607, + "learning_rate": 1.70030153593657e-05, + "loss": 2.0839, + "mean_token_accuracy": 0.543340265750885, + "num_tokens": 2738882046.0, + "step": 5355 + }, + { + "epoch": 1.4483504597079502, + "grad_norm": 1.35032320022583, + "learning_rate": 1.700184069423938e-05, + "loss": 2.0198, + "mean_token_accuracy": 0.5482317805290222, + "num_tokens": 2739406183.0, + "step": 5356 + }, + { + "epoch": 1.4486208761492698, + "grad_norm": 1.267232894897461, + "learning_rate": 1.7000665844958945e-05, + "loss": 1.9771, + "mean_token_accuracy": 0.5691715478897095, + "num_tokens": 2739874135.0, + "step": 5357 + }, + { + "epoch": 1.4488912925905895, + "grad_norm": 1.5748274326324463, + "learning_rate": 1.6999490811560432e-05, + "loss": 2.0169, + "mean_token_accuracy": 0.5567381381988525, + "num_tokens": 2740398158.0, + "step": 5358 + }, + { + "epoch": 1.449161709031909, + "grad_norm": 1.4302253723144531, + "learning_rate": 1.6998315594079897e-05, + "loss": 2.0675, + "mean_token_accuracy": 0.5422537326812744, + "num_tokens": 2740922306.0, + "step": 5359 + }, + { + "epoch": 1.4494321254732287, + "grad_norm": 1.3911030292510986, + "learning_rate": 1.6997140192553404e-05, + "loss": 2.1087, + "mean_token_accuracy": 0.5204300284385681, + "num_tokens": 2741446309.0, + "step": 5360 + }, + { + "epoch": 1.4497025419145484, + "grad_norm": 0.4316292405128479, + "learning_rate": 1.699596460701701e-05, + "loss": 0.987, + "mean_token_accuracy": 0.7406035661697388, + "num_tokens": 2741949703.0, + "step": 5361 + }, + { + "epoch": 1.449972958355868, + "grad_norm": 2.41709566116333, + "learning_rate": 1.6994788837506784e-05, + "loss": 2.0818, + "mean_token_accuracy": 0.5409365892410278, + "num_tokens": 2742473859.0, + "step": 5362 + }, + { + "epoch": 1.4502433747971877, + "grad_norm": 1.638909101486206, + "learning_rate": 1.6993612884058813e-05, + "loss": 2.0632, + "mean_token_accuracy": 0.5504093766212463, + "num_tokens": 2742998062.0, + "step": 5363 + }, + { + "epoch": 1.4505137912385073, + "grad_norm": 1.7950313091278076, + "learning_rate": 1.6992436746709174e-05, + "loss": 2.0212, + "mean_token_accuracy": 0.5450798273086548, + "num_tokens": 2743522189.0, + "step": 5364 + }, + { + "epoch": 1.450784207679827, + "grad_norm": 1.5097999572753906, + "learning_rate": 1.699126042549395e-05, + "loss": 1.8568, + "mean_token_accuracy": 0.578156590461731, + "num_tokens": 2744046301.0, + "step": 5365 + }, + { + "epoch": 1.4510546241211466, + "grad_norm": 1.5416284799575806, + "learning_rate": 1.6990083920449235e-05, + "loss": 2.065, + "mean_token_accuracy": 0.5355455875396729, + "num_tokens": 2744570575.0, + "step": 5366 + }, + { + "epoch": 1.4513250405624663, + "grad_norm": 1.4759050607681274, + "learning_rate": 1.6988907231611132e-05, + "loss": 2.1031, + "mean_token_accuracy": 0.5358004570007324, + "num_tokens": 2745094665.0, + "step": 5367 + }, + { + "epoch": 1.451595457003786, + "grad_norm": 1.3686286211013794, + "learning_rate": 1.6987730359015743e-05, + "loss": 2.0131, + "mean_token_accuracy": 0.5608669519424438, + "num_tokens": 2745589311.0, + "step": 5368 + }, + { + "epoch": 1.4518658734451055, + "grad_norm": 1.4808855056762695, + "learning_rate": 1.698655330269918e-05, + "loss": 1.9959, + "mean_token_accuracy": 0.5577290058135986, + "num_tokens": 2746091028.0, + "step": 5369 + }, + { + "epoch": 1.452136289886425, + "grad_norm": 1.4808040857315063, + "learning_rate": 1.6985376062697554e-05, + "loss": 2.0664, + "mean_token_accuracy": 0.5386863350868225, + "num_tokens": 2746615229.0, + "step": 5370 + }, + { + "epoch": 1.4524067063277446, + "grad_norm": 1.3166061639785767, + "learning_rate": 1.698419863904699e-05, + "loss": 2.0582, + "mean_token_accuracy": 0.5389423370361328, + "num_tokens": 2747139445.0, + "step": 5371 + }, + { + "epoch": 1.4526771227690642, + "grad_norm": 1.5059056282043457, + "learning_rate": 1.698302103178361e-05, + "loss": 2.001, + "mean_token_accuracy": 0.5490527153015137, + "num_tokens": 2747663728.0, + "step": 5372 + }, + { + "epoch": 1.4529475392103839, + "grad_norm": 1.3333067893981934, + "learning_rate": 1.6981843240943555e-05, + "loss": 1.9902, + "mean_token_accuracy": 0.5516735315322876, + "num_tokens": 2748187923.0, + "step": 5373 + }, + { + "epoch": 1.4532179556517035, + "grad_norm": 1.5794775485992432, + "learning_rate": 1.698066526656295e-05, + "loss": 2.0737, + "mean_token_accuracy": 0.5372262001037598, + "num_tokens": 2748712076.0, + "step": 5374 + }, + { + "epoch": 1.4534883720930232, + "grad_norm": 1.3939852714538574, + "learning_rate": 1.6979487108677956e-05, + "loss": 1.9246, + "mean_token_accuracy": 0.5626912117004395, + "num_tokens": 2749236121.0, + "step": 5375 + }, + { + "epoch": 1.4537587885343428, + "grad_norm": 1.3420385122299194, + "learning_rate": 1.6978308767324706e-05, + "loss": 2.0186, + "mean_token_accuracy": 0.5389900207519531, + "num_tokens": 2749760289.0, + "step": 5376 + }, + { + "epoch": 1.4540292049756625, + "grad_norm": 1.5160664319992065, + "learning_rate": 1.6977130242539366e-05, + "loss": 2.1382, + "mean_token_accuracy": 0.5174171924591064, + "num_tokens": 2750284503.0, + "step": 5377 + }, + { + "epoch": 1.454299621416982, + "grad_norm": 1.5787091255187988, + "learning_rate": 1.697595153435809e-05, + "loss": 1.9774, + "mean_token_accuracy": 0.5765680074691772, + "num_tokens": 2750792617.0, + "step": 5378 + }, + { + "epoch": 1.4545700378583017, + "grad_norm": 1.6100785732269287, + "learning_rate": 1.6974772642817046e-05, + "loss": 2.0017, + "mean_token_accuracy": 0.5350542068481445, + "num_tokens": 2751316817.0, + "step": 5379 + }, + { + "epoch": 1.4548404542996214, + "grad_norm": 1.530088186264038, + "learning_rate": 1.6973593567952414e-05, + "loss": 1.9546, + "mean_token_accuracy": 0.5378497838973999, + "num_tokens": 2751841054.0, + "step": 5380 + }, + { + "epoch": 1.455110870740941, + "grad_norm": 0.7327995300292969, + "learning_rate": 1.6972414309800354e-05, + "loss": 1.2437, + "mean_token_accuracy": 0.6787782907485962, + "num_tokens": 2752364365.0, + "step": 5381 + }, + { + "epoch": 1.4553812871822607, + "grad_norm": 1.917222261428833, + "learning_rate": 1.6971234868397064e-05, + "loss": 1.8897, + "mean_token_accuracy": 0.5339652299880981, + "num_tokens": 2752888627.0, + "step": 5382 + }, + { + "epoch": 1.4556517036235803, + "grad_norm": 4.000690937042236, + "learning_rate": 1.697005524377873e-05, + "loss": 1.7689, + "mean_token_accuracy": 0.6122453808784485, + "num_tokens": 2753353771.0, + "step": 5383 + }, + { + "epoch": 1.4559221200649, + "grad_norm": 1.5526419878005981, + "learning_rate": 1.696887543598154e-05, + "loss": 2.0211, + "mean_token_accuracy": 0.5531582832336426, + "num_tokens": 2753878013.0, + "step": 5384 + }, + { + "epoch": 1.4561925365062196, + "grad_norm": 1.2247380018234253, + "learning_rate": 1.69676954450417e-05, + "loss": 1.9718, + "mean_token_accuracy": 0.5550898313522339, + "num_tokens": 2754369429.0, + "step": 5385 + }, + { + "epoch": 1.4564629529475392, + "grad_norm": 1.7237752676010132, + "learning_rate": 1.6966515270995417e-05, + "loss": 2.11, + "mean_token_accuracy": 0.5433230400085449, + "num_tokens": 2754875726.0, + "step": 5386 + }, + { + "epoch": 1.456733369388859, + "grad_norm": 1.223889708518982, + "learning_rate": 1.6965334913878898e-05, + "loss": 1.959, + "mean_token_accuracy": 0.54133141040802, + "num_tokens": 2755399883.0, + "step": 5387 + }, + { + "epoch": 1.4570037858301785, + "grad_norm": 1.2744544744491577, + "learning_rate": 1.6964154373728355e-05, + "loss": 1.9, + "mean_token_accuracy": 0.5529807209968567, + "num_tokens": 2755924080.0, + "step": 5388 + }, + { + "epoch": 1.4572742022714982, + "grad_norm": 1.378981590270996, + "learning_rate": 1.6962973650580023e-05, + "loss": 2.0627, + "mean_token_accuracy": 0.5280687212944031, + "num_tokens": 2756448263.0, + "step": 5389 + }, + { + "epoch": 1.4575446187128178, + "grad_norm": 1.424432396888733, + "learning_rate": 1.696179274447012e-05, + "loss": 2.0211, + "mean_token_accuracy": 0.5260289907455444, + "num_tokens": 2756972536.0, + "step": 5390 + }, + { + "epoch": 1.4578150351541375, + "grad_norm": 1.2082520723342896, + "learning_rate": 1.6960611655434886e-05, + "loss": 1.9892, + "mean_token_accuracy": 0.5316774249076843, + "num_tokens": 2757496808.0, + "step": 5391 + }, + { + "epoch": 1.458085451595457, + "grad_norm": 1.251132607460022, + "learning_rate": 1.6959430383510556e-05, + "loss": 2.0237, + "mean_token_accuracy": 0.5651218891143799, + "num_tokens": 2758008135.0, + "step": 5392 + }, + { + "epoch": 1.4583558680367767, + "grad_norm": 1.3672535419464111, + "learning_rate": 1.6958248928733376e-05, + "loss": 2.0706, + "mean_token_accuracy": 0.5493715405464172, + "num_tokens": 2758521057.0, + "step": 5393 + }, + { + "epoch": 1.4586262844780964, + "grad_norm": 1.130251407623291, + "learning_rate": 1.6957067291139597e-05, + "loss": 2.0137, + "mean_token_accuracy": 0.5584536790847778, + "num_tokens": 2758987695.0, + "step": 5394 + }, + { + "epoch": 1.458896700919416, + "grad_norm": 1.511469841003418, + "learning_rate": 1.6955885470765473e-05, + "loss": 1.9825, + "mean_token_accuracy": 0.5514498353004456, + "num_tokens": 2759511902.0, + "step": 5395 + }, + { + "epoch": 1.4591671173607355, + "grad_norm": 1.2765743732452393, + "learning_rate": 1.6954703467647273e-05, + "loss": 1.9817, + "mean_token_accuracy": 0.5631523132324219, + "num_tokens": 2760036168.0, + "step": 5396 + }, + { + "epoch": 1.459437533802055, + "grad_norm": 1.302211046218872, + "learning_rate": 1.6953521281821252e-05, + "loss": 2.0681, + "mean_token_accuracy": 0.5376502275466919, + "num_tokens": 2760560435.0, + "step": 5397 + }, + { + "epoch": 1.4597079502433747, + "grad_norm": 1.1382766962051392, + "learning_rate": 1.6952338913323695e-05, + "loss": 2.0351, + "mean_token_accuracy": 0.5429148077964783, + "num_tokens": 2761079467.0, + "step": 5398 + }, + { + "epoch": 1.4599783666846944, + "grad_norm": 1.5172131061553955, + "learning_rate": 1.6951156362190876e-05, + "loss": 2.15, + "mean_token_accuracy": 0.5274642705917358, + "num_tokens": 2761577915.0, + "step": 5399 + }, + { + "epoch": 1.460248783126014, + "grad_norm": 1.1034324169158936, + "learning_rate": 1.6949973628459075e-05, + "loss": 2.069, + "mean_token_accuracy": 0.5280554294586182, + "num_tokens": 2762101990.0, + "step": 5400 + }, + { + "epoch": 1.4605191995673337, + "grad_norm": 0.8487698435783386, + "learning_rate": 1.6948790712164586e-05, + "loss": 1.1391, + "mean_token_accuracy": 0.7236732840538025, + "num_tokens": 2762561166.0, + "step": 5401 + }, + { + "epoch": 1.4607896160086533, + "grad_norm": 1.5036218166351318, + "learning_rate": 1.6947607613343705e-05, + "loss": 2.064, + "mean_token_accuracy": 0.5462303161621094, + "num_tokens": 2763085353.0, + "step": 5402 + }, + { + "epoch": 1.461060032449973, + "grad_norm": 1.824571967124939, + "learning_rate": 1.694642433203273e-05, + "loss": 1.9599, + "mean_token_accuracy": 0.5334867238998413, + "num_tokens": 2763609626.0, + "step": 5403 + }, + { + "epoch": 1.4613304488912926, + "grad_norm": 1.3388988971710205, + "learning_rate": 1.6945240868267973e-05, + "loss": 2.0179, + "mean_token_accuracy": 0.5599347949028015, + "num_tokens": 2764066021.0, + "step": 5404 + }, + { + "epoch": 1.4616008653326122, + "grad_norm": 1.7459017038345337, + "learning_rate": 1.6944057222085735e-05, + "loss": 2.0036, + "mean_token_accuracy": 0.5229882597923279, + "num_tokens": 2764590297.0, + "step": 5405 + }, + { + "epoch": 1.4618712817739319, + "grad_norm": 1.3353663682937622, + "learning_rate": 1.694287339352235e-05, + "loss": 2.0122, + "mean_token_accuracy": 0.5428018569946289, + "num_tokens": 2765114400.0, + "step": 5406 + }, + { + "epoch": 1.4621416982152515, + "grad_norm": 1.4176926612854004, + "learning_rate": 1.6941689382614127e-05, + "loss": 1.9555, + "mean_token_accuracy": 0.5528541803359985, + "num_tokens": 2765638661.0, + "step": 5407 + }, + { + "epoch": 1.4624121146565712, + "grad_norm": 1.4726423025131226, + "learning_rate": 1.69405051893974e-05, + "loss": 2.0354, + "mean_token_accuracy": 0.5337144136428833, + "num_tokens": 2766162868.0, + "step": 5408 + }, + { + "epoch": 1.4626825310978908, + "grad_norm": 1.3350822925567627, + "learning_rate": 1.6939320813908508e-05, + "loss": 1.9923, + "mean_token_accuracy": 0.5673555135726929, + "num_tokens": 2766686857.0, + "step": 5409 + }, + { + "epoch": 1.4629529475392105, + "grad_norm": 1.4122109413146973, + "learning_rate": 1.693813625618378e-05, + "loss": 2.0268, + "mean_token_accuracy": 0.5267583131790161, + "num_tokens": 2767211089.0, + "step": 5410 + }, + { + "epoch": 1.4632233639805299, + "grad_norm": 1.261900782585144, + "learning_rate": 1.6936951516259575e-05, + "loss": 2.0587, + "mean_token_accuracy": 0.5422138571739197, + "num_tokens": 2767691474.0, + "step": 5411 + }, + { + "epoch": 1.4634937804218495, + "grad_norm": 1.1741905212402344, + "learning_rate": 1.6935766594172235e-05, + "loss": 1.9758, + "mean_token_accuracy": 0.5546535849571228, + "num_tokens": 2768170987.0, + "step": 5412 + }, + { + "epoch": 1.4637641968631692, + "grad_norm": 1.322730302810669, + "learning_rate": 1.693458148995812e-05, + "loss": 2.0365, + "mean_token_accuracy": 0.5316690802574158, + "num_tokens": 2768683781.0, + "step": 5413 + }, + { + "epoch": 1.4640346133044888, + "grad_norm": 2.580822229385376, + "learning_rate": 1.6933396203653594e-05, + "loss": 1.7767, + "mean_token_accuracy": 0.6123413443565369, + "num_tokens": 2769128537.0, + "step": 5414 + }, + { + "epoch": 1.4643050297458085, + "grad_norm": 1.7373064756393433, + "learning_rate": 1.6932210735295025e-05, + "loss": 1.933, + "mean_token_accuracy": 0.5468553304672241, + "num_tokens": 2769652686.0, + "step": 5415 + }, + { + "epoch": 1.464575446187128, + "grad_norm": 1.577589988708496, + "learning_rate": 1.6931025084918783e-05, + "loss": 2.1026, + "mean_token_accuracy": 0.5358456373214722, + "num_tokens": 2770121267.0, + "step": 5416 + }, + { + "epoch": 1.4648458626284477, + "grad_norm": 1.47738516330719, + "learning_rate": 1.692983925256125e-05, + "loss": 1.9016, + "mean_token_accuracy": 0.5716853141784668, + "num_tokens": 2770640149.0, + "step": 5417 + }, + { + "epoch": 1.4651162790697674, + "grad_norm": 1.5180082321166992, + "learning_rate": 1.692865323825881e-05, + "loss": 1.9385, + "mean_token_accuracy": 0.5732412934303284, + "num_tokens": 2771164328.0, + "step": 5418 + }, + { + "epoch": 1.465386695511087, + "grad_norm": 1.4231714010238647, + "learning_rate": 1.6927467042047854e-05, + "loss": 1.9722, + "mean_token_accuracy": 0.5599290132522583, + "num_tokens": 2771636832.0, + "step": 5419 + }, + { + "epoch": 1.4656571119524067, + "grad_norm": 1.2945876121520996, + "learning_rate": 1.6926280663964776e-05, + "loss": 1.9726, + "mean_token_accuracy": 0.544538676738739, + "num_tokens": 2772160998.0, + "step": 5420 + }, + { + "epoch": 1.4659275283937263, + "grad_norm": 0.7211440801620483, + "learning_rate": 1.6925094104045982e-05, + "loss": 1.1254, + "mean_token_accuracy": 0.7069383263587952, + "num_tokens": 2772672728.0, + "step": 5421 + }, + { + "epoch": 1.466197944835046, + "grad_norm": 2.5827689170837402, + "learning_rate": 1.6923907362327872e-05, + "loss": 2.0192, + "mean_token_accuracy": 0.5477294921875, + "num_tokens": 2773160753.0, + "step": 5422 + }, + { + "epoch": 1.4664683612763656, + "grad_norm": 2.1963624954223633, + "learning_rate": 1.6922720438846863e-05, + "loss": 1.8661, + "mean_token_accuracy": 0.6016973853111267, + "num_tokens": 2773652174.0, + "step": 5423 + }, + { + "epoch": 1.4667387777176852, + "grad_norm": 1.550728678703308, + "learning_rate": 1.6921533333639377e-05, + "loss": 2.1306, + "mean_token_accuracy": 0.5494149923324585, + "num_tokens": 2774107337.0, + "step": 5424 + }, + { + "epoch": 1.4670091941590049, + "grad_norm": 1.6004949808120728, + "learning_rate": 1.6920346046741837e-05, + "loss": 1.9395, + "mean_token_accuracy": 0.5630852580070496, + "num_tokens": 2774631570.0, + "step": 5425 + }, + { + "epoch": 1.4672796106003245, + "grad_norm": 1.8537027835845947, + "learning_rate": 1.6919158578190664e-05, + "loss": 2.073, + "mean_token_accuracy": 0.5511398315429688, + "num_tokens": 2775113108.0, + "step": 5426 + }, + { + "epoch": 1.4675500270416442, + "grad_norm": 1.3971872329711914, + "learning_rate": 1.6917970928022298e-05, + "loss": 1.9383, + "mean_token_accuracy": 0.5789663791656494, + "num_tokens": 2775637328.0, + "step": 5427 + }, + { + "epoch": 1.4678204434829638, + "grad_norm": 1.9007413387298584, + "learning_rate": 1.6916783096273183e-05, + "loss": 1.8335, + "mean_token_accuracy": 0.6102238893508911, + "num_tokens": 2776161591.0, + "step": 5428 + }, + { + "epoch": 1.4680908599242835, + "grad_norm": 1.7792457342147827, + "learning_rate": 1.6915595082979763e-05, + "loss": 2.0059, + "mean_token_accuracy": 0.550021767616272, + "num_tokens": 2776622231.0, + "step": 5429 + }, + { + "epoch": 1.468361276365603, + "grad_norm": 1.4164221286773682, + "learning_rate": 1.691440688817849e-05, + "loss": 1.9679, + "mean_token_accuracy": 0.5530884265899658, + "num_tokens": 2777146459.0, + "step": 5430 + }, + { + "epoch": 1.4686316928069227, + "grad_norm": 1.327124834060669, + "learning_rate": 1.691321851190582e-05, + "loss": 2.0426, + "mean_token_accuracy": 0.5481345653533936, + "num_tokens": 2777670739.0, + "step": 5431 + }, + { + "epoch": 1.4689021092482424, + "grad_norm": 1.5040779113769531, + "learning_rate": 1.6912029954198213e-05, + "loss": 2.0957, + "mean_token_accuracy": 0.5361046195030212, + "num_tokens": 2778169987.0, + "step": 5432 + }, + { + "epoch": 1.469172525689562, + "grad_norm": 1.1395606994628906, + "learning_rate": 1.6910841215092145e-05, + "loss": 1.916, + "mean_token_accuracy": 0.5644538402557373, + "num_tokens": 2778694146.0, + "step": 5433 + }, + { + "epoch": 1.4694429421308817, + "grad_norm": 1.5059468746185303, + "learning_rate": 1.690965229462408e-05, + "loss": 2.0482, + "mean_token_accuracy": 0.5554296970367432, + "num_tokens": 2779218395.0, + "step": 5434 + }, + { + "epoch": 1.4697133585722013, + "grad_norm": 1.454708456993103, + "learning_rate": 1.690846319283051e-05, + "loss": 2.0722, + "mean_token_accuracy": 0.5467859506607056, + "num_tokens": 2779742501.0, + "step": 5435 + }, + { + "epoch": 1.469983775013521, + "grad_norm": 1.1294351816177368, + "learning_rate": 1.6907273909747913e-05, + "loss": 1.924, + "mean_token_accuracy": 0.5584981441497803, + "num_tokens": 2780266786.0, + "step": 5436 + }, + { + "epoch": 1.4702541914548404, + "grad_norm": 1.7512342929840088, + "learning_rate": 1.6906084445412777e-05, + "loss": 2.0973, + "mean_token_accuracy": 0.5418053269386292, + "num_tokens": 2780790979.0, + "step": 5437 + }, + { + "epoch": 1.47052460789616, + "grad_norm": 1.494233250617981, + "learning_rate": 1.6904894799861603e-05, + "loss": 1.9755, + "mean_token_accuracy": 0.567711353302002, + "num_tokens": 2781315158.0, + "step": 5438 + }, + { + "epoch": 1.4707950243374797, + "grad_norm": 1.2818371057510376, + "learning_rate": 1.690370497313089e-05, + "loss": 2.041, + "mean_token_accuracy": 0.5418663024902344, + "num_tokens": 2781812664.0, + "step": 5439 + }, + { + "epoch": 1.4710654407787993, + "grad_norm": 1.3832740783691406, + "learning_rate": 1.6902514965257152e-05, + "loss": 2.1989, + "mean_token_accuracy": 0.5160214304924011, + "num_tokens": 2782336817.0, + "step": 5440 + }, + { + "epoch": 1.471335857220119, + "grad_norm": 0.6007522344589233, + "learning_rate": 1.690132477627689e-05, + "loss": 1.115, + "mean_token_accuracy": 0.6932933330535889, + "num_tokens": 2782861041.0, + "step": 5441 + }, + { + "epoch": 1.4716062736614386, + "grad_norm": 2.469435453414917, + "learning_rate": 1.6900134406226635e-05, + "loss": 1.992, + "mean_token_accuracy": 0.5374876856803894, + "num_tokens": 2783385263.0, + "step": 5442 + }, + { + "epoch": 1.4718766901027582, + "grad_norm": 2.5280206203460693, + "learning_rate": 1.68989438551429e-05, + "loss": 2.0958, + "mean_token_accuracy": 0.5308274030685425, + "num_tokens": 2783909443.0, + "step": 5443 + }, + { + "epoch": 1.4721471065440779, + "grad_norm": 1.5972124338150024, + "learning_rate": 1.6897753123062223e-05, + "loss": 2.0456, + "mean_token_accuracy": 0.5458953380584717, + "num_tokens": 2784433668.0, + "step": 5444 + }, + { + "epoch": 1.4724175229853975, + "grad_norm": 1.6659489870071411, + "learning_rate": 1.6896562210021138e-05, + "loss": 1.9829, + "mean_token_accuracy": 0.5575733780860901, + "num_tokens": 2784945507.0, + "step": 5445 + }, + { + "epoch": 1.4726879394267172, + "grad_norm": 1.8457552194595337, + "learning_rate": 1.6895371116056177e-05, + "loss": 2.0398, + "mean_token_accuracy": 0.5433617830276489, + "num_tokens": 2785469677.0, + "step": 5446 + }, + { + "epoch": 1.4729583558680368, + "grad_norm": 1.6281657218933105, + "learning_rate": 1.6894179841203898e-05, + "loss": 2.024, + "mean_token_accuracy": 0.5536515712738037, + "num_tokens": 2785993928.0, + "step": 5447 + }, + { + "epoch": 1.4732287723093564, + "grad_norm": 2.14125919342041, + "learning_rate": 1.689298838550085e-05, + "loss": 1.9285, + "mean_token_accuracy": 0.5581687092781067, + "num_tokens": 2786518059.0, + "step": 5448 + }, + { + "epoch": 1.473499188750676, + "grad_norm": 1.8397064208984375, + "learning_rate": 1.6891796748983583e-05, + "loss": 2.0534, + "mean_token_accuracy": 0.5449988842010498, + "num_tokens": 2787033710.0, + "step": 5449 + }, + { + "epoch": 1.4737696051919957, + "grad_norm": 1.3266631364822388, + "learning_rate": 1.6890604931688664e-05, + "loss": 1.9812, + "mean_token_accuracy": 0.5362348556518555, + "num_tokens": 2787523221.0, + "step": 5450 + }, + { + "epoch": 1.4740400216333154, + "grad_norm": 1.7763323783874512, + "learning_rate": 1.688941293365267e-05, + "loss": 2.0279, + "mean_token_accuracy": 0.5687321424484253, + "num_tokens": 2788047454.0, + "step": 5451 + }, + { + "epoch": 1.4743104380746348, + "grad_norm": 1.6879833936691284, + "learning_rate": 1.688822075491216e-05, + "loss": 2.0012, + "mean_token_accuracy": 0.5439563989639282, + "num_tokens": 2788548630.0, + "step": 5452 + }, + { + "epoch": 1.4745808545159544, + "grad_norm": 1.383575439453125, + "learning_rate": 1.6887028395503725e-05, + "loss": 1.98, + "mean_token_accuracy": 0.5469876527786255, + "num_tokens": 2789072821.0, + "step": 5453 + }, + { + "epoch": 1.474851270957274, + "grad_norm": 1.3699151277542114, + "learning_rate": 1.6885835855463947e-05, + "loss": 1.9324, + "mean_token_accuracy": 0.5803223848342896, + "num_tokens": 2789596996.0, + "step": 5454 + }, + { + "epoch": 1.4751216873985937, + "grad_norm": 1.5583254098892212, + "learning_rate": 1.6884643134829412e-05, + "loss": 2.0182, + "mean_token_accuracy": 0.5406109094619751, + "num_tokens": 2790118638.0, + "step": 5455 + }, + { + "epoch": 1.4753921038399134, + "grad_norm": 1.5199764966964722, + "learning_rate": 1.6883450233636724e-05, + "loss": 1.9622, + "mean_token_accuracy": 0.5678483247756958, + "num_tokens": 2790585249.0, + "step": 5456 + }, + { + "epoch": 1.475662520281233, + "grad_norm": 1.340029001235962, + "learning_rate": 1.688225715192248e-05, + "loss": 1.8364, + "mean_token_accuracy": 0.557198703289032, + "num_tokens": 2791109423.0, + "step": 5457 + }, + { + "epoch": 1.4759329367225527, + "grad_norm": 1.4250273704528809, + "learning_rate": 1.6881063889723288e-05, + "loss": 1.842, + "mean_token_accuracy": 0.5683293342590332, + "num_tokens": 2791633509.0, + "step": 5458 + }, + { + "epoch": 1.4762033531638723, + "grad_norm": 1.2017464637756348, + "learning_rate": 1.6879870447075755e-05, + "loss": 2.1032, + "mean_token_accuracy": 0.5366297960281372, + "num_tokens": 2792099827.0, + "step": 5459 + }, + { + "epoch": 1.476473769605192, + "grad_norm": 1.5201107263565063, + "learning_rate": 1.6878676824016513e-05, + "loss": 2.01, + "mean_token_accuracy": 0.5491651296615601, + "num_tokens": 2792623709.0, + "step": 5460 + }, + { + "epoch": 1.4767441860465116, + "grad_norm": 0.578751266002655, + "learning_rate": 1.6877483020582175e-05, + "loss": 1.1853, + "mean_token_accuracy": 0.696359395980835, + "num_tokens": 2793147883.0, + "step": 5461 + }, + { + "epoch": 1.4770146024878312, + "grad_norm": 2.1953394412994385, + "learning_rate": 1.6876289036809375e-05, + "loss": 2.0357, + "mean_token_accuracy": 0.5342485308647156, + "num_tokens": 2793531313.0, + "step": 5462 + }, + { + "epoch": 1.4772850189291509, + "grad_norm": 1.565662145614624, + "learning_rate": 1.6875094872734743e-05, + "loss": 2.0418, + "mean_token_accuracy": 0.5235222578048706, + "num_tokens": 2793995632.0, + "step": 5463 + }, + { + "epoch": 1.4775554353704705, + "grad_norm": 1.7822397947311401, + "learning_rate": 1.6873900528394925e-05, + "loss": 2.1203, + "mean_token_accuracy": 0.5465306043624878, + "num_tokens": 2794467876.0, + "step": 5464 + }, + { + "epoch": 1.4778258518117902, + "grad_norm": 1.1424856185913086, + "learning_rate": 1.687270600382656e-05, + "loss": 1.9352, + "mean_token_accuracy": 0.5541285276412964, + "num_tokens": 2794992011.0, + "step": 5465 + }, + { + "epoch": 1.4780962682531098, + "grad_norm": 1.0925036668777466, + "learning_rate": 1.6871511299066313e-05, + "loss": 2.0671, + "mean_token_accuracy": 0.5377553105354309, + "num_tokens": 2795495776.0, + "step": 5466 + }, + { + "epoch": 1.4783666846944294, + "grad_norm": 1.1170713901519775, + "learning_rate": 1.6870316414150827e-05, + "loss": 1.8706, + "mean_token_accuracy": 0.5611280202865601, + "num_tokens": 2796019964.0, + "step": 5467 + }, + { + "epoch": 1.478637101135749, + "grad_norm": 1.2955697774887085, + "learning_rate": 1.686912134911677e-05, + "loss": 1.9582, + "mean_token_accuracy": 0.5819292664527893, + "num_tokens": 2796479416.0, + "step": 5468 + }, + { + "epoch": 1.4789075175770687, + "grad_norm": 1.2147712707519531, + "learning_rate": 1.6867926104000812e-05, + "loss": 2.0891, + "mean_token_accuracy": 0.5278077125549316, + "num_tokens": 2797003586.0, + "step": 5469 + }, + { + "epoch": 1.4791779340183884, + "grad_norm": 1.4067898988723755, + "learning_rate": 1.6866730678839622e-05, + "loss": 1.9452, + "mean_token_accuracy": 0.5440595746040344, + "num_tokens": 2797527790.0, + "step": 5470 + }, + { + "epoch": 1.479448350459708, + "grad_norm": 1.2145538330078125, + "learning_rate": 1.6865535073669886e-05, + "loss": 1.8797, + "mean_token_accuracy": 0.5759283304214478, + "num_tokens": 2797994569.0, + "step": 5471 + }, + { + "epoch": 1.4797187669010277, + "grad_norm": 1.360952615737915, + "learning_rate": 1.6864339288528282e-05, + "loss": 2.122, + "mean_token_accuracy": 0.5493699312210083, + "num_tokens": 2798501282.0, + "step": 5472 + }, + { + "epoch": 1.4799891833423473, + "grad_norm": 1.0217572450637817, + "learning_rate": 1.68631433234515e-05, + "loss": 1.8744, + "mean_token_accuracy": 0.5826365947723389, + "num_tokens": 2799025550.0, + "step": 5473 + }, + { + "epoch": 1.480259599783667, + "grad_norm": 1.0731945037841797, + "learning_rate": 1.6861947178476243e-05, + "loss": 1.9611, + "mean_token_accuracy": 0.5446906089782715, + "num_tokens": 2799549813.0, + "step": 5474 + }, + { + "epoch": 1.4805300162249866, + "grad_norm": 1.455834984779358, + "learning_rate": 1.6860750853639204e-05, + "loss": 1.9557, + "mean_token_accuracy": 0.5633678436279297, + "num_tokens": 2800074030.0, + "step": 5475 + }, + { + "epoch": 1.4808004326663062, + "grad_norm": 1.2369468212127686, + "learning_rate": 1.6859554348977098e-05, + "loss": 1.9615, + "mean_token_accuracy": 0.5618846416473389, + "num_tokens": 2800598299.0, + "step": 5476 + }, + { + "epoch": 1.4810708491076259, + "grad_norm": 1.2173008918762207, + "learning_rate": 1.6858357664526624e-05, + "loss": 1.9812, + "mean_token_accuracy": 0.5647822618484497, + "num_tokens": 2801122478.0, + "step": 5477 + }, + { + "epoch": 1.4813412655489453, + "grad_norm": 1.570556640625, + "learning_rate": 1.685716080032452e-05, + "loss": 2.0459, + "mean_token_accuracy": 0.5309255123138428, + "num_tokens": 2801646722.0, + "step": 5478 + }, + { + "epoch": 1.481611681990265, + "grad_norm": 1.3210608959197998, + "learning_rate": 1.685596375640749e-05, + "loss": 1.9899, + "mean_token_accuracy": 0.5588143467903137, + "num_tokens": 2802142521.0, + "step": 5479 + }, + { + "epoch": 1.4818820984315846, + "grad_norm": 1.5576674938201904, + "learning_rate": 1.6854766532812273e-05, + "loss": 1.9201, + "mean_token_accuracy": 0.5598070621490479, + "num_tokens": 2802666676.0, + "step": 5480 + }, + { + "epoch": 1.4821525148729042, + "grad_norm": 0.6257421970367432, + "learning_rate": 1.6853569129575603e-05, + "loss": 1.1586, + "mean_token_accuracy": 0.6942122578620911, + "num_tokens": 2803190749.0, + "step": 5481 + }, + { + "epoch": 1.4824229313142239, + "grad_norm": 2.552565574645996, + "learning_rate": 1.6852371546734213e-05, + "loss": 1.9235, + "mean_token_accuracy": 0.5682884454727173, + "num_tokens": 2803663759.0, + "step": 5482 + }, + { + "epoch": 1.4826933477555435, + "grad_norm": 2.28886342048645, + "learning_rate": 1.6851173784324857e-05, + "loss": 1.9515, + "mean_token_accuracy": 0.5586676001548767, + "num_tokens": 2804188003.0, + "step": 5483 + }, + { + "epoch": 1.4829637641968632, + "grad_norm": 1.2345218658447266, + "learning_rate": 1.6849975842384283e-05, + "loss": 1.8432, + "mean_token_accuracy": 0.5765290260314941, + "num_tokens": 2804663989.0, + "step": 5484 + }, + { + "epoch": 1.4832341806381828, + "grad_norm": 1.9583418369293213, + "learning_rate": 1.6848777720949244e-05, + "loss": 1.9871, + "mean_token_accuracy": 0.5427864789962769, + "num_tokens": 2805188037.0, + "step": 5485 + }, + { + "epoch": 1.4835045970795024, + "grad_norm": 2.2898221015930176, + "learning_rate": 1.6847579420056504e-05, + "loss": 2.0519, + "mean_token_accuracy": 0.5559585094451904, + "num_tokens": 2805712255.0, + "step": 5486 + }, + { + "epoch": 1.483775013520822, + "grad_norm": 1.5207587480545044, + "learning_rate": 1.684638093974283e-05, + "loss": 2.0749, + "mean_token_accuracy": 0.5370399355888367, + "num_tokens": 2806236414.0, + "step": 5487 + }, + { + "epoch": 1.4840454299621417, + "grad_norm": 1.544679045677185, + "learning_rate": 1.6845182280045003e-05, + "loss": 1.8466, + "mean_token_accuracy": 0.5890954732894897, + "num_tokens": 2806695015.0, + "step": 5488 + }, + { + "epoch": 1.4843158464034614, + "grad_norm": 1.912785530090332, + "learning_rate": 1.6843983440999786e-05, + "loss": 1.9968, + "mean_token_accuracy": 0.5557177066802979, + "num_tokens": 2807219252.0, + "step": 5489 + }, + { + "epoch": 1.484586262844781, + "grad_norm": 1.4555881023406982, + "learning_rate": 1.6842784422643972e-05, + "loss": 2.051, + "mean_token_accuracy": 0.5692760348320007, + "num_tokens": 2807641862.0, + "step": 5490 + }, + { + "epoch": 1.4848566792861007, + "grad_norm": 1.537391185760498, + "learning_rate": 1.6841585225014354e-05, + "loss": 2.0157, + "mean_token_accuracy": 0.554980993270874, + "num_tokens": 2808166136.0, + "step": 5491 + }, + { + "epoch": 1.4851270957274203, + "grad_norm": 1.3355448246002197, + "learning_rate": 1.684038584814772e-05, + "loss": 2.0166, + "mean_token_accuracy": 0.5452988743782043, + "num_tokens": 2808690244.0, + "step": 5492 + }, + { + "epoch": 1.48539751216874, + "grad_norm": 1.359249472618103, + "learning_rate": 1.6839186292080868e-05, + "loss": 2.0619, + "mean_token_accuracy": 0.544914722442627, + "num_tokens": 2809155284.0, + "step": 5493 + }, + { + "epoch": 1.4856679286100594, + "grad_norm": 1.6379024982452393, + "learning_rate": 1.6837986556850615e-05, + "loss": 1.9946, + "mean_token_accuracy": 0.5549075603485107, + "num_tokens": 2809679554.0, + "step": 5494 + }, + { + "epoch": 1.485938345051379, + "grad_norm": 1.4384530782699585, + "learning_rate": 1.6836786642493765e-05, + "loss": 2.0501, + "mean_token_accuracy": 0.5421239137649536, + "num_tokens": 2810203765.0, + "step": 5495 + }, + { + "epoch": 1.4862087614926986, + "grad_norm": 1.4408997297286987, + "learning_rate": 1.6835586549047133e-05, + "loss": 2.1031, + "mean_token_accuracy": 0.5088371634483337, + "num_tokens": 2810727940.0, + "step": 5496 + }, + { + "epoch": 1.4864791779340183, + "grad_norm": 1.6833370923995972, + "learning_rate": 1.6834386276547547e-05, + "loss": 2.135, + "mean_token_accuracy": 0.5139081478118896, + "num_tokens": 2811252187.0, + "step": 5497 + }, + { + "epoch": 1.486749594375338, + "grad_norm": 1.4855481386184692, + "learning_rate": 1.683318582503183e-05, + "loss": 2.0523, + "mean_token_accuracy": 0.5463964939117432, + "num_tokens": 2811776361.0, + "step": 5498 + }, + { + "epoch": 1.4870200108166576, + "grad_norm": 1.4218471050262451, + "learning_rate": 1.6831985194536823e-05, + "loss": 1.9558, + "mean_token_accuracy": 0.5466662645339966, + "num_tokens": 2812295339.0, + "step": 5499 + }, + { + "epoch": 1.4872904272579772, + "grad_norm": 1.6660593748092651, + "learning_rate": 1.6830784385099353e-05, + "loss": 2.0052, + "mean_token_accuracy": 0.5706173777580261, + "num_tokens": 2812778267.0, + "step": 5500 + }, + { + "epoch": 1.4875608436992969, + "grad_norm": 0.6575964093208313, + "learning_rate": 1.6829583396756276e-05, + "loss": 1.2204, + "mean_token_accuracy": 0.667495846748352, + "num_tokens": 2813302542.0, + "step": 5501 + }, + { + "epoch": 1.4878312601406165, + "grad_norm": 1.6546756029129028, + "learning_rate": 1.682838222954443e-05, + "loss": 1.9572, + "mean_token_accuracy": 0.5479087829589844, + "num_tokens": 2813779696.0, + "step": 5502 + }, + { + "epoch": 1.4881016765819362, + "grad_norm": 2.124056577682495, + "learning_rate": 1.6827180883500686e-05, + "loss": 2.0627, + "mean_token_accuracy": 0.5373928546905518, + "num_tokens": 2814303963.0, + "step": 5503 + }, + { + "epoch": 1.4883720930232558, + "grad_norm": 1.3812485933303833, + "learning_rate": 1.682597935866189e-05, + "loss": 2.1184, + "mean_token_accuracy": 0.5415939092636108, + "num_tokens": 2814793944.0, + "step": 5504 + }, + { + "epoch": 1.4886425094645754, + "grad_norm": 1.3818529844284058, + "learning_rate": 1.6824777655064913e-05, + "loss": 1.9325, + "mean_token_accuracy": 0.565078854560852, + "num_tokens": 2815281204.0, + "step": 5505 + }, + { + "epoch": 1.488912925905895, + "grad_norm": 1.2676258087158203, + "learning_rate": 1.682357577274663e-05, + "loss": 1.9163, + "mean_token_accuracy": 0.5582671165466309, + "num_tokens": 2815805426.0, + "step": 5506 + }, + { + "epoch": 1.4891833423472147, + "grad_norm": 1.4592130184173584, + "learning_rate": 1.6822373711743916e-05, + "loss": 1.9343, + "mean_token_accuracy": 0.5683343410491943, + "num_tokens": 2816267060.0, + "step": 5507 + }, + { + "epoch": 1.4894537587885344, + "grad_norm": 1.524938702583313, + "learning_rate": 1.6821171472093652e-05, + "loss": 2.086, + "mean_token_accuracy": 0.5229799747467041, + "num_tokens": 2816791266.0, + "step": 5508 + }, + { + "epoch": 1.489724175229854, + "grad_norm": 1.532127857208252, + "learning_rate": 1.6819969053832732e-05, + "loss": 1.9629, + "mean_token_accuracy": 0.5360945463180542, + "num_tokens": 2817315397.0, + "step": 5509 + }, + { + "epoch": 1.4899945916711737, + "grad_norm": 1.8585699796676636, + "learning_rate": 1.6818766456998045e-05, + "loss": 2.1188, + "mean_token_accuracy": 0.5354132056236267, + "num_tokens": 2817839560.0, + "step": 5510 + }, + { + "epoch": 1.4902650081124933, + "grad_norm": 1.3264172077178955, + "learning_rate": 1.6817563681626493e-05, + "loss": 2.0633, + "mean_token_accuracy": 0.5404016971588135, + "num_tokens": 2818363845.0, + "step": 5511 + }, + { + "epoch": 1.490535424553813, + "grad_norm": 1.640069603919983, + "learning_rate": 1.6816360727754978e-05, + "loss": 1.8208, + "mean_token_accuracy": 0.5836438536643982, + "num_tokens": 2818887870.0, + "step": 5512 + }, + { + "epoch": 1.4908058409951326, + "grad_norm": 1.3371913433074951, + "learning_rate": 1.6815157595420408e-05, + "loss": 2.0047, + "mean_token_accuracy": 0.5676621198654175, + "num_tokens": 2819356235.0, + "step": 5513 + }, + { + "epoch": 1.4910762574364522, + "grad_norm": 1.40359365940094, + "learning_rate": 1.6813954284659702e-05, + "loss": 2.0582, + "mean_token_accuracy": 0.5447092652320862, + "num_tokens": 2819880474.0, + "step": 5514 + }, + { + "epoch": 1.4913466738777719, + "grad_norm": 1.0802052021026611, + "learning_rate": 1.6812750795509782e-05, + "loss": 1.9371, + "mean_token_accuracy": 0.5592449903488159, + "num_tokens": 2820404597.0, + "step": 5515 + }, + { + "epoch": 1.4916170903190915, + "grad_norm": 1.5124753713607788, + "learning_rate": 1.6811547128007575e-05, + "loss": 1.937, + "mean_token_accuracy": 0.573711633682251, + "num_tokens": 2820872282.0, + "step": 5516 + }, + { + "epoch": 1.4918875067604112, + "grad_norm": 1.3561760187149048, + "learning_rate": 1.681034328219001e-05, + "loss": 1.7802, + "mean_token_accuracy": 0.6037729382514954, + "num_tokens": 2821351224.0, + "step": 5517 + }, + { + "epoch": 1.4921579232017308, + "grad_norm": 1.238616704940796, + "learning_rate": 1.6809139258094027e-05, + "loss": 1.9894, + "mean_token_accuracy": 0.5393619537353516, + "num_tokens": 2821875438.0, + "step": 5518 + }, + { + "epoch": 1.4924283396430502, + "grad_norm": 1.1343127489089966, + "learning_rate": 1.6807935055756565e-05, + "loss": 2.0402, + "mean_token_accuracy": 0.535284161567688, + "num_tokens": 2822399515.0, + "step": 5519 + }, + { + "epoch": 1.4926987560843699, + "grad_norm": 1.1517627239227295, + "learning_rate": 1.680673067521458e-05, + "loss": 1.9541, + "mean_token_accuracy": 0.5520648956298828, + "num_tokens": 2822913739.0, + "step": 5520 + }, + { + "epoch": 1.4929691725256895, + "grad_norm": 0.6317272782325745, + "learning_rate": 1.680552611650502e-05, + "loss": 1.113, + "mean_token_accuracy": 0.7007390260696411, + "num_tokens": 2823437907.0, + "step": 5521 + }, + { + "epoch": 1.4932395889670091, + "grad_norm": 1.998735785484314, + "learning_rate": 1.6804321379664845e-05, + "loss": 1.9826, + "mean_token_accuracy": 0.5382546782493591, + "num_tokens": 2823962010.0, + "step": 5522 + }, + { + "epoch": 1.4935100054083288, + "grad_norm": 1.6975903511047363, + "learning_rate": 1.680311646473102e-05, + "loss": 2.0543, + "mean_token_accuracy": 0.5388191938400269, + "num_tokens": 2824486123.0, + "step": 5523 + }, + { + "epoch": 1.4937804218496484, + "grad_norm": 1.415268063545227, + "learning_rate": 1.6801911371740517e-05, + "loss": 2.0331, + "mean_token_accuracy": 0.5466312170028687, + "num_tokens": 2824957517.0, + "step": 5524 + }, + { + "epoch": 1.494050838290968, + "grad_norm": 1.769423007965088, + "learning_rate": 1.680070610073031e-05, + "loss": 2.0376, + "mean_token_accuracy": 0.5483719706535339, + "num_tokens": 2825481796.0, + "step": 5525 + }, + { + "epoch": 1.4943212547322877, + "grad_norm": 1.482450008392334, + "learning_rate": 1.6799500651737383e-05, + "loss": 1.9807, + "mean_token_accuracy": 0.5316511988639832, + "num_tokens": 2826005928.0, + "step": 5526 + }, + { + "epoch": 1.4945916711736074, + "grad_norm": 1.5855531692504883, + "learning_rate": 1.679829502479872e-05, + "loss": 2.0192, + "mean_token_accuracy": 0.5378969311714172, + "num_tokens": 2826530190.0, + "step": 5527 + }, + { + "epoch": 1.494862087614927, + "grad_norm": 1.621437430381775, + "learning_rate": 1.679708921995131e-05, + "loss": 1.8628, + "mean_token_accuracy": 0.5726376175880432, + "num_tokens": 2827035150.0, + "step": 5528 + }, + { + "epoch": 1.4951325040562466, + "grad_norm": 1.4514611959457397, + "learning_rate": 1.679588323723216e-05, + "loss": 1.9487, + "mean_token_accuracy": 0.5582621693611145, + "num_tokens": 2827559408.0, + "step": 5529 + }, + { + "epoch": 1.4954029204975663, + "grad_norm": 1.4061089754104614, + "learning_rate": 1.6794677076678265e-05, + "loss": 1.9799, + "mean_token_accuracy": 0.5437490940093994, + "num_tokens": 2828083659.0, + "step": 5530 + }, + { + "epoch": 1.495673336938886, + "grad_norm": 1.3081773519515991, + "learning_rate": 1.679347073832664e-05, + "loss": 1.956, + "mean_token_accuracy": 0.5415706038475037, + "num_tokens": 2828607832.0, + "step": 5531 + }, + { + "epoch": 1.4959437533802056, + "grad_norm": 1.5390264987945557, + "learning_rate": 1.6792264222214286e-05, + "loss": 2.0125, + "mean_token_accuracy": 0.5596848130226135, + "num_tokens": 2829132103.0, + "step": 5532 + }, + { + "epoch": 1.4962141698215252, + "grad_norm": 1.4216903448104858, + "learning_rate": 1.679105752837824e-05, + "loss": 1.9148, + "mean_token_accuracy": 0.5558435916900635, + "num_tokens": 2829656358.0, + "step": 5533 + }, + { + "epoch": 1.4964845862628449, + "grad_norm": 1.36186945438385, + "learning_rate": 1.6789850656855514e-05, + "loss": 1.9997, + "mean_token_accuracy": 0.558622419834137, + "num_tokens": 2830130757.0, + "step": 5534 + }, + { + "epoch": 1.4967550027041643, + "grad_norm": 1.6050351858139038, + "learning_rate": 1.678864360768315e-05, + "loss": 2.02, + "mean_token_accuracy": 0.5549893975257874, + "num_tokens": 2830654987.0, + "step": 5535 + }, + { + "epoch": 1.497025419145484, + "grad_norm": 1.4604946374893188, + "learning_rate": 1.6787436380898166e-05, + "loss": 1.9418, + "mean_token_accuracy": 0.5547347068786621, + "num_tokens": 2831170453.0, + "step": 5536 + }, + { + "epoch": 1.4972958355868036, + "grad_norm": 1.4384530782699585, + "learning_rate": 1.6786228976537622e-05, + "loss": 2.0233, + "mean_token_accuracy": 0.5567087531089783, + "num_tokens": 2831650764.0, + "step": 5537 + }, + { + "epoch": 1.4975662520281232, + "grad_norm": 1.19864022731781, + "learning_rate": 1.678502139463856e-05, + "loss": 1.9952, + "mean_token_accuracy": 0.561332106590271, + "num_tokens": 2832175041.0, + "step": 5538 + }, + { + "epoch": 1.4978366684694429, + "grad_norm": 1.2722747325897217, + "learning_rate": 1.6783813635238023e-05, + "loss": 1.9174, + "mean_token_accuracy": 0.5502872467041016, + "num_tokens": 2832699247.0, + "step": 5539 + }, + { + "epoch": 1.4981070849107625, + "grad_norm": 1.1452010869979858, + "learning_rate": 1.6782605698373076e-05, + "loss": 1.9401, + "mean_token_accuracy": 0.562501847743988, + "num_tokens": 2833166221.0, + "step": 5540 + }, + { + "epoch": 1.4983775013520821, + "grad_norm": 0.6302019357681274, + "learning_rate": 1.6781397584080778e-05, + "loss": 1.1161, + "mean_token_accuracy": 0.6974082589149475, + "num_tokens": 2833690470.0, + "step": 5541 + }, + { + "epoch": 1.4986479177934018, + "grad_norm": 3.18900465965271, + "learning_rate": 1.6780189292398205e-05, + "loss": 2.0504, + "mean_token_accuracy": 0.5575637221336365, + "num_tokens": 2834155508.0, + "step": 5542 + }, + { + "epoch": 1.4989183342347214, + "grad_norm": 2.277297019958496, + "learning_rate": 1.677898082336242e-05, + "loss": 2.0114, + "mean_token_accuracy": 0.5407024621963501, + "num_tokens": 2834679783.0, + "step": 5543 + }, + { + "epoch": 1.499188750676041, + "grad_norm": 1.373110055923462, + "learning_rate": 1.6777772177010513e-05, + "loss": 2.023, + "mean_token_accuracy": 0.5612815618515015, + "num_tokens": 2835204062.0, + "step": 5544 + }, + { + "epoch": 1.4994591671173607, + "grad_norm": 2.8267388343811035, + "learning_rate": 1.6776563353379564e-05, + "loss": 2.0813, + "mean_token_accuracy": 0.5352383852005005, + "num_tokens": 2835728181.0, + "step": 5545 + }, + { + "epoch": 1.4997295835586804, + "grad_norm": 2.081268072128296, + "learning_rate": 1.6775354352506662e-05, + "loss": 2.0646, + "mean_token_accuracy": 0.5342568159103394, + "num_tokens": 2836213441.0, + "step": 5546 + }, + { + "epoch": 1.5, + "grad_norm": 1.3872822523117065, + "learning_rate": 1.6774145174428908e-05, + "loss": 1.882, + "mean_token_accuracy": 0.5666391849517822, + "num_tokens": 2836737710.0, + "step": 5547 + }, + { + "epoch": 1.5002704164413196, + "grad_norm": 1.959027647972107, + "learning_rate": 1.6772935819183394e-05, + "loss": 1.6819, + "mean_token_accuracy": 0.6457061171531677, + "num_tokens": 2837202259.0, + "step": 5548 + }, + { + "epoch": 1.5005408328826393, + "grad_norm": 2.0335605144500732, + "learning_rate": 1.6771726286807235e-05, + "loss": 1.8569, + "mean_token_accuracy": 0.556891679763794, + "num_tokens": 2837726291.0, + "step": 5549 + }, + { + "epoch": 1.500811249323959, + "grad_norm": 2.0212409496307373, + "learning_rate": 1.677051657733754e-05, + "loss": 1.8775, + "mean_token_accuracy": 0.5543185472488403, + "num_tokens": 2838207159.0, + "step": 5550 + }, + { + "epoch": 1.5010816657652786, + "grad_norm": 2.063938617706299, + "learning_rate": 1.6769306690811427e-05, + "loss": 2.0415, + "mean_token_accuracy": 0.5519559383392334, + "num_tokens": 2838731369.0, + "step": 5551 + }, + { + "epoch": 1.5013520822065982, + "grad_norm": 2.0168914794921875, + "learning_rate": 1.6768096627266016e-05, + "loss": 2.0075, + "mean_token_accuracy": 0.5540629625320435, + "num_tokens": 2839233110.0, + "step": 5552 + }, + { + "epoch": 1.5016224986479179, + "grad_norm": 1.4636547565460205, + "learning_rate": 1.676688638673844e-05, + "loss": 1.9622, + "mean_token_accuracy": 0.5627615451812744, + "num_tokens": 2839720856.0, + "step": 5553 + }, + { + "epoch": 1.5018929150892375, + "grad_norm": 1.614736795425415, + "learning_rate": 1.676567596926583e-05, + "loss": 1.9187, + "mean_token_accuracy": 0.5527790784835815, + "num_tokens": 2840245028.0, + "step": 5554 + }, + { + "epoch": 1.5021633315305571, + "grad_norm": 2.0915029048919678, + "learning_rate": 1.676446537488532e-05, + "loss": 1.9429, + "mean_token_accuracy": 0.5658796429634094, + "num_tokens": 2840732470.0, + "step": 5555 + }, + { + "epoch": 1.5024337479718768, + "grad_norm": 1.7381666898727417, + "learning_rate": 1.6763254603634066e-05, + "loss": 2.13, + "mean_token_accuracy": 0.5254020690917969, + "num_tokens": 2841241197.0, + "step": 5556 + }, + { + "epoch": 1.5027041644131964, + "grad_norm": 1.3465315103530884, + "learning_rate": 1.676204365554921e-05, + "loss": 1.9839, + "mean_token_accuracy": 0.539743185043335, + "num_tokens": 2841713288.0, + "step": 5557 + }, + { + "epoch": 1.502974580854516, + "grad_norm": 1.5773308277130127, + "learning_rate": 1.676083253066791e-05, + "loss": 1.9388, + "mean_token_accuracy": 0.5346707701683044, + "num_tokens": 2842237513.0, + "step": 5558 + }, + { + "epoch": 1.5032449972958357, + "grad_norm": 2.0085203647613525, + "learning_rate": 1.675962122902733e-05, + "loss": 2.053, + "mean_token_accuracy": 0.528464674949646, + "num_tokens": 2842761798.0, + "step": 5559 + }, + { + "epoch": 1.5035154137371554, + "grad_norm": 1.2350114583969116, + "learning_rate": 1.675840975066463e-05, + "loss": 1.9861, + "mean_token_accuracy": 0.5644840598106384, + "num_tokens": 2843230538.0, + "step": 5560 + }, + { + "epoch": 1.503785830178475, + "grad_norm": 0.8381641507148743, + "learning_rate": 1.6757198095616983e-05, + "loss": 1.1281, + "mean_token_accuracy": 0.6992875337600708, + "num_tokens": 2843754645.0, + "step": 5561 + }, + { + "epoch": 1.5040562466197946, + "grad_norm": 2.7128748893737793, + "learning_rate": 1.675598626392157e-05, + "loss": 1.9862, + "mean_token_accuracy": 0.550549328327179, + "num_tokens": 2844222558.0, + "step": 5562 + }, + { + "epoch": 1.504326663061114, + "grad_norm": 2.4200968742370605, + "learning_rate": 1.675477425561557e-05, + "loss": 2.1249, + "mean_token_accuracy": 0.5199917554855347, + "num_tokens": 2844746804.0, + "step": 5563 + }, + { + "epoch": 1.5045970795024337, + "grad_norm": 1.604621410369873, + "learning_rate": 1.675356207073617e-05, + "loss": 2.0108, + "mean_token_accuracy": 0.5533866286277771, + "num_tokens": 2845270978.0, + "step": 5564 + }, + { + "epoch": 1.5048674959437534, + "grad_norm": 1.7366923093795776, + "learning_rate": 1.675234970932057e-05, + "loss": 2.0709, + "mean_token_accuracy": 0.5558702349662781, + "num_tokens": 2845743514.0, + "step": 5565 + }, + { + "epoch": 1.505137912385073, + "grad_norm": 2.0198748111724854, + "learning_rate": 1.675113717140596e-05, + "loss": 2.1133, + "mean_token_accuracy": 0.5546896457672119, + "num_tokens": 2846267733.0, + "step": 5566 + }, + { + "epoch": 1.5054083288263926, + "grad_norm": 1.6266230344772339, + "learning_rate": 1.6749924457029552e-05, + "loss": 1.9606, + "mean_token_accuracy": 0.5554360151290894, + "num_tokens": 2846791887.0, + "step": 5567 + }, + { + "epoch": 1.5056787452677123, + "grad_norm": 1.3769185543060303, + "learning_rate": 1.674871156622855e-05, + "loss": 1.9518, + "mean_token_accuracy": 0.5555371046066284, + "num_tokens": 2847316029.0, + "step": 5568 + }, + { + "epoch": 1.505949161709032, + "grad_norm": 1.5563485622406006, + "learning_rate": 1.674749849904017e-05, + "loss": 1.9714, + "mean_token_accuracy": 0.5611370205879211, + "num_tokens": 2847840207.0, + "step": 5569 + }, + { + "epoch": 1.5062195781503516, + "grad_norm": 1.8318283557891846, + "learning_rate": 1.674628525550164e-05, + "loss": 2.1039, + "mean_token_accuracy": 0.5443788766860962, + "num_tokens": 2848364485.0, + "step": 5570 + }, + { + "epoch": 1.5064899945916712, + "grad_norm": 1.5296313762664795, + "learning_rate": 1.6745071835650174e-05, + "loss": 1.9982, + "mean_token_accuracy": 0.5493048429489136, + "num_tokens": 2848888675.0, + "step": 5571 + }, + { + "epoch": 1.5067604110329909, + "grad_norm": 1.3023916482925415, + "learning_rate": 1.674385823952301e-05, + "loss": 1.984, + "mean_token_accuracy": 0.5531299710273743, + "num_tokens": 2849357111.0, + "step": 5572 + }, + { + "epoch": 1.5070308274743103, + "grad_norm": 1.108950138092041, + "learning_rate": 1.6742644467157385e-05, + "loss": 1.9084, + "mean_token_accuracy": 0.5651710033416748, + "num_tokens": 2849881365.0, + "step": 5573 + }, + { + "epoch": 1.50730124391563, + "grad_norm": 1.413575530052185, + "learning_rate": 1.674143051859054e-05, + "loss": 1.944, + "mean_token_accuracy": 0.5558602809906006, + "num_tokens": 2850349024.0, + "step": 5574 + }, + { + "epoch": 1.5075716603569496, + "grad_norm": 1.2656575441360474, + "learning_rate": 1.674021639385972e-05, + "loss": 1.9457, + "mean_token_accuracy": 0.5291423797607422, + "num_tokens": 2850873247.0, + "step": 5575 + }, + { + "epoch": 1.5078420767982692, + "grad_norm": 1.7557586431503296, + "learning_rate": 1.673900209300218e-05, + "loss": 2.1163, + "mean_token_accuracy": 0.5396596193313599, + "num_tokens": 2851397525.0, + "step": 5576 + }, + { + "epoch": 1.5081124932395888, + "grad_norm": 1.4759176969528198, + "learning_rate": 1.6737787616055183e-05, + "loss": 1.9216, + "mean_token_accuracy": 0.572278618812561, + "num_tokens": 2851921677.0, + "step": 5577 + }, + { + "epoch": 1.5083829096809085, + "grad_norm": 1.339604139328003, + "learning_rate": 1.673657296305598e-05, + "loss": 1.8883, + "mean_token_accuracy": 0.578402042388916, + "num_tokens": 2852445782.0, + "step": 5578 + }, + { + "epoch": 1.5086533261222281, + "grad_norm": 1.3361815214157104, + "learning_rate": 1.6735358134041853e-05, + "loss": 2.0595, + "mean_token_accuracy": 0.5196489095687866, + "num_tokens": 2852969980.0, + "step": 5579 + }, + { + "epoch": 1.5089237425635478, + "grad_norm": 1.16185462474823, + "learning_rate": 1.6734143129050075e-05, + "loss": 1.9207, + "mean_token_accuracy": 0.5586807727813721, + "num_tokens": 2853494235.0, + "step": 5580 + }, + { + "epoch": 1.5091941590048674, + "grad_norm": 0.5966786742210388, + "learning_rate": 1.673292794811792e-05, + "loss": 1.1985, + "mean_token_accuracy": 0.6661434173583984, + "num_tokens": 2854018449.0, + "step": 5581 + }, + { + "epoch": 1.509464575446187, + "grad_norm": 1.4822072982788086, + "learning_rate": 1.6731712591282673e-05, + "loss": 2.0684, + "mean_token_accuracy": 0.5345755219459534, + "num_tokens": 2854542728.0, + "step": 5582 + }, + { + "epoch": 1.5097349918875067, + "grad_norm": 1.3355761766433716, + "learning_rate": 1.6730497058581632e-05, + "loss": 1.9597, + "mean_token_accuracy": 0.548984706401825, + "num_tokens": 2855066924.0, + "step": 5583 + }, + { + "epoch": 1.5100054083288263, + "grad_norm": 1.1907057762145996, + "learning_rate": 1.672928135005209e-05, + "loss": 1.9492, + "mean_token_accuracy": 0.5574867725372314, + "num_tokens": 2855557076.0, + "step": 5584 + }, + { + "epoch": 1.510275824770146, + "grad_norm": 1.4442172050476074, + "learning_rate": 1.6728065465731343e-05, + "loss": 1.956, + "mean_token_accuracy": 0.5560322403907776, + "num_tokens": 2856033096.0, + "step": 5585 + }, + { + "epoch": 1.5105462412114656, + "grad_norm": 1.2646223306655884, + "learning_rate": 1.6726849405656707e-05, + "loss": 2.0099, + "mean_token_accuracy": 0.5527385473251343, + "num_tokens": 2856557365.0, + "step": 5586 + }, + { + "epoch": 1.5108166576527853, + "grad_norm": 1.6756476163864136, + "learning_rate": 1.6725633169865486e-05, + "loss": 2.0569, + "mean_token_accuracy": 0.5478768348693848, + "num_tokens": 2857080404.0, + "step": 5587 + }, + { + "epoch": 1.511087074094105, + "grad_norm": 1.2832738161087036, + "learning_rate": 1.6724416758395004e-05, + "loss": 1.9962, + "mean_token_accuracy": 0.5473552942276001, + "num_tokens": 2857604605.0, + "step": 5588 + }, + { + "epoch": 1.5113574905354246, + "grad_norm": 1.3094998598098755, + "learning_rate": 1.6723200171282577e-05, + "loss": 2.0321, + "mean_token_accuracy": 0.5446580648422241, + "num_tokens": 2858128856.0, + "step": 5589 + }, + { + "epoch": 1.5116279069767442, + "grad_norm": 1.566760540008545, + "learning_rate": 1.6721983408565543e-05, + "loss": 2.0038, + "mean_token_accuracy": 0.5465646982192993, + "num_tokens": 2858653060.0, + "step": 5590 + }, + { + "epoch": 1.5118983234180638, + "grad_norm": 1.067959189414978, + "learning_rate": 1.6720766470281228e-05, + "loss": 1.8857, + "mean_token_accuracy": 0.5660836696624756, + "num_tokens": 2859173849.0, + "step": 5591 + }, + { + "epoch": 1.5121687398593835, + "grad_norm": 1.3894639015197754, + "learning_rate": 1.6719549356466974e-05, + "loss": 1.9731, + "mean_token_accuracy": 0.5480022430419922, + "num_tokens": 2859680458.0, + "step": 5592 + }, + { + "epoch": 1.5124391563007031, + "grad_norm": 1.3567779064178467, + "learning_rate": 1.6718332067160127e-05, + "loss": 1.985, + "mean_token_accuracy": 0.5661638975143433, + "num_tokens": 2860106579.0, + "step": 5593 + }, + { + "epoch": 1.5127095727420228, + "grad_norm": 1.3888654708862305, + "learning_rate": 1.6717114602398036e-05, + "loss": 1.9131, + "mean_token_accuracy": 0.5770865678787231, + "num_tokens": 2860630735.0, + "step": 5594 + }, + { + "epoch": 1.5129799891833424, + "grad_norm": 1.1924935579299927, + "learning_rate": 1.6715896962218053e-05, + "loss": 2.0035, + "mean_token_accuracy": 0.563300371170044, + "num_tokens": 2861154768.0, + "step": 5595 + }, + { + "epoch": 1.513250405624662, + "grad_norm": 1.131592035293579, + "learning_rate": 1.6714679146657545e-05, + "loss": 2.0104, + "mean_token_accuracy": 0.5455007553100586, + "num_tokens": 2861678837.0, + "step": 5596 + }, + { + "epoch": 1.5135208220659817, + "grad_norm": 1.3642098903656006, + "learning_rate": 1.6713461155753874e-05, + "loss": 1.9602, + "mean_token_accuracy": 0.5523171424865723, + "num_tokens": 2862202927.0, + "step": 5597 + }, + { + "epoch": 1.5137912385073014, + "grad_norm": 1.2767424583435059, + "learning_rate": 1.6712242989544415e-05, + "loss": 2.0566, + "mean_token_accuracy": 0.5403203368186951, + "num_tokens": 2862700899.0, + "step": 5598 + }, + { + "epoch": 1.514061654948621, + "grad_norm": 1.3397085666656494, + "learning_rate": 1.6711024648066536e-05, + "loss": 2.0831, + "mean_token_accuracy": 0.5292093753814697, + "num_tokens": 2863135532.0, + "step": 5599 + }, + { + "epoch": 1.5143320713899406, + "grad_norm": 1.2534865140914917, + "learning_rate": 1.6709806131357633e-05, + "loss": 2.0501, + "mean_token_accuracy": 0.5326750874519348, + "num_tokens": 2863659696.0, + "step": 5600 + }, + { + "epoch": 1.5146024878312603, + "grad_norm": 0.6387252807617188, + "learning_rate": 1.670858743945508e-05, + "loss": 1.1203, + "mean_token_accuracy": 0.701903223991394, + "num_tokens": 2864183785.0, + "step": 5601 + }, + { + "epoch": 1.51487290427258, + "grad_norm": 1.929071068763733, + "learning_rate": 1.6707368572396285e-05, + "loss": 1.9707, + "mean_token_accuracy": 0.5632250308990479, + "num_tokens": 2864687257.0, + "step": 5602 + }, + { + "epoch": 1.5151433207138996, + "grad_norm": 1.2553930282592773, + "learning_rate": 1.670614953021863e-05, + "loss": 1.7908, + "mean_token_accuracy": 0.5929325222969055, + "num_tokens": 2865211406.0, + "step": 5603 + }, + { + "epoch": 1.515413737155219, + "grad_norm": 1.3884053230285645, + "learning_rate": 1.670493031295953e-05, + "loss": 1.8741, + "mean_token_accuracy": 0.5515398383140564, + "num_tokens": 2865735686.0, + "step": 5604 + }, + { + "epoch": 1.5156841535965386, + "grad_norm": 2.0189905166625977, + "learning_rate": 1.670371092065639e-05, + "loss": 1.9405, + "mean_token_accuracy": 0.5659623742103577, + "num_tokens": 2866259897.0, + "step": 5605 + }, + { + "epoch": 1.5159545700378583, + "grad_norm": 1.5794423818588257, + "learning_rate": 1.6702491353346625e-05, + "loss": 2.0312, + "mean_token_accuracy": 0.5474984645843506, + "num_tokens": 2866737501.0, + "step": 5606 + }, + { + "epoch": 1.516224986479178, + "grad_norm": 2.1437013149261475, + "learning_rate": 1.6701271611067657e-05, + "loss": 1.9595, + "mean_token_accuracy": 0.5578749179840088, + "num_tokens": 2867261766.0, + "step": 5607 + }, + { + "epoch": 1.5164954029204976, + "grad_norm": 1.7514622211456299, + "learning_rate": 1.670005169385691e-05, + "loss": 1.8741, + "mean_token_accuracy": 0.5511028170585632, + "num_tokens": 2867786000.0, + "step": 5608 + }, + { + "epoch": 1.5167658193618172, + "grad_norm": 1.546069860458374, + "learning_rate": 1.6698831601751808e-05, + "loss": 2.0325, + "mean_token_accuracy": 0.556174635887146, + "num_tokens": 2868216333.0, + "step": 5609 + }, + { + "epoch": 1.5170362358031368, + "grad_norm": 1.8566511869430542, + "learning_rate": 1.66976113347898e-05, + "loss": 1.8877, + "mean_token_accuracy": 0.57063227891922, + "num_tokens": 2868740599.0, + "step": 5610 + }, + { + "epoch": 1.5173066522444565, + "grad_norm": 1.994437336921692, + "learning_rate": 1.6696390893008318e-05, + "loss": 1.9823, + "mean_token_accuracy": 0.5619089603424072, + "num_tokens": 2869210815.0, + "step": 5611 + }, + { + "epoch": 1.5175770686857761, + "grad_norm": 1.607421636581421, + "learning_rate": 1.6695170276444812e-05, + "loss": 2.0821, + "mean_token_accuracy": 0.5363876819610596, + "num_tokens": 2869729752.0, + "step": 5612 + }, + { + "epoch": 1.5178474851270958, + "grad_norm": 1.5421181917190552, + "learning_rate": 1.6693949485136735e-05, + "loss": 1.9117, + "mean_token_accuracy": 0.5652785897254944, + "num_tokens": 2870254001.0, + "step": 5613 + }, + { + "epoch": 1.5181179015684152, + "grad_norm": 1.3241989612579346, + "learning_rate": 1.669272851912154e-05, + "loss": 1.862, + "mean_token_accuracy": 0.5688546895980835, + "num_tokens": 2870778197.0, + "step": 5614 + }, + { + "epoch": 1.5183883180097348, + "grad_norm": 1.5395584106445312, + "learning_rate": 1.6691507378436696e-05, + "loss": 1.921, + "mean_token_accuracy": 0.5728275179862976, + "num_tokens": 2871265971.0, + "step": 5615 + }, + { + "epoch": 1.5186587344510545, + "grad_norm": 1.5208392143249512, + "learning_rate": 1.669028606311967e-05, + "loss": 2.0292, + "mean_token_accuracy": 0.5516429543495178, + "num_tokens": 2871790146.0, + "step": 5616 + }, + { + "epoch": 1.5189291508923741, + "grad_norm": 1.6158403158187866, + "learning_rate": 1.6689064573207925e-05, + "loss": 1.9516, + "mean_token_accuracy": 0.5617004632949829, + "num_tokens": 2872293772.0, + "step": 5617 + }, + { + "epoch": 1.5191995673336938, + "grad_norm": 1.4417014122009277, + "learning_rate": 1.6687842908738957e-05, + "loss": 1.9565, + "mean_token_accuracy": 0.5488182306289673, + "num_tokens": 2872818029.0, + "step": 5618 + }, + { + "epoch": 1.5194699837750134, + "grad_norm": 1.5608067512512207, + "learning_rate": 1.6686621069750237e-05, + "loss": 1.9792, + "mean_token_accuracy": 0.5596462488174438, + "num_tokens": 2873342198.0, + "step": 5619 + }, + { + "epoch": 1.519740400216333, + "grad_norm": 1.3796393871307373, + "learning_rate": 1.668539905627926e-05, + "loss": 1.9279, + "mean_token_accuracy": 0.5627613067626953, + "num_tokens": 2873817083.0, + "step": 5620 + }, + { + "epoch": 1.5200108166576527, + "grad_norm": 0.7464931011199951, + "learning_rate": 1.668417686836352e-05, + "loss": 1.1656, + "mean_token_accuracy": 0.6955393552780151, + "num_tokens": 2874341363.0, + "step": 5621 + }, + { + "epoch": 1.5202812330989723, + "grad_norm": 2.204948902130127, + "learning_rate": 1.6682954506040522e-05, + "loss": 2.0559, + "mean_token_accuracy": 0.538682222366333, + "num_tokens": 2874859775.0, + "step": 5622 + }, + { + "epoch": 1.520551649540292, + "grad_norm": 1.7652100324630737, + "learning_rate": 1.6681731969347762e-05, + "loss": 1.9962, + "mean_token_accuracy": 0.5383172631263733, + "num_tokens": 2875384049.0, + "step": 5623 + }, + { + "epoch": 1.5208220659816116, + "grad_norm": 1.2668746709823608, + "learning_rate": 1.6680509258322757e-05, + "loss": 1.9146, + "mean_token_accuracy": 0.5684237480163574, + "num_tokens": 2875908245.0, + "step": 5624 + }, + { + "epoch": 1.5210924824229313, + "grad_norm": 1.5113410949707031, + "learning_rate": 1.6679286373003023e-05, + "loss": 2.0114, + "mean_token_accuracy": 0.5354057550430298, + "num_tokens": 2876432521.0, + "step": 5625 + }, + { + "epoch": 1.521362898864251, + "grad_norm": 1.4462478160858154, + "learning_rate": 1.667806331342608e-05, + "loss": 2.1183, + "mean_token_accuracy": 0.5429690480232239, + "num_tokens": 2876902574.0, + "step": 5626 + }, + { + "epoch": 1.5216333153055706, + "grad_norm": 1.350744605064392, + "learning_rate": 1.667684007962946e-05, + "loss": 1.9543, + "mean_token_accuracy": 0.5867842435836792, + "num_tokens": 2877426736.0, + "step": 5627 + }, + { + "epoch": 1.5219037317468902, + "grad_norm": 1.431664228439331, + "learning_rate": 1.6675616671650693e-05, + "loss": 2.0592, + "mean_token_accuracy": 0.5480155944824219, + "num_tokens": 2877950983.0, + "step": 5628 + }, + { + "epoch": 1.5221741481882098, + "grad_norm": 1.5196549892425537, + "learning_rate": 1.667439308952731e-05, + "loss": 2.0813, + "mean_token_accuracy": 0.5300106406211853, + "num_tokens": 2878475264.0, + "step": 5629 + }, + { + "epoch": 1.5224445646295295, + "grad_norm": 1.0900415182113647, + "learning_rate": 1.6673169333296863e-05, + "loss": 2.0591, + "mean_token_accuracy": 0.5506538152694702, + "num_tokens": 2878999546.0, + "step": 5630 + }, + { + "epoch": 1.5227149810708491, + "grad_norm": 1.298236608505249, + "learning_rate": 1.66719454029969e-05, + "loss": 2.0317, + "mean_token_accuracy": 0.5562983751296997, + "num_tokens": 2879473155.0, + "step": 5631 + }, + { + "epoch": 1.5229853975121688, + "grad_norm": 1.52981436252594, + "learning_rate": 1.667072129866497e-05, + "loss": 2.0883, + "mean_token_accuracy": 0.5463638305664062, + "num_tokens": 2879997425.0, + "step": 5632 + }, + { + "epoch": 1.5232558139534884, + "grad_norm": 1.2242079973220825, + "learning_rate": 1.6669497020338632e-05, + "loss": 1.96, + "mean_token_accuracy": 0.5393968820571899, + "num_tokens": 2880521665.0, + "step": 5633 + }, + { + "epoch": 1.523526230394808, + "grad_norm": 1.2624362707138062, + "learning_rate": 1.666827256805545e-05, + "loss": 2.0844, + "mean_token_accuracy": 0.52998948097229, + "num_tokens": 2881045877.0, + "step": 5634 + }, + { + "epoch": 1.5237966468361277, + "grad_norm": 1.3344297409057617, + "learning_rate": 1.6667047941853e-05, + "loss": 1.926, + "mean_token_accuracy": 0.5618398189544678, + "num_tokens": 2881527051.0, + "step": 5635 + }, + { + "epoch": 1.5240670632774473, + "grad_norm": 1.3437556028366089, + "learning_rate": 1.666582314176885e-05, + "loss": 2.013, + "mean_token_accuracy": 0.5581249594688416, + "num_tokens": 2882051314.0, + "step": 5636 + }, + { + "epoch": 1.524337479718767, + "grad_norm": 1.2469111680984497, + "learning_rate": 1.6664598167840592e-05, + "loss": 1.8839, + "mean_token_accuracy": 0.5658965110778809, + "num_tokens": 2882487917.0, + "step": 5637 + }, + { + "epoch": 1.5246078961600866, + "grad_norm": 1.6035789251327515, + "learning_rate": 1.6663373020105797e-05, + "loss": 2.0434, + "mean_token_accuracy": 0.5335325002670288, + "num_tokens": 2883012167.0, + "step": 5638 + }, + { + "epoch": 1.5248783126014063, + "grad_norm": 1.4107565879821777, + "learning_rate": 1.6662147698602064e-05, + "loss": 1.9641, + "mean_token_accuracy": 0.5560696125030518, + "num_tokens": 2883505734.0, + "step": 5639 + }, + { + "epoch": 1.525148729042726, + "grad_norm": 1.573962926864624, + "learning_rate": 1.6660922203366988e-05, + "loss": 2.0109, + "mean_token_accuracy": 0.5551830530166626, + "num_tokens": 2883981030.0, + "step": 5640 + }, + { + "epoch": 1.5254191454840456, + "grad_norm": 0.5880414247512817, + "learning_rate": 1.6659696534438174e-05, + "loss": 1.1648, + "mean_token_accuracy": 0.6795122623443604, + "num_tokens": 2884505251.0, + "step": 5641 + }, + { + "epoch": 1.5256895619253652, + "grad_norm": 1.852845311164856, + "learning_rate": 1.6658470691853223e-05, + "loss": 2.0094, + "mean_token_accuracy": 0.5265939831733704, + "num_tokens": 2885029447.0, + "step": 5642 + }, + { + "epoch": 1.5259599783666848, + "grad_norm": 1.6339843273162842, + "learning_rate": 1.6657244675649753e-05, + "loss": 1.996, + "mean_token_accuracy": 0.5475500226020813, + "num_tokens": 2885527437.0, + "step": 5643 + }, + { + "epoch": 1.5262303948080045, + "grad_norm": 1.3100744485855103, + "learning_rate": 1.665601848586538e-05, + "loss": 1.9776, + "mean_token_accuracy": 0.5595737099647522, + "num_tokens": 2886051697.0, + "step": 5644 + }, + { + "epoch": 1.526500811249324, + "grad_norm": 1.9560407400131226, + "learning_rate": 1.6654792122537726e-05, + "loss": 2.0986, + "mean_token_accuracy": 0.5450714826583862, + "num_tokens": 2886575751.0, + "step": 5645 + }, + { + "epoch": 1.5267712276906436, + "grad_norm": 1.5061770677566528, + "learning_rate": 1.6653565585704418e-05, + "loss": 1.8478, + "mean_token_accuracy": 0.6031826734542847, + "num_tokens": 2887099903.0, + "step": 5646 + }, + { + "epoch": 1.5270416441319632, + "grad_norm": 1.428073763847351, + "learning_rate": 1.665233887540309e-05, + "loss": 2.1129, + "mean_token_accuracy": 0.5363543629646301, + "num_tokens": 2887624175.0, + "step": 5647 + }, + { + "epoch": 1.5273120605732828, + "grad_norm": 1.6666768789291382, + "learning_rate": 1.6651111991671386e-05, + "loss": 2.0154, + "mean_token_accuracy": 0.5512552261352539, + "num_tokens": 2888148338.0, + "step": 5648 + }, + { + "epoch": 1.5275824770146025, + "grad_norm": 1.6189957857131958, + "learning_rate": 1.6649884934546945e-05, + "loss": 2.1088, + "mean_token_accuracy": 0.5265076160430908, + "num_tokens": 2888614719.0, + "step": 5649 + }, + { + "epoch": 1.5278528934559221, + "grad_norm": 1.4501886367797852, + "learning_rate": 1.6648657704067424e-05, + "loss": 2.0124, + "mean_token_accuracy": 0.5507364869117737, + "num_tokens": 2889110430.0, + "step": 5650 + }, + { + "epoch": 1.5281233098972418, + "grad_norm": 1.4137892723083496, + "learning_rate": 1.6647430300270467e-05, + "loss": 1.9779, + "mean_token_accuracy": 0.5579631328582764, + "num_tokens": 2889634674.0, + "step": 5651 + }, + { + "epoch": 1.5283937263385614, + "grad_norm": 1.6903562545776367, + "learning_rate": 1.6646202723193748e-05, + "loss": 2.0878, + "mean_token_accuracy": 0.5322582721710205, + "num_tokens": 2890097914.0, + "step": 5652 + }, + { + "epoch": 1.528664142779881, + "grad_norm": 1.1907105445861816, + "learning_rate": 1.6644974972874917e-05, + "loss": 1.9259, + "mean_token_accuracy": 0.5415500402450562, + "num_tokens": 2890622186.0, + "step": 5653 + }, + { + "epoch": 1.5289345592212007, + "grad_norm": 1.6803157329559326, + "learning_rate": 1.664374704935166e-05, + "loss": 2.1279, + "mean_token_accuracy": 0.5474065542221069, + "num_tokens": 2891084498.0, + "step": 5654 + }, + { + "epoch": 1.5292049756625201, + "grad_norm": 1.5871622562408447, + "learning_rate": 1.664251895266164e-05, + "loss": 1.966, + "mean_token_accuracy": 0.5621910095214844, + "num_tokens": 2891608737.0, + "step": 5655 + }, + { + "epoch": 1.5294753921038398, + "grad_norm": 1.3896253108978271, + "learning_rate": 1.6641290682842545e-05, + "loss": 1.9195, + "mean_token_accuracy": 0.5516920685768127, + "num_tokens": 2892132687.0, + "step": 5656 + }, + { + "epoch": 1.5297458085451594, + "grad_norm": 1.5656852722167969, + "learning_rate": 1.664006223993207e-05, + "loss": 2.0199, + "mean_token_accuracy": 0.5394531488418579, + "num_tokens": 2892656899.0, + "step": 5657 + }, + { + "epoch": 1.530016224986479, + "grad_norm": 1.0952681303024292, + "learning_rate": 1.663883362396789e-05, + "loss": 1.9944, + "mean_token_accuracy": 0.5481845140457153, + "num_tokens": 2893181039.0, + "step": 5658 + }, + { + "epoch": 1.5302866414277987, + "grad_norm": 1.6408356428146362, + "learning_rate": 1.6637604834987718e-05, + "loss": 2.0206, + "mean_token_accuracy": 0.5428683757781982, + "num_tokens": 2893680356.0, + "step": 5659 + }, + { + "epoch": 1.5305570578691183, + "grad_norm": 1.6175072193145752, + "learning_rate": 1.6636375873029246e-05, + "loss": 1.9348, + "mean_token_accuracy": 0.5641525387763977, + "num_tokens": 2894191961.0, + "step": 5660 + }, + { + "epoch": 1.530827474310438, + "grad_norm": 0.522071123123169, + "learning_rate": 1.663514673813019e-05, + "loss": 1.0246, + "mean_token_accuracy": 0.7359235286712646, + "num_tokens": 2894716104.0, + "step": 5661 + }, + { + "epoch": 1.5310978907517576, + "grad_norm": 2.3078179359436035, + "learning_rate": 1.663391743032826e-05, + "loss": 2.0053, + "mean_token_accuracy": 0.5514745116233826, + "num_tokens": 2895240317.0, + "step": 5662 + }, + { + "epoch": 1.5313683071930773, + "grad_norm": 1.9500842094421387, + "learning_rate": 1.6632687949661172e-05, + "loss": 2.023, + "mean_token_accuracy": 0.5519719123840332, + "num_tokens": 2895683218.0, + "step": 5663 + }, + { + "epoch": 1.531638723634397, + "grad_norm": 1.395110011100769, + "learning_rate": 1.6631458296166652e-05, + "loss": 1.9795, + "mean_token_accuracy": 0.5415670871734619, + "num_tokens": 2896207295.0, + "step": 5664 + }, + { + "epoch": 1.5319091400757165, + "grad_norm": 1.54004967212677, + "learning_rate": 1.6630228469882436e-05, + "loss": 1.8981, + "mean_token_accuracy": 0.5558478832244873, + "num_tokens": 2896731495.0, + "step": 5665 + }, + { + "epoch": 1.5321795565170362, + "grad_norm": 1.5964453220367432, + "learning_rate": 1.6628998470846253e-05, + "loss": 1.9683, + "mean_token_accuracy": 0.563345730304718, + "num_tokens": 2897255602.0, + "step": 5666 + }, + { + "epoch": 1.5324499729583558, + "grad_norm": 1.3257607221603394, + "learning_rate": 1.6627768299095837e-05, + "loss": 1.8802, + "mean_token_accuracy": 0.5687260627746582, + "num_tokens": 2897728320.0, + "step": 5667 + }, + { + "epoch": 1.5327203893996755, + "grad_norm": 2.101902484893799, + "learning_rate": 1.6626537954668943e-05, + "loss": 2.1072, + "mean_token_accuracy": 0.5330082774162292, + "num_tokens": 2898252594.0, + "step": 5668 + }, + { + "epoch": 1.5329908058409951, + "grad_norm": 1.3992856740951538, + "learning_rate": 1.662530743760332e-05, + "loss": 1.9849, + "mean_token_accuracy": 0.5462374687194824, + "num_tokens": 2898776830.0, + "step": 5669 + }, + { + "epoch": 1.5332612222823148, + "grad_norm": 1.9645233154296875, + "learning_rate": 1.662407674793672e-05, + "loss": 1.9179, + "mean_token_accuracy": 0.5662564039230347, + "num_tokens": 2899225883.0, + "step": 5670 + }, + { + "epoch": 1.5335316387236344, + "grad_norm": 2.3148062229156494, + "learning_rate": 1.6622845885706906e-05, + "loss": 2.1256, + "mean_token_accuracy": 0.5372812747955322, + "num_tokens": 2899750067.0, + "step": 5671 + }, + { + "epoch": 1.533802055164954, + "grad_norm": 1.457047700881958, + "learning_rate": 1.6621614850951645e-05, + "loss": 2.1403, + "mean_token_accuracy": 0.5182874202728271, + "num_tokens": 2900274330.0, + "step": 5672 + }, + { + "epoch": 1.5340724716062737, + "grad_norm": 1.6249080896377563, + "learning_rate": 1.6620383643708707e-05, + "loss": 1.9442, + "mean_token_accuracy": 0.5556988716125488, + "num_tokens": 2900798561.0, + "step": 5673 + }, + { + "epoch": 1.5343428880475933, + "grad_norm": 1.5044692754745483, + "learning_rate": 1.661915226401587e-05, + "loss": 1.9994, + "mean_token_accuracy": 0.5521262288093567, + "num_tokens": 2901267010.0, + "step": 5674 + }, + { + "epoch": 1.534613304488913, + "grad_norm": 1.2611016035079956, + "learning_rate": 1.6617920711910918e-05, + "loss": 1.82, + "mean_token_accuracy": 0.5801171660423279, + "num_tokens": 2901791217.0, + "step": 5675 + }, + { + "epoch": 1.5348837209302326, + "grad_norm": 1.449467420578003, + "learning_rate": 1.6616688987431637e-05, + "loss": 2.0246, + "mean_token_accuracy": 0.5661450028419495, + "num_tokens": 2902290283.0, + "step": 5676 + }, + { + "epoch": 1.5351541373715523, + "grad_norm": 1.2701890468597412, + "learning_rate": 1.6615457090615818e-05, + "loss": 1.9665, + "mean_token_accuracy": 0.5432645678520203, + "num_tokens": 2902814376.0, + "step": 5677 + }, + { + "epoch": 1.535424553812872, + "grad_norm": 1.5047978162765503, + "learning_rate": 1.6614225021501263e-05, + "loss": 1.8061, + "mean_token_accuracy": 0.5682864189147949, + "num_tokens": 2903338508.0, + "step": 5678 + }, + { + "epoch": 1.5356949702541915, + "grad_norm": 1.973540186882019, + "learning_rate": 1.6612992780125777e-05, + "loss": 2.1415, + "mean_token_accuracy": 0.5297253131866455, + "num_tokens": 2903846026.0, + "step": 5679 + }, + { + "epoch": 1.5359653866955112, + "grad_norm": 1.437843918800354, + "learning_rate": 1.6611760366527165e-05, + "loss": 1.9828, + "mean_token_accuracy": 0.5626049041748047, + "num_tokens": 2904308289.0, + "step": 5680 + }, + { + "epoch": 1.5362358031368308, + "grad_norm": 0.6874470710754395, + "learning_rate": 1.661052778074324e-05, + "loss": 1.1521, + "mean_token_accuracy": 0.6984987258911133, + "num_tokens": 2904824073.0, + "step": 5681 + }, + { + "epoch": 1.5365062195781505, + "grad_norm": 2.116091012954712, + "learning_rate": 1.660929502281182e-05, + "loss": 2.0315, + "mean_token_accuracy": 0.5248726606369019, + "num_tokens": 2905348276.0, + "step": 5682 + }, + { + "epoch": 1.5367766360194701, + "grad_norm": 1.961804747581482, + "learning_rate": 1.660806209277074e-05, + "loss": 2.1383, + "mean_token_accuracy": 0.5369668006896973, + "num_tokens": 2905872531.0, + "step": 5683 + }, + { + "epoch": 1.5370470524607898, + "grad_norm": 1.3712693452835083, + "learning_rate": 1.6606828990657817e-05, + "loss": 1.8658, + "mean_token_accuracy": 0.5880584120750427, + "num_tokens": 2906396698.0, + "step": 5684 + }, + { + "epoch": 1.5373174689021094, + "grad_norm": 1.6975610256195068, + "learning_rate": 1.6605595716510895e-05, + "loss": 1.9812, + "mean_token_accuracy": 0.5595057010650635, + "num_tokens": 2906893126.0, + "step": 5685 + }, + { + "epoch": 1.5375878853434288, + "grad_norm": 1.7196846008300781, + "learning_rate": 1.660436227036781e-05, + "loss": 2.0683, + "mean_token_accuracy": 0.5325862765312195, + "num_tokens": 2907390074.0, + "step": 5686 + }, + { + "epoch": 1.5378583017847485, + "grad_norm": 1.488571047782898, + "learning_rate": 1.660312865226641e-05, + "loss": 1.9771, + "mean_token_accuracy": 0.5510920882225037, + "num_tokens": 2907914320.0, + "step": 5687 + }, + { + "epoch": 1.5381287182260681, + "grad_norm": 1.7930172681808472, + "learning_rate": 1.6601894862244546e-05, + "loss": 2.0078, + "mean_token_accuracy": 0.5353765487670898, + "num_tokens": 2908438595.0, + "step": 5688 + }, + { + "epoch": 1.5383991346673878, + "grad_norm": 1.8401883840560913, + "learning_rate": 1.6600660900340074e-05, + "loss": 2.0269, + "mean_token_accuracy": 0.5402252078056335, + "num_tokens": 2908951676.0, + "step": 5689 + }, + { + "epoch": 1.5386695511087074, + "grad_norm": 1.3035091161727905, + "learning_rate": 1.659942676659085e-05, + "loss": 1.9955, + "mean_token_accuracy": 0.5448145270347595, + "num_tokens": 2909475859.0, + "step": 5690 + }, + { + "epoch": 1.538939967550027, + "grad_norm": 1.314832091331482, + "learning_rate": 1.659819246103475e-05, + "loss": 1.921, + "mean_token_accuracy": 0.5533236861228943, + "num_tokens": 2909999932.0, + "step": 5691 + }, + { + "epoch": 1.5392103839913467, + "grad_norm": 1.5155888795852661, + "learning_rate": 1.6596957983709644e-05, + "loss": 1.9908, + "mean_token_accuracy": 0.5356115102767944, + "num_tokens": 2910524112.0, + "step": 5692 + }, + { + "epoch": 1.5394808004326663, + "grad_norm": 1.3367986679077148, + "learning_rate": 1.65957233346534e-05, + "loss": 2.0498, + "mean_token_accuracy": 0.5349811315536499, + "num_tokens": 2911048376.0, + "step": 5693 + }, + { + "epoch": 1.539751216873986, + "grad_norm": 1.1868913173675537, + "learning_rate": 1.6594488513903914e-05, + "loss": 1.9422, + "mean_token_accuracy": 0.5618607997894287, + "num_tokens": 2911572550.0, + "step": 5694 + }, + { + "epoch": 1.5400216333153056, + "grad_norm": 1.5210648775100708, + "learning_rate": 1.6593253521499067e-05, + "loss": 2.0592, + "mean_token_accuracy": 0.5399571657180786, + "num_tokens": 2912045845.0, + "step": 5695 + }, + { + "epoch": 1.540292049756625, + "grad_norm": 1.2009197473526, + "learning_rate": 1.6592018357476756e-05, + "loss": 1.83, + "mean_token_accuracy": 0.5777978301048279, + "num_tokens": 2912533967.0, + "step": 5696 + }, + { + "epoch": 1.5405624661979447, + "grad_norm": 1.9035630226135254, + "learning_rate": 1.6590783021874874e-05, + "loss": 1.8344, + "mean_token_accuracy": 0.6019372344017029, + "num_tokens": 2913058210.0, + "step": 5697 + }, + { + "epoch": 1.5408328826392643, + "grad_norm": 1.1179941892623901, + "learning_rate": 1.6589547514731327e-05, + "loss": 1.9291, + "mean_token_accuracy": 0.5532751083374023, + "num_tokens": 2913582350.0, + "step": 5698 + }, + { + "epoch": 1.541103299080584, + "grad_norm": 1.5568435192108154, + "learning_rate": 1.6588311836084025e-05, + "loss": 1.9715, + "mean_token_accuracy": 0.5549181699752808, + "num_tokens": 2914106461.0, + "step": 5699 + }, + { + "epoch": 1.5413737155219036, + "grad_norm": 1.4504023790359497, + "learning_rate": 1.658707598597088e-05, + "loss": 2.1271, + "mean_token_accuracy": 0.5216094255447388, + "num_tokens": 2914630723.0, + "step": 5700 + }, + { + "epoch": 1.5416441319632233, + "grad_norm": 0.5659536123275757, + "learning_rate": 1.6585839964429816e-05, + "loss": 1.0577, + "mean_token_accuracy": 0.7132773399353027, + "num_tokens": 2915154977.0, + "step": 5701 + }, + { + "epoch": 1.541914548404543, + "grad_norm": 1.8928098678588867, + "learning_rate": 1.6584603771498756e-05, + "loss": 1.9692, + "mean_token_accuracy": 0.59865403175354, + "num_tokens": 2915614835.0, + "step": 5702 + }, + { + "epoch": 1.5421849648458625, + "grad_norm": 1.7921613454818726, + "learning_rate": 1.6583367407215624e-05, + "loss": 1.956, + "mean_token_accuracy": 0.5616089105606079, + "num_tokens": 2916138995.0, + "step": 5703 + }, + { + "epoch": 1.5424553812871822, + "grad_norm": 1.3317354917526245, + "learning_rate": 1.6582130871618367e-05, + "loss": 2.0595, + "mean_token_accuracy": 0.5482000112533569, + "num_tokens": 2916607191.0, + "step": 5704 + }, + { + "epoch": 1.5427257977285018, + "grad_norm": 1.65413498878479, + "learning_rate": 1.6580894164744912e-05, + "loss": 2.0574, + "mean_token_accuracy": 0.5464001893997192, + "num_tokens": 2917131321.0, + "step": 5705 + }, + { + "epoch": 1.5429962141698215, + "grad_norm": 4.53463077545166, + "learning_rate": 1.6579657286633217e-05, + "loss": 1.7195, + "mean_token_accuracy": 0.6170926094055176, + "num_tokens": 2917655591.0, + "step": 5706 + }, + { + "epoch": 1.543266630611141, + "grad_norm": 1.7845097780227661, + "learning_rate": 1.6578420237321225e-05, + "loss": 2.0548, + "mean_token_accuracy": 0.5905623435974121, + "num_tokens": 2918055901.0, + "step": 5707 + }, + { + "epoch": 1.5435370470524608, + "grad_norm": 1.8733258247375488, + "learning_rate": 1.6577183016846897e-05, + "loss": 1.958, + "mean_token_accuracy": 0.5328652858734131, + "num_tokens": 2918580051.0, + "step": 5708 + }, + { + "epoch": 1.5438074634937804, + "grad_norm": 1.5679751634597778, + "learning_rate": 1.657594562524819e-05, + "loss": 1.936, + "mean_token_accuracy": 0.5477531552314758, + "num_tokens": 2919104235.0, + "step": 5709 + }, + { + "epoch": 1.5440778799351, + "grad_norm": 1.342112421989441, + "learning_rate": 1.6574708062563078e-05, + "loss": 1.9427, + "mean_token_accuracy": 0.5565556883811951, + "num_tokens": 2919628371.0, + "step": 5710 + }, + { + "epoch": 1.5443482963764197, + "grad_norm": 1.4301023483276367, + "learning_rate": 1.6573470328829525e-05, + "loss": 2.0127, + "mean_token_accuracy": 0.5335530042648315, + "num_tokens": 2920152639.0, + "step": 5711 + }, + { + "epoch": 1.5446187128177393, + "grad_norm": 1.3370989561080933, + "learning_rate": 1.657223242408551e-05, + "loss": 1.94, + "mean_token_accuracy": 0.5651747584342957, + "num_tokens": 2920670776.0, + "step": 5712 + }, + { + "epoch": 1.544889129259059, + "grad_norm": 1.4463770389556885, + "learning_rate": 1.657099434836902e-05, + "loss": 2.0695, + "mean_token_accuracy": 0.5115598440170288, + "num_tokens": 2921194996.0, + "step": 5713 + }, + { + "epoch": 1.5451595457003786, + "grad_norm": 1.624979019165039, + "learning_rate": 1.6569756101718043e-05, + "loss": 2.1253, + "mean_token_accuracy": 0.5392510890960693, + "num_tokens": 2921719253.0, + "step": 5714 + }, + { + "epoch": 1.5454299621416983, + "grad_norm": 1.4553618431091309, + "learning_rate": 1.6568517684170564e-05, + "loss": 2.0387, + "mean_token_accuracy": 0.5373174548149109, + "num_tokens": 2922243477.0, + "step": 5715 + }, + { + "epoch": 1.545700378583018, + "grad_norm": 1.3929420709609985, + "learning_rate": 1.656727909576459e-05, + "loss": 2.0179, + "mean_token_accuracy": 0.5445104241371155, + "num_tokens": 2922767673.0, + "step": 5716 + }, + { + "epoch": 1.5459707950243375, + "grad_norm": 1.5616672039031982, + "learning_rate": 1.6566040336538116e-05, + "loss": 2.0648, + "mean_token_accuracy": 0.547921359539032, + "num_tokens": 2923263257.0, + "step": 5717 + }, + { + "epoch": 1.5462412114656572, + "grad_norm": 1.2964202165603638, + "learning_rate": 1.656480140652916e-05, + "loss": 2.0436, + "mean_token_accuracy": 0.5397332906723022, + "num_tokens": 2923787433.0, + "step": 5718 + }, + { + "epoch": 1.5465116279069768, + "grad_norm": 1.3172537088394165, + "learning_rate": 1.6563562305775734e-05, + "loss": 2.0402, + "mean_token_accuracy": 0.5465317964553833, + "num_tokens": 2924311532.0, + "step": 5719 + }, + { + "epoch": 1.5467820443482965, + "grad_norm": 1.3855830430984497, + "learning_rate": 1.656232303431585e-05, + "loss": 1.8839, + "mean_token_accuracy": 0.5534689426422119, + "num_tokens": 2924813204.0, + "step": 5720 + }, + { + "epoch": 1.5470524607896161, + "grad_norm": 0.7109562158584595, + "learning_rate": 1.656108359218754e-05, + "loss": 1.2115, + "mean_token_accuracy": 0.6749173402786255, + "num_tokens": 2925337485.0, + "step": 5721 + }, + { + "epoch": 1.5473228772309358, + "grad_norm": 1.9058517217636108, + "learning_rate": 1.6559843979428833e-05, + "loss": 2.0132, + "mean_token_accuracy": 0.5572602152824402, + "num_tokens": 2925861668.0, + "step": 5722 + }, + { + "epoch": 1.5475932936722554, + "grad_norm": 1.726393699645996, + "learning_rate": 1.6558604196077758e-05, + "loss": 1.9631, + "mean_token_accuracy": 0.5620636940002441, + "num_tokens": 2926385869.0, + "step": 5723 + }, + { + "epoch": 1.547863710113575, + "grad_norm": 1.8677047491073608, + "learning_rate": 1.655736424217236e-05, + "loss": 1.737, + "mean_token_accuracy": 0.6099241971969604, + "num_tokens": 2926910030.0, + "step": 5724 + }, + { + "epoch": 1.5481341265548947, + "grad_norm": 1.551262378692627, + "learning_rate": 1.655612411775069e-05, + "loss": 1.9354, + "mean_token_accuracy": 0.551494300365448, + "num_tokens": 2927434292.0, + "step": 5725 + }, + { + "epoch": 1.5484045429962143, + "grad_norm": 1.2946867942810059, + "learning_rate": 1.6554883822850787e-05, + "loss": 1.9972, + "mean_token_accuracy": 0.5424016118049622, + "num_tokens": 2927958524.0, + "step": 5726 + }, + { + "epoch": 1.5486749594375337, + "grad_norm": 1.2436802387237549, + "learning_rate": 1.6553643357510718e-05, + "loss": 1.9385, + "mean_token_accuracy": 0.5698560476303101, + "num_tokens": 2928482661.0, + "step": 5727 + }, + { + "epoch": 1.5489453758788534, + "grad_norm": 1.5134140253067017, + "learning_rate": 1.655240272176853e-05, + "loss": 1.9657, + "mean_token_accuracy": 0.5474262833595276, + "num_tokens": 2929006851.0, + "step": 5728 + }, + { + "epoch": 1.549215792320173, + "grad_norm": 1.4003981351852417, + "learning_rate": 1.6551161915662303e-05, + "loss": 1.9619, + "mean_token_accuracy": 0.5676764249801636, + "num_tokens": 2929388690.0, + "step": 5729 + }, + { + "epoch": 1.5494862087614927, + "grad_norm": 1.5858627557754517, + "learning_rate": 1.6549920939230102e-05, + "loss": 2.1347, + "mean_token_accuracy": 0.5316094160079956, + "num_tokens": 2929912953.0, + "step": 5730 + }, + { + "epoch": 1.5497566252028123, + "grad_norm": 1.7117180824279785, + "learning_rate": 1.6548679792510008e-05, + "loss": 2.0178, + "mean_token_accuracy": 0.5441310405731201, + "num_tokens": 2930394990.0, + "step": 5731 + }, + { + "epoch": 1.550027041644132, + "grad_norm": 1.1908036470413208, + "learning_rate": 1.65474384755401e-05, + "loss": 1.9925, + "mean_token_accuracy": 0.5595259666442871, + "num_tokens": 2930842136.0, + "step": 5732 + }, + { + "epoch": 1.5502974580854516, + "grad_norm": 1.7535065412521362, + "learning_rate": 1.654619698835846e-05, + "loss": 1.9702, + "mean_token_accuracy": 0.56284499168396, + "num_tokens": 2931314400.0, + "step": 5733 + }, + { + "epoch": 1.5505678745267713, + "grad_norm": 1.4282394647598267, + "learning_rate": 1.654495533100319e-05, + "loss": 2.0057, + "mean_token_accuracy": 0.5570434927940369, + "num_tokens": 2931800635.0, + "step": 5734 + }, + { + "epoch": 1.550838290968091, + "grad_norm": 1.2318733930587769, + "learning_rate": 1.6543713503512387e-05, + "loss": 1.9953, + "mean_token_accuracy": 0.5538707375526428, + "num_tokens": 2932324845.0, + "step": 5735 + }, + { + "epoch": 1.5511087074094105, + "grad_norm": 1.3190429210662842, + "learning_rate": 1.6542471505924143e-05, + "loss": 2.0746, + "mean_token_accuracy": 0.5622670650482178, + "num_tokens": 2932748707.0, + "step": 5736 + }, + { + "epoch": 1.5513791238507302, + "grad_norm": 1.48798668384552, + "learning_rate": 1.6541229338276583e-05, + "loss": 1.9891, + "mean_token_accuracy": 0.5599125027656555, + "num_tokens": 2933261156.0, + "step": 5737 + }, + { + "epoch": 1.5516495402920496, + "grad_norm": 1.2303824424743652, + "learning_rate": 1.6539987000607806e-05, + "loss": 2.0332, + "mean_token_accuracy": 0.5413992404937744, + "num_tokens": 2933726197.0, + "step": 5738 + }, + { + "epoch": 1.5519199567333692, + "grad_norm": 1.4697057008743286, + "learning_rate": 1.6538744492955935e-05, + "loss": 2.0672, + "mean_token_accuracy": 0.5418933629989624, + "num_tokens": 2934250285.0, + "step": 5739 + }, + { + "epoch": 1.5521903731746889, + "grad_norm": 1.5583535432815552, + "learning_rate": 1.6537501815359097e-05, + "loss": 2.1165, + "mean_token_accuracy": 0.5448890328407288, + "num_tokens": 2934774539.0, + "step": 5740 + }, + { + "epoch": 1.5524607896160085, + "grad_norm": 0.6479580402374268, + "learning_rate": 1.653625896785542e-05, + "loss": 1.1342, + "mean_token_accuracy": 0.6859784722328186, + "num_tokens": 2935298788.0, + "step": 5741 + }, + { + "epoch": 1.5527312060573282, + "grad_norm": 2.5138161182403564, + "learning_rate": 1.6535015950483034e-05, + "loss": 2.056, + "mean_token_accuracy": 0.5521365404129028, + "num_tokens": 2935758730.0, + "step": 5742 + }, + { + "epoch": 1.5530016224986478, + "grad_norm": 1.8368669748306274, + "learning_rate": 1.6533772763280082e-05, + "loss": 1.9784, + "mean_token_accuracy": 0.5681489706039429, + "num_tokens": 2936167501.0, + "step": 5743 + }, + { + "epoch": 1.5532720389399675, + "grad_norm": 1.3388644456863403, + "learning_rate": 1.6532529406284704e-05, + "loss": 1.911, + "mean_token_accuracy": 0.5625331401824951, + "num_tokens": 2936691564.0, + "step": 5744 + }, + { + "epoch": 1.553542455381287, + "grad_norm": 1.485142469406128, + "learning_rate": 1.6531285879535063e-05, + "loss": 1.8524, + "mean_token_accuracy": 0.5651295185089111, + "num_tokens": 2937215760.0, + "step": 5745 + }, + { + "epoch": 1.5538128718226067, + "grad_norm": 1.7169727087020874, + "learning_rate": 1.6530042183069296e-05, + "loss": 1.7901, + "mean_token_accuracy": 0.600847065448761, + "num_tokens": 2937739876.0, + "step": 5746 + }, + { + "epoch": 1.5540832882639264, + "grad_norm": 1.685551404953003, + "learning_rate": 1.6528798316925578e-05, + "loss": 1.9767, + "mean_token_accuracy": 0.550478458404541, + "num_tokens": 2938264016.0, + "step": 5747 + }, + { + "epoch": 1.554353704705246, + "grad_norm": 1.7390468120574951, + "learning_rate": 1.6527554281142065e-05, + "loss": 2.0273, + "mean_token_accuracy": 0.5301105380058289, + "num_tokens": 2938788153.0, + "step": 5748 + }, + { + "epoch": 1.5546241211465657, + "grad_norm": 1.434404969215393, + "learning_rate": 1.652631007575693e-05, + "loss": 1.9388, + "mean_token_accuracy": 0.5496108531951904, + "num_tokens": 2939312385.0, + "step": 5749 + }, + { + "epoch": 1.5548945375878853, + "grad_norm": 1.6291048526763916, + "learning_rate": 1.6525065700808355e-05, + "loss": 1.833, + "mean_token_accuracy": 0.577206015586853, + "num_tokens": 2939836589.0, + "step": 5750 + }, + { + "epoch": 1.555164954029205, + "grad_norm": 1.5132187604904175, + "learning_rate": 1.652382115633451e-05, + "loss": 1.8025, + "mean_token_accuracy": 0.5814645290374756, + "num_tokens": 2940360832.0, + "step": 5751 + }, + { + "epoch": 1.5554353704705246, + "grad_norm": 1.6196680068969727, + "learning_rate": 1.652257644237359e-05, + "loss": 1.9249, + "mean_token_accuracy": 0.5480997562408447, + "num_tokens": 2940885112.0, + "step": 5752 + }, + { + "epoch": 1.5557057869118442, + "grad_norm": 1.9164953231811523, + "learning_rate": 1.6521331558963783e-05, + "loss": 2.0337, + "mean_token_accuracy": 0.5497878789901733, + "num_tokens": 2941409333.0, + "step": 5753 + }, + { + "epoch": 1.555976203353164, + "grad_norm": 1.402040958404541, + "learning_rate": 1.652008650614329e-05, + "loss": 2.0618, + "mean_token_accuracy": 0.5380396246910095, + "num_tokens": 2941933470.0, + "step": 5754 + }, + { + "epoch": 1.5562466197944835, + "grad_norm": 1.9451007843017578, + "learning_rate": 1.6518841283950307e-05, + "loss": 2.0394, + "mean_token_accuracy": 0.5471988320350647, + "num_tokens": 2942439882.0, + "step": 5755 + }, + { + "epoch": 1.5565170362358032, + "grad_norm": 2.490036964416504, + "learning_rate": 1.6517595892423043e-05, + "loss": 1.9481, + "mean_token_accuracy": 0.5437355041503906, + "num_tokens": 2942964143.0, + "step": 5756 + }, + { + "epoch": 1.5567874526771228, + "grad_norm": 1.450219750404358, + "learning_rate": 1.6516350331599714e-05, + "loss": 2.035, + "mean_token_accuracy": 0.5573549270629883, + "num_tokens": 2943488422.0, + "step": 5757 + }, + { + "epoch": 1.5570578691184425, + "grad_norm": 1.415331482887268, + "learning_rate": 1.651510460151853e-05, + "loss": 2.091, + "mean_token_accuracy": 0.52936851978302, + "num_tokens": 2944012684.0, + "step": 5758 + }, + { + "epoch": 1.557328285559762, + "grad_norm": 1.4621490240097046, + "learning_rate": 1.6513858702217723e-05, + "loss": 2.0007, + "mean_token_accuracy": 0.5469667911529541, + "num_tokens": 2944536915.0, + "step": 5759 + }, + { + "epoch": 1.5575987020010817, + "grad_norm": 1.2421512603759766, + "learning_rate": 1.651261263373551e-05, + "loss": 1.9498, + "mean_token_accuracy": 0.5589205026626587, + "num_tokens": 2945061099.0, + "step": 5760 + }, + { + "epoch": 1.5578691184424014, + "grad_norm": 0.9567451477050781, + "learning_rate": 1.6511366396110133e-05, + "loss": 1.1502, + "mean_token_accuracy": 0.6909583806991577, + "num_tokens": 2945567549.0, + "step": 5761 + }, + { + "epoch": 1.558139534883721, + "grad_norm": 2.2203567028045654, + "learning_rate": 1.6510119989379827e-05, + "loss": 2.1274, + "mean_token_accuracy": 0.5263446569442749, + "num_tokens": 2946091808.0, + "step": 5762 + }, + { + "epoch": 1.5584099513250407, + "grad_norm": 1.7504255771636963, + "learning_rate": 1.6508873413582837e-05, + "loss": 1.9707, + "mean_token_accuracy": 0.5612068176269531, + "num_tokens": 2946616088.0, + "step": 5763 + }, + { + "epoch": 1.5586803677663603, + "grad_norm": 1.2090091705322266, + "learning_rate": 1.6507626668757407e-05, + "loss": 1.968, + "mean_token_accuracy": 0.5664063692092896, + "num_tokens": 2947140339.0, + "step": 5764 + }, + { + "epoch": 1.55895078420768, + "grad_norm": 1.6393096446990967, + "learning_rate": 1.6506379754941795e-05, + "loss": 2.0921, + "mean_token_accuracy": 0.5515651702880859, + "num_tokens": 2947607885.0, + "step": 5765 + }, + { + "epoch": 1.5592212006489996, + "grad_norm": 1.491387963294983, + "learning_rate": 1.650513267217426e-05, + "loss": 2.0278, + "mean_token_accuracy": 0.5426004528999329, + "num_tokens": 2948132095.0, + "step": 5766 + }, + { + "epoch": 1.5594916170903192, + "grad_norm": 1.151332974433899, + "learning_rate": 1.6503885420493066e-05, + "loss": 2.0614, + "mean_token_accuracy": 0.5146066546440125, + "num_tokens": 2948656246.0, + "step": 5767 + }, + { + "epoch": 1.5597620335316387, + "grad_norm": 1.472933053970337, + "learning_rate": 1.650263799993648e-05, + "loss": 1.9466, + "mean_token_accuracy": 0.5592334270477295, + "num_tokens": 2949180480.0, + "step": 5768 + }, + { + "epoch": 1.5600324499729583, + "grad_norm": 1.1899456977844238, + "learning_rate": 1.650139041054278e-05, + "loss": 1.9972, + "mean_token_accuracy": 0.5325203537940979, + "num_tokens": 2949704666.0, + "step": 5769 + }, + { + "epoch": 1.560302866414278, + "grad_norm": 1.1517516374588013, + "learning_rate": 1.6500142652350237e-05, + "loss": 2.024, + "mean_token_accuracy": 0.5426946878433228, + "num_tokens": 2950228810.0, + "step": 5770 + }, + { + "epoch": 1.5605732828555976, + "grad_norm": 1.409216046333313, + "learning_rate": 1.6498894725397148e-05, + "loss": 1.998, + "mean_token_accuracy": 0.5439612865447998, + "num_tokens": 2950752965.0, + "step": 5771 + }, + { + "epoch": 1.5608436992969172, + "grad_norm": 1.2399059534072876, + "learning_rate": 1.6497646629721798e-05, + "loss": 2.0645, + "mean_token_accuracy": 0.5318861603736877, + "num_tokens": 2951221590.0, + "step": 5772 + }, + { + "epoch": 1.5611141157382369, + "grad_norm": 1.647643804550171, + "learning_rate": 1.649639836536248e-05, + "loss": 2.0029, + "mean_token_accuracy": 0.5607843399047852, + "num_tokens": 2951664328.0, + "step": 5773 + }, + { + "epoch": 1.5613845321795565, + "grad_norm": 1.6702830791473389, + "learning_rate": 1.6495149932357498e-05, + "loss": 2.0918, + "mean_token_accuracy": 0.5457198619842529, + "num_tokens": 2952134065.0, + "step": 5774 + }, + { + "epoch": 1.5616549486208762, + "grad_norm": 1.2677807807922363, + "learning_rate": 1.6493901330745154e-05, + "loss": 2.077, + "mean_token_accuracy": 0.5538809299468994, + "num_tokens": 2952620189.0, + "step": 5775 + }, + { + "epoch": 1.5619253650621958, + "grad_norm": 1.4680399894714355, + "learning_rate": 1.649265256056376e-05, + "loss": 1.9045, + "mean_token_accuracy": 0.5497084856033325, + "num_tokens": 2953144397.0, + "step": 5776 + }, + { + "epoch": 1.5621957815035155, + "grad_norm": 1.727916955947876, + "learning_rate": 1.6491403621851635e-05, + "loss": 2.0583, + "mean_token_accuracy": 0.5421488285064697, + "num_tokens": 2953668567.0, + "step": 5777 + }, + { + "epoch": 1.562466197944835, + "grad_norm": 1.0955926179885864, + "learning_rate": 1.6490154514647093e-05, + "loss": 1.9163, + "mean_token_accuracy": 0.55596923828125, + "num_tokens": 2954192736.0, + "step": 5778 + }, + { + "epoch": 1.5627366143861545, + "grad_norm": 1.3173024654388428, + "learning_rate": 1.6488905238988473e-05, + "loss": 1.9353, + "mean_token_accuracy": 0.5478674173355103, + "num_tokens": 2954716960.0, + "step": 5779 + }, + { + "epoch": 1.5630070308274742, + "grad_norm": 1.4884155988693237, + "learning_rate": 1.6487655794914093e-05, + "loss": 2.0423, + "mean_token_accuracy": 0.5378021001815796, + "num_tokens": 2955241118.0, + "step": 5780 + }, + { + "epoch": 1.5632774472687938, + "grad_norm": 0.5868819952011108, + "learning_rate": 1.6486406182462296e-05, + "loss": 1.1372, + "mean_token_accuracy": 0.7010049819946289, + "num_tokens": 2955758607.0, + "step": 5781 + }, + { + "epoch": 1.5635478637101135, + "grad_norm": 2.327768325805664, + "learning_rate": 1.6485156401671422e-05, + "loss": 1.9637, + "mean_token_accuracy": 0.5437424182891846, + "num_tokens": 2956282855.0, + "step": 5782 + }, + { + "epoch": 1.563818280151433, + "grad_norm": 1.6397411823272705, + "learning_rate": 1.648390645257982e-05, + "loss": 1.8963, + "mean_token_accuracy": 0.5661869049072266, + "num_tokens": 2956807120.0, + "step": 5783 + }, + { + "epoch": 1.5640886965927527, + "grad_norm": 1.27349853515625, + "learning_rate": 1.648265633522584e-05, + "loss": 2.0318, + "mean_token_accuracy": 0.5571004152297974, + "num_tokens": 2957331221.0, + "step": 5784 + }, + { + "epoch": 1.5643591130340724, + "grad_norm": 1.5780491828918457, + "learning_rate": 1.6481406049647844e-05, + "loss": 1.9941, + "mean_token_accuracy": 0.53636634349823, + "num_tokens": 2957855430.0, + "step": 5785 + }, + { + "epoch": 1.564629529475392, + "grad_norm": 1.1779735088348389, + "learning_rate": 1.6480155595884187e-05, + "loss": 1.9404, + "mean_token_accuracy": 0.5532602071762085, + "num_tokens": 2958379693.0, + "step": 5786 + }, + { + "epoch": 1.5648999459167117, + "grad_norm": 1.633583664894104, + "learning_rate": 1.647890497397324e-05, + "loss": 1.9823, + "mean_token_accuracy": 0.5561284422874451, + "num_tokens": 2958903729.0, + "step": 5787 + }, + { + "epoch": 1.5651703623580313, + "grad_norm": 1.2169333696365356, + "learning_rate": 1.6477654183953375e-05, + "loss": 1.9813, + "mean_token_accuracy": 0.5482251644134521, + "num_tokens": 2959415846.0, + "step": 5788 + }, + { + "epoch": 1.565440778799351, + "grad_norm": 1.3577306270599365, + "learning_rate": 1.6476403225862974e-05, + "loss": 1.8852, + "mean_token_accuracy": 0.5641309022903442, + "num_tokens": 2959849726.0, + "step": 5789 + }, + { + "epoch": 1.5657111952406706, + "grad_norm": 1.415310263633728, + "learning_rate": 1.6475152099740416e-05, + "loss": 1.9686, + "mean_token_accuracy": 0.5576688051223755, + "num_tokens": 2960373925.0, + "step": 5790 + }, + { + "epoch": 1.5659816116819902, + "grad_norm": 1.047950267791748, + "learning_rate": 1.647390080562409e-05, + "loss": 1.9188, + "mean_token_accuracy": 0.5515202283859253, + "num_tokens": 2960897970.0, + "step": 5791 + }, + { + "epoch": 1.5662520281233099, + "grad_norm": 1.2048726081848145, + "learning_rate": 1.647264934355239e-05, + "loss": 2.0194, + "mean_token_accuracy": 0.5542887449264526, + "num_tokens": 2961392238.0, + "step": 5792 + }, + { + "epoch": 1.5665224445646295, + "grad_norm": 1.0703284740447998, + "learning_rate": 1.6471397713563714e-05, + "loss": 2.0475, + "mean_token_accuracy": 0.5583656430244446, + "num_tokens": 2961916449.0, + "step": 5793 + }, + { + "epoch": 1.5667928610059492, + "grad_norm": 1.3065133094787598, + "learning_rate": 1.6470145915696463e-05, + "loss": 1.9553, + "mean_token_accuracy": 0.5436665415763855, + "num_tokens": 2962440716.0, + "step": 5794 + }, + { + "epoch": 1.5670632774472688, + "grad_norm": 1.2917377948760986, + "learning_rate": 1.646889394998905e-05, + "loss": 2.056, + "mean_token_accuracy": 0.5464879274368286, + "num_tokens": 2962945399.0, + "step": 5795 + }, + { + "epoch": 1.5673336938885885, + "grad_norm": 1.0457017421722412, + "learning_rate": 1.646764181647989e-05, + "loss": 1.8959, + "mean_token_accuracy": 0.5592255592346191, + "num_tokens": 2963469672.0, + "step": 5796 + }, + { + "epoch": 1.567604110329908, + "grad_norm": 1.2578338384628296, + "learning_rate": 1.64663895152074e-05, + "loss": 1.9846, + "mean_token_accuracy": 0.5731261968612671, + "num_tokens": 2963978348.0, + "step": 5797 + }, + { + "epoch": 1.5678745267712277, + "grad_norm": 1.165611982345581, + "learning_rate": 1.646513704621e-05, + "loss": 2.0064, + "mean_token_accuracy": 0.5480995178222656, + "num_tokens": 2964502594.0, + "step": 5798 + }, + { + "epoch": 1.5681449432125474, + "grad_norm": 1.2698628902435303, + "learning_rate": 1.6463884409526128e-05, + "loss": 1.9061, + "mean_token_accuracy": 0.5557737350463867, + "num_tokens": 2965026857.0, + "step": 5799 + }, + { + "epoch": 1.568415359653867, + "grad_norm": 1.115570306777954, + "learning_rate": 1.6462631605194214e-05, + "loss": 1.9898, + "mean_token_accuracy": 0.5390989780426025, + "num_tokens": 2965536460.0, + "step": 5800 + }, + { + "epoch": 1.5686857760951867, + "grad_norm": 0.5518254041671753, + "learning_rate": 1.6461378633252695e-05, + "loss": 1.1474, + "mean_token_accuracy": 0.7114754915237427, + "num_tokens": 2966060720.0, + "step": 5801 + }, + { + "epoch": 1.5689561925365063, + "grad_norm": 2.6568078994750977, + "learning_rate": 1.646012549374002e-05, + "loss": 2.1049, + "mean_token_accuracy": 0.52726149559021, + "num_tokens": 2966584978.0, + "step": 5802 + }, + { + "epoch": 1.569226608977826, + "grad_norm": 2.1849560737609863, + "learning_rate": 1.6458872186694636e-05, + "loss": 1.9587, + "mean_token_accuracy": 0.5769098401069641, + "num_tokens": 2967101154.0, + "step": 5803 + }, + { + "epoch": 1.5694970254191456, + "grad_norm": 1.4633128643035889, + "learning_rate": 1.6457618712155e-05, + "loss": 1.9398, + "mean_token_accuracy": 0.5652504563331604, + "num_tokens": 2967625432.0, + "step": 5804 + }, + { + "epoch": 1.5697674418604652, + "grad_norm": 1.5942103862762451, + "learning_rate": 1.6456365070159578e-05, + "loss": 2.0338, + "mean_token_accuracy": 0.5354956388473511, + "num_tokens": 2968149659.0, + "step": 5805 + }, + { + "epoch": 1.5700378583017849, + "grad_norm": 1.6526769399642944, + "learning_rate": 1.6455111260746822e-05, + "loss": 1.9926, + "mean_token_accuracy": 0.5359820127487183, + "num_tokens": 2968673790.0, + "step": 5806 + }, + { + "epoch": 1.5703082747431045, + "grad_norm": 1.3279004096984863, + "learning_rate": 1.6453857283955216e-05, + "loss": 1.8895, + "mean_token_accuracy": 0.556369960308075, + "num_tokens": 2969198032.0, + "step": 5807 + }, + { + "epoch": 1.5705786911844242, + "grad_norm": 1.5674755573272705, + "learning_rate": 1.6452603139823226e-05, + "loss": 2.0392, + "mean_token_accuracy": 0.5657041072845459, + "num_tokens": 2969640626.0, + "step": 5808 + }, + { + "epoch": 1.5708491076257436, + "grad_norm": 1.4103584289550781, + "learning_rate": 1.6451348828389336e-05, + "loss": 1.966, + "mean_token_accuracy": 0.5562662482261658, + "num_tokens": 2970164681.0, + "step": 5809 + }, + { + "epoch": 1.5711195240670632, + "grad_norm": 1.5292516946792603, + "learning_rate": 1.6450094349692034e-05, + "loss": 2.0091, + "mean_token_accuracy": 0.5455702543258667, + "num_tokens": 2970688892.0, + "step": 5810 + }, + { + "epoch": 1.5713899405083829, + "grad_norm": 1.519587516784668, + "learning_rate": 1.6448839703769807e-05, + "loss": 1.957, + "mean_token_accuracy": 0.5314123034477234, + "num_tokens": 2971213172.0, + "step": 5811 + }, + { + "epoch": 1.5716603569497025, + "grad_norm": 1.2787529230117798, + "learning_rate": 1.644758489066116e-05, + "loss": 1.9263, + "mean_token_accuracy": 0.5491591691970825, + "num_tokens": 2971737426.0, + "step": 5812 + }, + { + "epoch": 1.5719307733910222, + "grad_norm": 1.0423920154571533, + "learning_rate": 1.6446329910404583e-05, + "loss": 2.0591, + "mean_token_accuracy": 0.5462088584899902, + "num_tokens": 2972252576.0, + "step": 5813 + }, + { + "epoch": 1.5722011898323418, + "grad_norm": 1.3539096117019653, + "learning_rate": 1.644507476303859e-05, + "loss": 1.98, + "mean_token_accuracy": 0.5663195848464966, + "num_tokens": 2972776660.0, + "step": 5814 + }, + { + "epoch": 1.5724716062736614, + "grad_norm": 1.2115956544876099, + "learning_rate": 1.6443819448601685e-05, + "loss": 2.0716, + "mean_token_accuracy": 0.5476582050323486, + "num_tokens": 2973300821.0, + "step": 5815 + }, + { + "epoch": 1.572742022714981, + "grad_norm": 1.174920678138733, + "learning_rate": 1.6442563967132397e-05, + "loss": 1.9758, + "mean_token_accuracy": 0.5563523173332214, + "num_tokens": 2973825053.0, + "step": 5816 + }, + { + "epoch": 1.5730124391563007, + "grad_norm": 1.0502550601959229, + "learning_rate": 1.6441308318669238e-05, + "loss": 1.9022, + "mean_token_accuracy": 0.574813723564148, + "num_tokens": 2974318030.0, + "step": 5817 + }, + { + "epoch": 1.5732828555976204, + "grad_norm": 1.2066446542739868, + "learning_rate": 1.644005250325074e-05, + "loss": 1.992, + "mean_token_accuracy": 0.5686044692993164, + "num_tokens": 2974842217.0, + "step": 5818 + }, + { + "epoch": 1.57355327203894, + "grad_norm": 1.151588797569275, + "learning_rate": 1.6438796520915432e-05, + "loss": 1.9401, + "mean_token_accuracy": 0.5293498039245605, + "num_tokens": 2975366490.0, + "step": 5819 + }, + { + "epoch": 1.5738236884802594, + "grad_norm": 1.2107335329055786, + "learning_rate": 1.643754037170185e-05, + "loss": 2.0642, + "mean_token_accuracy": 0.5359838604927063, + "num_tokens": 2975890621.0, + "step": 5820 + }, + { + "epoch": 1.574094104921579, + "grad_norm": 0.7818005084991455, + "learning_rate": 1.643628405564854e-05, + "loss": 1.1426, + "mean_token_accuracy": 0.6972419023513794, + "num_tokens": 2976407172.0, + "step": 5821 + }, + { + "epoch": 1.5743645213628987, + "grad_norm": 1.731782078742981, + "learning_rate": 1.6435027572794052e-05, + "loss": 2.0496, + "mean_token_accuracy": 0.5283418893814087, + "num_tokens": 2976873140.0, + "step": 5822 + }, + { + "epoch": 1.5746349378042184, + "grad_norm": 1.440843105316162, + "learning_rate": 1.643377092317693e-05, + "loss": 2.0235, + "mean_token_accuracy": 0.5332581996917725, + "num_tokens": 2977397289.0, + "step": 5823 + }, + { + "epoch": 1.574905354245538, + "grad_norm": 1.10614013671875, + "learning_rate": 1.643251410683574e-05, + "loss": 1.9275, + "mean_token_accuracy": 0.5521613359451294, + "num_tokens": 2977921466.0, + "step": 5824 + }, + { + "epoch": 1.5751757706868577, + "grad_norm": 1.3304890394210815, + "learning_rate": 1.6431257123809036e-05, + "loss": 1.889, + "mean_token_accuracy": 0.5590137243270874, + "num_tokens": 2978445679.0, + "step": 5825 + }, + { + "epoch": 1.5754461871281773, + "grad_norm": 1.2512658834457397, + "learning_rate": 1.6429999974135395e-05, + "loss": 1.9231, + "mean_token_accuracy": 0.5820918083190918, + "num_tokens": 2978969921.0, + "step": 5826 + }, + { + "epoch": 1.575716603569497, + "grad_norm": 1.2153640985488892, + "learning_rate": 1.6428742657853386e-05, + "loss": 1.7948, + "mean_token_accuracy": 0.5628385543823242, + "num_tokens": 2979494003.0, + "step": 5827 + }, + { + "epoch": 1.5759870200108166, + "grad_norm": 1.417946219444275, + "learning_rate": 1.6427485175001585e-05, + "loss": 1.9427, + "mean_token_accuracy": 0.5438040494918823, + "num_tokens": 2979986649.0, + "step": 5828 + }, + { + "epoch": 1.5762574364521362, + "grad_norm": 1.6989775896072388, + "learning_rate": 1.6426227525618583e-05, + "loss": 2.0378, + "mean_token_accuracy": 0.5403375625610352, + "num_tokens": 2980510921.0, + "step": 5829 + }, + { + "epoch": 1.5765278528934559, + "grad_norm": 1.3700037002563477, + "learning_rate": 1.642496970974296e-05, + "loss": 1.9955, + "mean_token_accuracy": 0.5435700416564941, + "num_tokens": 2981035175.0, + "step": 5830 + }, + { + "epoch": 1.5767982693347755, + "grad_norm": 1.2210007905960083, + "learning_rate": 1.6423711727413308e-05, + "loss": 1.9528, + "mean_token_accuracy": 0.5453611612319946, + "num_tokens": 2981559354.0, + "step": 5831 + }, + { + "epoch": 1.5770686857760952, + "grad_norm": 1.244083046913147, + "learning_rate": 1.6422453578668236e-05, + "loss": 1.9076, + "mean_token_accuracy": 0.569103479385376, + "num_tokens": 2982083590.0, + "step": 5832 + }, + { + "epoch": 1.5773391022174148, + "grad_norm": 1.263813853263855, + "learning_rate": 1.642119526354634e-05, + "loss": 1.8421, + "mean_token_accuracy": 0.5780288577079773, + "num_tokens": 2982607775.0, + "step": 5833 + }, + { + "epoch": 1.5776095186587344, + "grad_norm": 1.3153455257415771, + "learning_rate": 1.641993678208623e-05, + "loss": 2.0624, + "mean_token_accuracy": 0.5345635414123535, + "num_tokens": 2983132035.0, + "step": 5834 + }, + { + "epoch": 1.577879935100054, + "grad_norm": 1.2185924053192139, + "learning_rate": 1.641867813432652e-05, + "loss": 1.9827, + "mean_token_accuracy": 0.5425207614898682, + "num_tokens": 2983656209.0, + "step": 5835 + }, + { + "epoch": 1.5781503515413737, + "grad_norm": 1.2025666236877441, + "learning_rate": 1.6417419320305833e-05, + "loss": 1.9855, + "mean_token_accuracy": 0.5558140873908997, + "num_tokens": 2984180238.0, + "step": 5836 + }, + { + "epoch": 1.5784207679826934, + "grad_norm": 1.413856863975525, + "learning_rate": 1.6416160340062784e-05, + "loss": 1.9969, + "mean_token_accuracy": 0.5535513162612915, + "num_tokens": 2984704476.0, + "step": 5837 + }, + { + "epoch": 1.578691184424013, + "grad_norm": 1.364255428314209, + "learning_rate": 1.641490119363601e-05, + "loss": 1.9985, + "mean_token_accuracy": 0.5600306987762451, + "num_tokens": 2985228663.0, + "step": 5838 + }, + { + "epoch": 1.5789616008653327, + "grad_norm": 1.4177366495132446, + "learning_rate": 1.641364188106415e-05, + "loss": 1.9729, + "mean_token_accuracy": 0.5519587397575378, + "num_tokens": 2985699199.0, + "step": 5839 + }, + { + "epoch": 1.5792320173066523, + "grad_norm": 1.4389336109161377, + "learning_rate": 1.6412382402385827e-05, + "loss": 2.0823, + "mean_token_accuracy": 0.5377597808837891, + "num_tokens": 2986223295.0, + "step": 5840 + }, + { + "epoch": 1.579502433747972, + "grad_norm": 0.7132226824760437, + "learning_rate": 1.64111227576397e-05, + "loss": 1.1049, + "mean_token_accuracy": 0.7023149132728577, + "num_tokens": 2986747535.0, + "step": 5841 + }, + { + "epoch": 1.5797728501892916, + "grad_norm": 2.049433946609497, + "learning_rate": 1.6409862946864413e-05, + "loss": 2.0829, + "mean_token_accuracy": 0.5241600871086121, + "num_tokens": 2987271810.0, + "step": 5842 + }, + { + "epoch": 1.5800432666306112, + "grad_norm": 1.6078498363494873, + "learning_rate": 1.640860297009862e-05, + "loss": 2.0454, + "mean_token_accuracy": 0.5399137735366821, + "num_tokens": 2987757164.0, + "step": 5843 + }, + { + "epoch": 1.5803136830719309, + "grad_norm": 1.338013768196106, + "learning_rate": 1.6407342827380983e-05, + "loss": 1.741, + "mean_token_accuracy": 0.6083475351333618, + "num_tokens": 2988281440.0, + "step": 5844 + }, + { + "epoch": 1.5805840995132505, + "grad_norm": 1.2928590774536133, + "learning_rate": 1.6406082518750166e-05, + "loss": 2.0209, + "mean_token_accuracy": 0.5567227005958557, + "num_tokens": 2988805710.0, + "step": 5845 + }, + { + "epoch": 1.5808545159545702, + "grad_norm": 1.0532854795455933, + "learning_rate": 1.640482204424484e-05, + "loss": 1.7877, + "mean_token_accuracy": 0.5861610174179077, + "num_tokens": 2989299373.0, + "step": 5846 + }, + { + "epoch": 1.5811249323958898, + "grad_norm": 1.981311321258545, + "learning_rate": 1.640356140390368e-05, + "loss": 2.0153, + "mean_token_accuracy": 0.535209059715271, + "num_tokens": 2989781885.0, + "step": 5847 + }, + { + "epoch": 1.5813953488372094, + "grad_norm": 1.3195335865020752, + "learning_rate": 1.640230059776536e-05, + "loss": 1.8962, + "mean_token_accuracy": 0.5754187107086182, + "num_tokens": 2990305966.0, + "step": 5848 + }, + { + "epoch": 1.581665765278529, + "grad_norm": 1.20510733127594, + "learning_rate": 1.640103962586858e-05, + "loss": 1.9854, + "mean_token_accuracy": 0.5510096549987793, + "num_tokens": 2990830249.0, + "step": 5849 + }, + { + "epoch": 1.5819361817198487, + "grad_norm": 1.6313002109527588, + "learning_rate": 1.6399778488252015e-05, + "loss": 2.027, + "mean_token_accuracy": 0.5336923599243164, + "num_tokens": 2991336020.0, + "step": 5850 + }, + { + "epoch": 1.5822065981611682, + "grad_norm": 1.221814751625061, + "learning_rate": 1.6398517184954362e-05, + "loss": 1.9765, + "mean_token_accuracy": 0.5643469095230103, + "num_tokens": 2991805876.0, + "step": 5851 + }, + { + "epoch": 1.5824770146024878, + "grad_norm": 1.403481125831604, + "learning_rate": 1.6397255716014335e-05, + "loss": 2.0875, + "mean_token_accuracy": 0.5500714778900146, + "num_tokens": 2992330052.0, + "step": 5852 + }, + { + "epoch": 1.5827474310438074, + "grad_norm": 1.5755192041397095, + "learning_rate": 1.6395994081470624e-05, + "loss": 1.9435, + "mean_token_accuracy": 0.5688233375549316, + "num_tokens": 2992854277.0, + "step": 5853 + }, + { + "epoch": 1.583017847485127, + "grad_norm": 1.2318637371063232, + "learning_rate": 1.6394732281361948e-05, + "loss": 1.9336, + "mean_token_accuracy": 0.5488443970680237, + "num_tokens": 2993378542.0, + "step": 5854 + }, + { + "epoch": 1.5832882639264467, + "grad_norm": 1.271128535270691, + "learning_rate": 1.639347031572702e-05, + "loss": 1.9382, + "mean_token_accuracy": 0.5541949272155762, + "num_tokens": 2993902773.0, + "step": 5855 + }, + { + "epoch": 1.5835586803677664, + "grad_norm": 1.2506955862045288, + "learning_rate": 1.639220818460456e-05, + "loss": 1.9536, + "mean_token_accuracy": 0.5655279159545898, + "num_tokens": 2994416587.0, + "step": 5856 + }, + { + "epoch": 1.583829096809086, + "grad_norm": 1.128319263458252, + "learning_rate": 1.63909458880333e-05, + "loss": 1.9837, + "mean_token_accuracy": 0.55523282289505, + "num_tokens": 2994940732.0, + "step": 5857 + }, + { + "epoch": 1.5840995132504057, + "grad_norm": 1.1528606414794922, + "learning_rate": 1.6389683426051962e-05, + "loss": 2.0208, + "mean_token_accuracy": 0.5655437707901001, + "num_tokens": 2995464931.0, + "step": 5858 + }, + { + "epoch": 1.5843699296917253, + "grad_norm": 1.3125659227371216, + "learning_rate": 1.638842079869929e-05, + "loss": 1.9815, + "mean_token_accuracy": 0.5654975175857544, + "num_tokens": 2995957271.0, + "step": 5859 + }, + { + "epoch": 1.584640346133045, + "grad_norm": 1.1701867580413818, + "learning_rate": 1.638715800601402e-05, + "loss": 1.9935, + "mean_token_accuracy": 0.5377539396286011, + "num_tokens": 2996481446.0, + "step": 5860 + }, + { + "epoch": 1.5849107625743644, + "grad_norm": 0.4879899024963379, + "learning_rate": 1.6385895048034898e-05, + "loss": 1.1352, + "mean_token_accuracy": 0.7042292952537537, + "num_tokens": 2997005581.0, + "step": 5861 + }, + { + "epoch": 1.585181179015684, + "grad_norm": 1.6563739776611328, + "learning_rate": 1.638463192480068e-05, + "loss": 1.91, + "mean_token_accuracy": 0.5670727491378784, + "num_tokens": 2997529827.0, + "step": 5862 + }, + { + "epoch": 1.5854515954570036, + "grad_norm": 1.6136744022369385, + "learning_rate": 1.6383368636350116e-05, + "loss": 2.0602, + "mean_token_accuracy": 0.5319842100143433, + "num_tokens": 2998054059.0, + "step": 5863 + }, + { + "epoch": 1.5857220118983233, + "grad_norm": 1.1899759769439697, + "learning_rate": 1.638210518272197e-05, + "loss": 2.0936, + "mean_token_accuracy": 0.5637530088424683, + "num_tokens": 2998578310.0, + "step": 5864 + }, + { + "epoch": 1.585992428339643, + "grad_norm": 1.4561007022857666, + "learning_rate": 1.6380841563955017e-05, + "loss": 2.0927, + "mean_token_accuracy": 0.548814058303833, + "num_tokens": 2999102420.0, + "step": 5865 + }, + { + "epoch": 1.5862628447809626, + "grad_norm": 1.5291709899902344, + "learning_rate": 1.6379577780088015e-05, + "loss": 2.036, + "mean_token_accuracy": 0.5468649864196777, + "num_tokens": 2999626564.0, + "step": 5866 + }, + { + "epoch": 1.5865332612222822, + "grad_norm": 1.4590975046157837, + "learning_rate": 1.6378313831159747e-05, + "loss": 1.976, + "mean_token_accuracy": 0.5654393434524536, + "num_tokens": 3000150800.0, + "step": 5867 + }, + { + "epoch": 1.5868036776636019, + "grad_norm": 1.3355748653411865, + "learning_rate": 1.6377049717208993e-05, + "loss": 1.9163, + "mean_token_accuracy": 0.5437705516815186, + "num_tokens": 3000611949.0, + "step": 5868 + }, + { + "epoch": 1.5870740941049215, + "grad_norm": 1.377026915550232, + "learning_rate": 1.6375785438274544e-05, + "loss": 2.0407, + "mean_token_accuracy": 0.5460548400878906, + "num_tokens": 3001100393.0, + "step": 5869 + }, + { + "epoch": 1.5873445105462411, + "grad_norm": 1.4261459112167358, + "learning_rate": 1.6374520994395184e-05, + "loss": 1.9845, + "mean_token_accuracy": 0.5758732557296753, + "num_tokens": 3001604367.0, + "step": 5870 + }, + { + "epoch": 1.5876149269875608, + "grad_norm": 1.272356390953064, + "learning_rate": 1.637325638560972e-05, + "loss": 2.0227, + "mean_token_accuracy": 0.5551269054412842, + "num_tokens": 3002085952.0, + "step": 5871 + }, + { + "epoch": 1.5878853434288804, + "grad_norm": 1.4368469715118408, + "learning_rate": 1.6371991611956944e-05, + "loss": 2.0715, + "mean_token_accuracy": 0.5203050971031189, + "num_tokens": 3002610212.0, + "step": 5872 + }, + { + "epoch": 1.5881557598702, + "grad_norm": 1.3408012390136719, + "learning_rate": 1.637072667347567e-05, + "loss": 1.9566, + "mean_token_accuracy": 0.5572123527526855, + "num_tokens": 3003134332.0, + "step": 5873 + }, + { + "epoch": 1.5884261763115197, + "grad_norm": 1.2960312366485596, + "learning_rate": 1.6369461570204704e-05, + "loss": 1.9334, + "mean_token_accuracy": 0.5547041893005371, + "num_tokens": 3003618471.0, + "step": 5874 + }, + { + "epoch": 1.5886965927528394, + "grad_norm": 1.3231929540634155, + "learning_rate": 1.6368196302182872e-05, + "loss": 1.9708, + "mean_token_accuracy": 0.552658200263977, + "num_tokens": 3004142680.0, + "step": 5875 + }, + { + "epoch": 1.588967009194159, + "grad_norm": 1.2990400791168213, + "learning_rate": 1.6366930869448986e-05, + "loss": 2.0811, + "mean_token_accuracy": 0.5388811826705933, + "num_tokens": 3004666880.0, + "step": 5876 + }, + { + "epoch": 1.5892374256354787, + "grad_norm": 1.3300672769546509, + "learning_rate": 1.636566527204188e-05, + "loss": 2.0094, + "mean_token_accuracy": 0.5469969511032104, + "num_tokens": 3005191080.0, + "step": 5877 + }, + { + "epoch": 1.5895078420767983, + "grad_norm": 8.39460563659668, + "learning_rate": 1.6364399510000383e-05, + "loss": 1.8185, + "mean_token_accuracy": 0.5861263275146484, + "num_tokens": 3005701812.0, + "step": 5878 + }, + { + "epoch": 1.589778258518118, + "grad_norm": 1.9948701858520508, + "learning_rate": 1.6363133583363337e-05, + "loss": 1.9934, + "mean_token_accuracy": 0.5643495321273804, + "num_tokens": 3006164146.0, + "step": 5879 + }, + { + "epoch": 1.5900486749594376, + "grad_norm": 2.014758348464966, + "learning_rate": 1.636186749216957e-05, + "loss": 2.1017, + "mean_token_accuracy": 0.53782057762146, + "num_tokens": 3006688298.0, + "step": 5880 + }, + { + "epoch": 1.5903190914007572, + "grad_norm": 0.81537264585495, + "learning_rate": 1.636060123645795e-05, + "loss": 1.1279, + "mean_token_accuracy": 0.7010689377784729, + "num_tokens": 3007210263.0, + "step": 5881 + }, + { + "epoch": 1.5905895078420769, + "grad_norm": 1.9187759160995483, + "learning_rate": 1.635933481626732e-05, + "loss": 2.0873, + "mean_token_accuracy": 0.5251066088676453, + "num_tokens": 3007734436.0, + "step": 5882 + }, + { + "epoch": 1.5908599242833965, + "grad_norm": 1.6866055727005005, + "learning_rate": 1.6358068231636532e-05, + "loss": 1.9847, + "mean_token_accuracy": 0.5642521381378174, + "num_tokens": 3008258526.0, + "step": 5883 + }, + { + "epoch": 1.5911303407247162, + "grad_norm": 1.2265751361846924, + "learning_rate": 1.6356801482604457e-05, + "loss": 1.939, + "mean_token_accuracy": 0.564759373664856, + "num_tokens": 3008782802.0, + "step": 5884 + }, + { + "epoch": 1.5914007571660358, + "grad_norm": 1.5629805326461792, + "learning_rate": 1.6355534569209958e-05, + "loss": 1.9263, + "mean_token_accuracy": 0.5472331047058105, + "num_tokens": 3009306974.0, + "step": 5885 + }, + { + "epoch": 1.5916711736073554, + "grad_norm": 1.6898530721664429, + "learning_rate": 1.6354267491491908e-05, + "loss": 2.0778, + "mean_token_accuracy": 0.5271033644676208, + "num_tokens": 3009831146.0, + "step": 5886 + }, + { + "epoch": 1.591941590048675, + "grad_norm": 1.54446542263031, + "learning_rate": 1.6353000249489185e-05, + "loss": 1.9762, + "mean_token_accuracy": 0.5614172220230103, + "num_tokens": 3010315718.0, + "step": 5887 + }, + { + "epoch": 1.5922120064899947, + "grad_norm": 1.6016863584518433, + "learning_rate": 1.6351732843240678e-05, + "loss": 2.1086, + "mean_token_accuracy": 0.5323721766471863, + "num_tokens": 3010839914.0, + "step": 5888 + }, + { + "epoch": 1.5924824229313144, + "grad_norm": 1.460800290107727, + "learning_rate": 1.635046527278526e-05, + "loss": 2.0076, + "mean_token_accuracy": 0.5374438762664795, + "num_tokens": 3011322944.0, + "step": 5889 + }, + { + "epoch": 1.592752839372634, + "grad_norm": 1.2437690496444702, + "learning_rate": 1.634919753816184e-05, + "loss": 1.9392, + "mean_token_accuracy": 0.5642263293266296, + "num_tokens": 3011832157.0, + "step": 5890 + }, + { + "epoch": 1.5930232558139537, + "grad_norm": 1.3621044158935547, + "learning_rate": 1.6347929639409304e-05, + "loss": 1.8688, + "mean_token_accuracy": 0.5638220906257629, + "num_tokens": 3012336991.0, + "step": 5891 + }, + { + "epoch": 1.593293672255273, + "grad_norm": 1.9850610494613647, + "learning_rate": 1.6346661576566563e-05, + "loss": 1.7862, + "mean_token_accuracy": 0.6040864586830139, + "num_tokens": 3012859024.0, + "step": 5892 + }, + { + "epoch": 1.5935640886965927, + "grad_norm": 1.3633639812469482, + "learning_rate": 1.634539334967252e-05, + "loss": 1.8913, + "mean_token_accuracy": 0.5683831572532654, + "num_tokens": 3013383282.0, + "step": 5893 + }, + { + "epoch": 1.5938345051379124, + "grad_norm": 1.3397767543792725, + "learning_rate": 1.6344124958766087e-05, + "loss": 1.972, + "mean_token_accuracy": 0.5598130822181702, + "num_tokens": 3013907440.0, + "step": 5894 + }, + { + "epoch": 1.594104921579232, + "grad_norm": 1.4745808839797974, + "learning_rate": 1.6342856403886186e-05, + "loss": 2.0215, + "mean_token_accuracy": 0.5517787933349609, + "num_tokens": 3014396727.0, + "step": 5895 + }, + { + "epoch": 1.5943753380205516, + "grad_norm": 1.1558269262313843, + "learning_rate": 1.6341587685071737e-05, + "loss": 2.0731, + "mean_token_accuracy": 0.5483670234680176, + "num_tokens": 3014920906.0, + "step": 5896 + }, + { + "epoch": 1.5946457544618713, + "grad_norm": 1.3311591148376465, + "learning_rate": 1.634031880236167e-05, + "loss": 1.896, + "mean_token_accuracy": 0.5732845067977905, + "num_tokens": 3015445151.0, + "step": 5897 + }, + { + "epoch": 1.594916170903191, + "grad_norm": 1.4103440046310425, + "learning_rate": 1.6339049755794917e-05, + "loss": 1.8945, + "mean_token_accuracy": 0.5612236261367798, + "num_tokens": 3015969353.0, + "step": 5898 + }, + { + "epoch": 1.5951865873445106, + "grad_norm": 1.4518288373947144, + "learning_rate": 1.6337780545410417e-05, + "loss": 2.0098, + "mean_token_accuracy": 0.5555813312530518, + "num_tokens": 3016493524.0, + "step": 5899 + }, + { + "epoch": 1.5954570037858302, + "grad_norm": 1.493827223777771, + "learning_rate": 1.6336511171247113e-05, + "loss": 2.061, + "mean_token_accuracy": 0.5460732579231262, + "num_tokens": 3017017799.0, + "step": 5900 + }, + { + "epoch": 1.5957274202271499, + "grad_norm": 0.593697726726532, + "learning_rate": 1.633524163334395e-05, + "loss": 1.164, + "mean_token_accuracy": 0.6795058250427246, + "num_tokens": 3017542072.0, + "step": 5901 + }, + { + "epoch": 1.5959978366684693, + "grad_norm": 1.9112212657928467, + "learning_rate": 1.6333971931739888e-05, + "loss": 2.0342, + "mean_token_accuracy": 0.5335917472839355, + "num_tokens": 3018066274.0, + "step": 5902 + }, + { + "epoch": 1.596268253109789, + "grad_norm": 1.7099822759628296, + "learning_rate": 1.6332702066473878e-05, + "loss": 1.9089, + "mean_token_accuracy": 0.5614114999771118, + "num_tokens": 3018586228.0, + "step": 5903 + }, + { + "epoch": 1.5965386695511086, + "grad_norm": 1.1404379606246948, + "learning_rate": 1.6331432037584883e-05, + "loss": 1.8827, + "mean_token_accuracy": 0.5662375688552856, + "num_tokens": 3019110485.0, + "step": 5904 + }, + { + "epoch": 1.5968090859924282, + "grad_norm": 1.6354479789733887, + "learning_rate": 1.633016184511188e-05, + "loss": 1.9706, + "mean_token_accuracy": 0.5479656457901001, + "num_tokens": 3019634712.0, + "step": 5905 + }, + { + "epoch": 1.5970795024337479, + "grad_norm": 1.470750331878662, + "learning_rate": 1.6328891489093836e-05, + "loss": 1.9568, + "mean_token_accuracy": 0.54926598072052, + "num_tokens": 3020158982.0, + "step": 5906 + }, + { + "epoch": 1.5973499188750675, + "grad_norm": 1.7220897674560547, + "learning_rate": 1.632762096956973e-05, + "loss": 2.0129, + "mean_token_accuracy": 0.5580795407295227, + "num_tokens": 3020653980.0, + "step": 5907 + }, + { + "epoch": 1.5976203353163871, + "grad_norm": 2.2215962409973145, + "learning_rate": 1.6326350286578544e-05, + "loss": 2.0661, + "mean_token_accuracy": 0.5635853409767151, + "num_tokens": 3021134890.0, + "step": 5908 + }, + { + "epoch": 1.5978907517577068, + "grad_norm": 1.5547939538955688, + "learning_rate": 1.6325079440159265e-05, + "loss": 1.9699, + "mean_token_accuracy": 0.5487463474273682, + "num_tokens": 3021659115.0, + "step": 5909 + }, + { + "epoch": 1.5981611681990264, + "grad_norm": 1.1997419595718384, + "learning_rate": 1.632380843035089e-05, + "loss": 1.9529, + "mean_token_accuracy": 0.5599352121353149, + "num_tokens": 3022183376.0, + "step": 5910 + }, + { + "epoch": 1.598431584640346, + "grad_norm": 1.492695927619934, + "learning_rate": 1.632253725719242e-05, + "loss": 1.8487, + "mean_token_accuracy": 0.5851632356643677, + "num_tokens": 3022707620.0, + "step": 5911 + }, + { + "epoch": 1.5987020010816657, + "grad_norm": 1.5545144081115723, + "learning_rate": 1.6321265920722852e-05, + "loss": 2.0478, + "mean_token_accuracy": 0.5327668190002441, + "num_tokens": 3023227396.0, + "step": 5912 + }, + { + "epoch": 1.5989724175229854, + "grad_norm": 1.268251895904541, + "learning_rate": 1.6319994420981195e-05, + "loss": 1.9598, + "mean_token_accuracy": 0.5457101464271545, + "num_tokens": 3023751489.0, + "step": 5913 + }, + { + "epoch": 1.599242833964305, + "grad_norm": 1.342989206314087, + "learning_rate": 1.6318722758006464e-05, + "loss": 2.0028, + "mean_token_accuracy": 0.5529131293296814, + "num_tokens": 3024275630.0, + "step": 5914 + }, + { + "epoch": 1.5995132504056246, + "grad_norm": 1.2345763444900513, + "learning_rate": 1.631745093183768e-05, + "loss": 1.7724, + "mean_token_accuracy": 0.577662467956543, + "num_tokens": 3024799896.0, + "step": 5915 + }, + { + "epoch": 1.5997836668469443, + "grad_norm": 1.235716700553894, + "learning_rate": 1.6316178942513863e-05, + "loss": 1.8789, + "mean_token_accuracy": 0.5480700135231018, + "num_tokens": 3025324128.0, + "step": 5916 + }, + { + "epoch": 1.600054083288264, + "grad_norm": 1.7052046060562134, + "learning_rate": 1.6314906790074042e-05, + "loss": 1.8857, + "mean_token_accuracy": 0.5619112253189087, + "num_tokens": 3025799777.0, + "step": 5917 + }, + { + "epoch": 1.6003244997295836, + "grad_norm": 1.4477323293685913, + "learning_rate": 1.6313634474557248e-05, + "loss": 2.1339, + "mean_token_accuracy": 0.5219855904579163, + "num_tokens": 3026324035.0, + "step": 5918 + }, + { + "epoch": 1.6005949161709032, + "grad_norm": 1.5189090967178345, + "learning_rate": 1.6312361996002526e-05, + "loss": 2.0531, + "mean_token_accuracy": 0.5459477305412292, + "num_tokens": 3026848259.0, + "step": 5919 + }, + { + "epoch": 1.6008653326122229, + "grad_norm": 1.5762132406234741, + "learning_rate": 1.6311089354448916e-05, + "loss": 1.9999, + "mean_token_accuracy": 0.5539209842681885, + "num_tokens": 3027372527.0, + "step": 5920 + }, + { + "epoch": 1.6011357490535425, + "grad_norm": 0.5951618552207947, + "learning_rate": 1.630981654993546e-05, + "loss": 1.1464, + "mean_token_accuracy": 0.7057929039001465, + "num_tokens": 3027839506.0, + "step": 5921 + }, + { + "epoch": 1.6014061654948621, + "grad_norm": 1.533592939376831, + "learning_rate": 1.6308543582501223e-05, + "loss": 1.9023, + "mean_token_accuracy": 0.5846669673919678, + "num_tokens": 3028297979.0, + "step": 5922 + }, + { + "epoch": 1.6016765819361818, + "grad_norm": 1.148943305015564, + "learning_rate": 1.6307270452185253e-05, + "loss": 1.9503, + "mean_token_accuracy": 0.559510350227356, + "num_tokens": 3028810475.0, + "step": 5923 + }, + { + "epoch": 1.6019469983775014, + "grad_norm": 1.2214902639389038, + "learning_rate": 1.630599715902662e-05, + "loss": 2.0017, + "mean_token_accuracy": 0.5546576976776123, + "num_tokens": 3029301110.0, + "step": 5924 + }, + { + "epoch": 1.602217414818821, + "grad_norm": 1.2152217626571655, + "learning_rate": 1.630472370306439e-05, + "loss": 1.9607, + "mean_token_accuracy": 0.5509896278381348, + "num_tokens": 3029825351.0, + "step": 5925 + }, + { + "epoch": 1.6024878312601407, + "grad_norm": 1.38003408908844, + "learning_rate": 1.6303450084337635e-05, + "loss": 1.9866, + "mean_token_accuracy": 0.5485392808914185, + "num_tokens": 3030302062.0, + "step": 5926 + }, + { + "epoch": 1.6027582477014604, + "grad_norm": 1.223841667175293, + "learning_rate": 1.630217630288544e-05, + "loss": 1.9034, + "mean_token_accuracy": 0.5766682624816895, + "num_tokens": 3030826276.0, + "step": 5927 + }, + { + "epoch": 1.60302866414278, + "grad_norm": 1.0421522855758667, + "learning_rate": 1.6300902358746877e-05, + "loss": 1.7639, + "mean_token_accuracy": 0.5829893350601196, + "num_tokens": 3031323985.0, + "step": 5928 + }, + { + "epoch": 1.6032990805840996, + "grad_norm": 1.3383469581604004, + "learning_rate": 1.6299628251961045e-05, + "loss": 2.0363, + "mean_token_accuracy": 0.5323150157928467, + "num_tokens": 3031848157.0, + "step": 5929 + }, + { + "epoch": 1.6035694970254193, + "grad_norm": 1.3480628728866577, + "learning_rate": 1.6298353982567032e-05, + "loss": 2.1067, + "mean_token_accuracy": 0.553218424320221, + "num_tokens": 3032338699.0, + "step": 5930 + }, + { + "epoch": 1.603839913466739, + "grad_norm": 1.3681641817092896, + "learning_rate": 1.6297079550603934e-05, + "loss": 2.0238, + "mean_token_accuracy": 0.5373092889785767, + "num_tokens": 3032862914.0, + "step": 5931 + }, + { + "epoch": 1.6041103299080586, + "grad_norm": 1.524512767791748, + "learning_rate": 1.6295804956110867e-05, + "loss": 2.0352, + "mean_token_accuracy": 0.5158708095550537, + "num_tokens": 3033387107.0, + "step": 5932 + }, + { + "epoch": 1.604380746349378, + "grad_norm": 1.4399033784866333, + "learning_rate": 1.6294530199126916e-05, + "loss": 1.8916, + "mean_token_accuracy": 0.5722383260726929, + "num_tokens": 3033847697.0, + "step": 5933 + }, + { + "epoch": 1.6046511627906976, + "grad_norm": 1.6062031984329224, + "learning_rate": 1.629325527969122e-05, + "loss": 1.984, + "mean_token_accuracy": 0.5401049852371216, + "num_tokens": 3034371767.0, + "step": 5934 + }, + { + "epoch": 1.6049215792320173, + "grad_norm": 1.293592095375061, + "learning_rate": 1.629198019784288e-05, + "loss": 2.0386, + "mean_token_accuracy": 0.5316528677940369, + "num_tokens": 3034895879.0, + "step": 5935 + }, + { + "epoch": 1.605191995673337, + "grad_norm": 1.4780293703079224, + "learning_rate": 1.6290704953621026e-05, + "loss": 1.9913, + "mean_token_accuracy": 0.5447980165481567, + "num_tokens": 3035420150.0, + "step": 5936 + }, + { + "epoch": 1.6054624121146566, + "grad_norm": 1.4903050661087036, + "learning_rate": 1.6289429547064787e-05, + "loss": 1.8631, + "mean_token_accuracy": 0.5702214241027832, + "num_tokens": 3035944433.0, + "step": 5937 + }, + { + "epoch": 1.6057328285559762, + "grad_norm": 1.18936288356781, + "learning_rate": 1.628815397821329e-05, + "loss": 2.1066, + "mean_token_accuracy": 0.5363671779632568, + "num_tokens": 3036468714.0, + "step": 5938 + }, + { + "epoch": 1.6060032449972959, + "grad_norm": 1.485883355140686, + "learning_rate": 1.6286878247105682e-05, + "loss": 2.0713, + "mean_token_accuracy": 0.5511102676391602, + "num_tokens": 3036945113.0, + "step": 5939 + }, + { + "epoch": 1.6062736614386155, + "grad_norm": 1.2548108100891113, + "learning_rate": 1.62856023537811e-05, + "loss": 1.9351, + "mean_token_accuracy": 0.556648850440979, + "num_tokens": 3037427467.0, + "step": 5940 + }, + { + "epoch": 1.6065440778799351, + "grad_norm": 0.6004846096038818, + "learning_rate": 1.6284326298278695e-05, + "loss": 1.0899, + "mean_token_accuracy": 0.7028525471687317, + "num_tokens": 3037951650.0, + "step": 5941 + }, + { + "epoch": 1.6068144943212548, + "grad_norm": 2.070307731628418, + "learning_rate": 1.6283050080637618e-05, + "loss": 1.9757, + "mean_token_accuracy": 0.555401086807251, + "num_tokens": 3038475864.0, + "step": 5942 + }, + { + "epoch": 1.6070849107625742, + "grad_norm": 1.8516380786895752, + "learning_rate": 1.6281773700897026e-05, + "loss": 1.9818, + "mean_token_accuracy": 0.5570634603500366, + "num_tokens": 3039000140.0, + "step": 5943 + }, + { + "epoch": 1.6073553272038938, + "grad_norm": 1.231943964958191, + "learning_rate": 1.6280497159096087e-05, + "loss": 2.023, + "mean_token_accuracy": 0.553777813911438, + "num_tokens": 3039524292.0, + "step": 5944 + }, + { + "epoch": 1.6076257436452135, + "grad_norm": 1.6840490102767944, + "learning_rate": 1.6279220455273964e-05, + "loss": 2.0154, + "mean_token_accuracy": 0.536911129951477, + "num_tokens": 3040048467.0, + "step": 5945 + }, + { + "epoch": 1.6078961600865331, + "grad_norm": 1.2706995010375977, + "learning_rate": 1.6277943589469835e-05, + "loss": 1.821, + "mean_token_accuracy": 0.5761785507202148, + "num_tokens": 3040572593.0, + "step": 5946 + }, + { + "epoch": 1.6081665765278528, + "grad_norm": 1.6368074417114258, + "learning_rate": 1.6276666561722872e-05, + "loss": 1.9061, + "mean_token_accuracy": 0.5593773722648621, + "num_tokens": 3041036397.0, + "step": 5947 + }, + { + "epoch": 1.6084369929691724, + "grad_norm": 1.3358824253082275, + "learning_rate": 1.6275389372072263e-05, + "loss": 1.7474, + "mean_token_accuracy": 0.6100276708602905, + "num_tokens": 3041560614.0, + "step": 5948 + }, + { + "epoch": 1.608707409410492, + "grad_norm": 1.3921252489089966, + "learning_rate": 1.6274112020557195e-05, + "loss": 1.9853, + "mean_token_accuracy": 0.5618570446968079, + "num_tokens": 3042084735.0, + "step": 5949 + }, + { + "epoch": 1.6089778258518117, + "grad_norm": 1.5557053089141846, + "learning_rate": 1.627283450721686e-05, + "loss": 2.0076, + "mean_token_accuracy": 0.5528110265731812, + "num_tokens": 3042608926.0, + "step": 5950 + }, + { + "epoch": 1.6092482422931313, + "grad_norm": 1.1725174188613892, + "learning_rate": 1.6271556832090454e-05, + "loss": 2.0092, + "mean_token_accuracy": 0.5698832869529724, + "num_tokens": 3043072020.0, + "step": 5951 + }, + { + "epoch": 1.609518658734451, + "grad_norm": 1.64975905418396, + "learning_rate": 1.6270278995217183e-05, + "loss": 1.9071, + "mean_token_accuracy": 0.5698235630989075, + "num_tokens": 3043566010.0, + "step": 5952 + }, + { + "epoch": 1.6097890751757706, + "grad_norm": 1.4902194738388062, + "learning_rate": 1.626900099663625e-05, + "loss": 1.9904, + "mean_token_accuracy": 0.5222546458244324, + "num_tokens": 3044090151.0, + "step": 5953 + }, + { + "epoch": 1.6100594916170903, + "grad_norm": 1.245974063873291, + "learning_rate": 1.6267722836386882e-05, + "loss": 1.9931, + "mean_token_accuracy": 0.5508865714073181, + "num_tokens": 3044614369.0, + "step": 5954 + }, + { + "epoch": 1.61032990805841, + "grad_norm": 1.4497592449188232, + "learning_rate": 1.6266444514508274e-05, + "loss": 1.9346, + "mean_token_accuracy": 0.5679420828819275, + "num_tokens": 3045095740.0, + "step": 5955 + }, + { + "epoch": 1.6106003244997296, + "grad_norm": 1.4832051992416382, + "learning_rate": 1.6265166031039666e-05, + "loss": 1.9664, + "mean_token_accuracy": 0.5499986410140991, + "num_tokens": 3045591360.0, + "step": 5956 + }, + { + "epoch": 1.6108707409410492, + "grad_norm": 1.2748732566833496, + "learning_rate": 1.626388738602028e-05, + "loss": 2.0079, + "mean_token_accuracy": 0.559180498123169, + "num_tokens": 3046095796.0, + "step": 5957 + }, + { + "epoch": 1.6111411573823688, + "grad_norm": 1.7655984163284302, + "learning_rate": 1.626260857948935e-05, + "loss": 2.034, + "mean_token_accuracy": 0.5568374395370483, + "num_tokens": 3046563175.0, + "step": 5958 + }, + { + "epoch": 1.6114115738236885, + "grad_norm": 1.4783512353897095, + "learning_rate": 1.626132961148611e-05, + "loss": 1.6941, + "mean_token_accuracy": 0.6177465319633484, + "num_tokens": 3047039283.0, + "step": 5959 + }, + { + "epoch": 1.6116819902650081, + "grad_norm": 1.4612566232681274, + "learning_rate": 1.6260050482049805e-05, + "loss": 1.9278, + "mean_token_accuracy": 0.5560870170593262, + "num_tokens": 3047563466.0, + "step": 5960 + }, + { + "epoch": 1.6119524067063278, + "grad_norm": 0.5664166808128357, + "learning_rate": 1.625877119121968e-05, + "loss": 1.1278, + "mean_token_accuracy": 0.7129478454589844, + "num_tokens": 3048025184.0, + "step": 5961 + }, + { + "epoch": 1.6122228231476474, + "grad_norm": 2.0622434616088867, + "learning_rate": 1.6257491739035e-05, + "loss": 2.0594, + "mean_token_accuracy": 0.5386655330657959, + "num_tokens": 3048549357.0, + "step": 5962 + }, + { + "epoch": 1.612493239588967, + "grad_norm": 1.3968864679336548, + "learning_rate": 1.6256212125535004e-05, + "loss": 1.8828, + "mean_token_accuracy": 0.5472753047943115, + "num_tokens": 3049073503.0, + "step": 5963 + }, + { + "epoch": 1.6127636560302867, + "grad_norm": 1.4397366046905518, + "learning_rate": 1.6254932350758963e-05, + "loss": 1.9712, + "mean_token_accuracy": 0.5658100843429565, + "num_tokens": 3049578539.0, + "step": 5964 + }, + { + "epoch": 1.6130340724716064, + "grad_norm": 1.5551576614379883, + "learning_rate": 1.6253652414746147e-05, + "loss": 2.0468, + "mean_token_accuracy": 0.5468909740447998, + "num_tokens": 3050102726.0, + "step": 5965 + }, + { + "epoch": 1.613304488912926, + "grad_norm": 1.4858567714691162, + "learning_rate": 1.6252372317535817e-05, + "loss": 1.874, + "mean_token_accuracy": 0.568102240562439, + "num_tokens": 3050607080.0, + "step": 5966 + }, + { + "epoch": 1.6135749053542456, + "grad_norm": 1.4137425422668457, + "learning_rate": 1.6251092059167266e-05, + "loss": 1.9793, + "mean_token_accuracy": 0.5313188433647156, + "num_tokens": 3051107627.0, + "step": 5967 + }, + { + "epoch": 1.6138453217955653, + "grad_norm": 1.7678041458129883, + "learning_rate": 1.6249811639679764e-05, + "loss": 1.9197, + "mean_token_accuracy": 0.5622231364250183, + "num_tokens": 3051614847.0, + "step": 5968 + }, + { + "epoch": 1.614115738236885, + "grad_norm": 1.5457146167755127, + "learning_rate": 1.6248531059112607e-05, + "loss": 1.9255, + "mean_token_accuracy": 0.5685912370681763, + "num_tokens": 3052138992.0, + "step": 5969 + }, + { + "epoch": 1.6143861546782046, + "grad_norm": 1.7477182149887085, + "learning_rate": 1.624725031750508e-05, + "loss": 1.8866, + "mean_token_accuracy": 0.5737442970275879, + "num_tokens": 3052613650.0, + "step": 5970 + }, + { + "epoch": 1.6146565711195242, + "grad_norm": 1.6710847616195679, + "learning_rate": 1.6245969414896482e-05, + "loss": 2.0282, + "mean_token_accuracy": 0.5527981519699097, + "num_tokens": 3053128356.0, + "step": 5971 + }, + { + "epoch": 1.6149269875608439, + "grad_norm": 1.35261070728302, + "learning_rate": 1.6244688351326113e-05, + "loss": 2.0433, + "mean_token_accuracy": 0.5339601635932922, + "num_tokens": 3053652474.0, + "step": 5972 + }, + { + "epoch": 1.6151974040021635, + "grad_norm": 1.496623158454895, + "learning_rate": 1.6243407126833284e-05, + "loss": 2.0481, + "mean_token_accuracy": 0.540611743927002, + "num_tokens": 3054176671.0, + "step": 5973 + }, + { + "epoch": 1.615467820443483, + "grad_norm": 1.684171438217163, + "learning_rate": 1.62421257414573e-05, + "loss": 2.0327, + "mean_token_accuracy": 0.5308692455291748, + "num_tokens": 3054700819.0, + "step": 5974 + }, + { + "epoch": 1.6157382368848026, + "grad_norm": 1.4877514839172363, + "learning_rate": 1.624084419523749e-05, + "loss": 1.9377, + "mean_token_accuracy": 0.5670945644378662, + "num_tokens": 3055168923.0, + "step": 5975 + }, + { + "epoch": 1.6160086533261222, + "grad_norm": 1.4824963808059692, + "learning_rate": 1.623956248821316e-05, + "loss": 2.0132, + "mean_token_accuracy": 0.5507745742797852, + "num_tokens": 3055632997.0, + "step": 5976 + }, + { + "epoch": 1.6162790697674418, + "grad_norm": 1.5984728336334229, + "learning_rate": 1.623828062042365e-05, + "loss": 1.8907, + "mean_token_accuracy": 0.596284031867981, + "num_tokens": 3056092408.0, + "step": 5977 + }, + { + "epoch": 1.6165494862087615, + "grad_norm": 1.1055245399475098, + "learning_rate": 1.6236998591908282e-05, + "loss": 1.9424, + "mean_token_accuracy": 0.5446715354919434, + "num_tokens": 3056616585.0, + "step": 5978 + }, + { + "epoch": 1.6168199026500811, + "grad_norm": 1.4244872331619263, + "learning_rate": 1.6235716402706397e-05, + "loss": 1.906, + "mean_token_accuracy": 0.5911726355552673, + "num_tokens": 3057080872.0, + "step": 5979 + }, + { + "epoch": 1.6170903190914008, + "grad_norm": 1.511192798614502, + "learning_rate": 1.6234434052857337e-05, + "loss": 2.0429, + "mean_token_accuracy": 0.5390989780426025, + "num_tokens": 3057605145.0, + "step": 5980 + }, + { + "epoch": 1.6173607355327204, + "grad_norm": 0.7916049361228943, + "learning_rate": 1.6233151542400446e-05, + "loss": 1.1925, + "mean_token_accuracy": 0.6936354041099548, + "num_tokens": 3058129359.0, + "step": 5981 + }, + { + "epoch": 1.61763115197404, + "grad_norm": 1.805273413658142, + "learning_rate": 1.6231868871375077e-05, + "loss": 1.9791, + "mean_token_accuracy": 0.5572211146354675, + "num_tokens": 3058630327.0, + "step": 5982 + }, + { + "epoch": 1.6179015684153597, + "grad_norm": 1.4498350620269775, + "learning_rate": 1.6230586039820583e-05, + "loss": 2.038, + "mean_token_accuracy": 0.5325596332550049, + "num_tokens": 3059154518.0, + "step": 5983 + }, + { + "epoch": 1.6181719848566791, + "grad_norm": 1.3363614082336426, + "learning_rate": 1.622930304777633e-05, + "loss": 1.8777, + "mean_token_accuracy": 0.5622564554214478, + "num_tokens": 3059628553.0, + "step": 5984 + }, + { + "epoch": 1.6184424012979988, + "grad_norm": 1.6358243227005005, + "learning_rate": 1.622801989528168e-05, + "loss": 2.0629, + "mean_token_accuracy": 0.5467155575752258, + "num_tokens": 3060100155.0, + "step": 5985 + }, + { + "epoch": 1.6187128177393184, + "grad_norm": 1.6103298664093018, + "learning_rate": 1.6226736582376002e-05, + "loss": 2.0103, + "mean_token_accuracy": 0.5460387468338013, + "num_tokens": 3060624410.0, + "step": 5986 + }, + { + "epoch": 1.618983234180638, + "grad_norm": 1.3245916366577148, + "learning_rate": 1.6225453109098678e-05, + "loss": 1.9564, + "mean_token_accuracy": 0.5543581247329712, + "num_tokens": 3061148590.0, + "step": 5987 + }, + { + "epoch": 1.6192536506219577, + "grad_norm": 1.3709379434585571, + "learning_rate": 1.6224169475489085e-05, + "loss": 1.9771, + "mean_token_accuracy": 0.5535687208175659, + "num_tokens": 3061672583.0, + "step": 5988 + }, + { + "epoch": 1.6195240670632773, + "grad_norm": 1.3688390254974365, + "learning_rate": 1.6222885681586612e-05, + "loss": 1.9224, + "mean_token_accuracy": 0.5653621554374695, + "num_tokens": 3062196648.0, + "step": 5989 + }, + { + "epoch": 1.619794483504597, + "grad_norm": 1.1845157146453857, + "learning_rate": 1.622160172743064e-05, + "loss": 2.0013, + "mean_token_accuracy": 0.5460497140884399, + "num_tokens": 3062697533.0, + "step": 5990 + }, + { + "epoch": 1.6200648999459166, + "grad_norm": 1.5551406145095825, + "learning_rate": 1.6220317613060578e-05, + "loss": 2.0012, + "mean_token_accuracy": 0.5599976181983948, + "num_tokens": 3063221566.0, + "step": 5991 + }, + { + "epoch": 1.6203353163872363, + "grad_norm": 1.2439379692077637, + "learning_rate": 1.6219033338515815e-05, + "loss": 1.9527, + "mean_token_accuracy": 0.541710376739502, + "num_tokens": 3063745847.0, + "step": 5992 + }, + { + "epoch": 1.620605732828556, + "grad_norm": 1.1103720664978027, + "learning_rate": 1.6217748903835763e-05, + "loss": 1.9859, + "mean_token_accuracy": 0.5360298156738281, + "num_tokens": 3064270025.0, + "step": 5993 + }, + { + "epoch": 1.6208761492698756, + "grad_norm": 1.4024683237075806, + "learning_rate": 1.621646430905983e-05, + "loss": 1.8944, + "mean_token_accuracy": 0.5655686855316162, + "num_tokens": 3064772647.0, + "step": 5994 + }, + { + "epoch": 1.6211465657111952, + "grad_norm": 1.3912391662597656, + "learning_rate": 1.6215179554227433e-05, + "loss": 2.0285, + "mean_token_accuracy": 0.5318779945373535, + "num_tokens": 3065296882.0, + "step": 5995 + }, + { + "epoch": 1.6214169821525148, + "grad_norm": 1.3857735395431519, + "learning_rate": 1.6213894639377986e-05, + "loss": 1.9268, + "mean_token_accuracy": 0.5597543716430664, + "num_tokens": 3065776617.0, + "step": 5996 + }, + { + "epoch": 1.6216873985938345, + "grad_norm": 1.4826024770736694, + "learning_rate": 1.621260956455092e-05, + "loss": 2.0251, + "mean_token_accuracy": 0.5389745831489563, + "num_tokens": 3066300876.0, + "step": 5997 + }, + { + "epoch": 1.6219578150351541, + "grad_norm": 1.4898560047149658, + "learning_rate": 1.621132432978567e-05, + "loss": 1.8564, + "mean_token_accuracy": 0.5891181230545044, + "num_tokens": 3066825075.0, + "step": 5998 + }, + { + "epoch": 1.6222282314764738, + "grad_norm": 1.4063657522201538, + "learning_rate": 1.6210038935121656e-05, + "loss": 1.9739, + "mean_token_accuracy": 0.5473393797874451, + "num_tokens": 3067349282.0, + "step": 5999 + }, + { + "epoch": 1.6224986479177934, + "grad_norm": 1.5966973304748535, + "learning_rate": 1.6208753380598325e-05, + "loss": 2.0187, + "mean_token_accuracy": 0.5560605525970459, + "num_tokens": 3067873548.0, + "step": 6000 + }, + { + "epoch": 1.622769064359113, + "grad_norm": 0.853329062461853, + "learning_rate": 1.6207467666255127e-05, + "loss": 1.1827, + "mean_token_accuracy": 0.6881335973739624, + "num_tokens": 3068397671.0, + "step": 6001 + }, + { + "epoch": 1.6230394808004327, + "grad_norm": 2.0692203044891357, + "learning_rate": 1.6206181792131506e-05, + "loss": 1.9109, + "mean_token_accuracy": 0.5720930695533752, + "num_tokens": 3068904060.0, + "step": 6002 + }, + { + "epoch": 1.6233098972417523, + "grad_norm": 2.0193092823028564, + "learning_rate": 1.6204895758266918e-05, + "loss": 2.0536, + "mean_token_accuracy": 0.5394218564033508, + "num_tokens": 3069428294.0, + "step": 6003 + }, + { + "epoch": 1.623580313683072, + "grad_norm": 1.2656991481781006, + "learning_rate": 1.6203609564700816e-05, + "loss": 1.8892, + "mean_token_accuracy": 0.5555849075317383, + "num_tokens": 3069952377.0, + "step": 6004 + }, + { + "epoch": 1.6238507301243916, + "grad_norm": 1.4105314016342163, + "learning_rate": 1.6202323211472673e-05, + "loss": 1.9148, + "mean_token_accuracy": 0.546939492225647, + "num_tokens": 3070476652.0, + "step": 6005 + }, + { + "epoch": 1.6241211465657113, + "grad_norm": 1.9367914199829102, + "learning_rate": 1.6201036698621958e-05, + "loss": 1.989, + "mean_token_accuracy": 0.522347629070282, + "num_tokens": 3071000816.0, + "step": 6006 + }, + { + "epoch": 1.624391563007031, + "grad_norm": 1.582197666168213, + "learning_rate": 1.619975002618814e-05, + "loss": 1.851, + "mean_token_accuracy": 0.5580430030822754, + "num_tokens": 3071525089.0, + "step": 6007 + }, + { + "epoch": 1.6246619794483506, + "grad_norm": 1.5655269622802734, + "learning_rate": 1.6198463194210702e-05, + "loss": 2.0276, + "mean_token_accuracy": 0.5439707636833191, + "num_tokens": 3072049272.0, + "step": 6008 + }, + { + "epoch": 1.6249323958896702, + "grad_norm": 1.4450874328613281, + "learning_rate": 1.6197176202729122e-05, + "loss": 1.9275, + "mean_token_accuracy": 0.554832398891449, + "num_tokens": 3072573269.0, + "step": 6009 + }, + { + "epoch": 1.6252028123309898, + "grad_norm": 1.2102121114730835, + "learning_rate": 1.6195889051782893e-05, + "loss": 1.9297, + "mean_token_accuracy": 0.5691064596176147, + "num_tokens": 3073097449.0, + "step": 6010 + }, + { + "epoch": 1.6254732287723095, + "grad_norm": 1.2057501077651978, + "learning_rate": 1.6194601741411505e-05, + "loss": 2.0222, + "mean_token_accuracy": 0.5490593910217285, + "num_tokens": 3073621585.0, + "step": 6011 + }, + { + "epoch": 1.6257436452136291, + "grad_norm": 1.47123122215271, + "learning_rate": 1.6193314271654462e-05, + "loss": 2.0299, + "mean_token_accuracy": 0.5623248815536499, + "num_tokens": 3074145790.0, + "step": 6012 + }, + { + "epoch": 1.6260140616549488, + "grad_norm": 1.1483086347579956, + "learning_rate": 1.6192026642551267e-05, + "loss": 1.8905, + "mean_token_accuracy": 0.5757113695144653, + "num_tokens": 3074669911.0, + "step": 6013 + }, + { + "epoch": 1.6262844780962684, + "grad_norm": 1.412062406539917, + "learning_rate": 1.6190738854141422e-05, + "loss": 1.983, + "mean_token_accuracy": 0.5621050596237183, + "num_tokens": 3075194191.0, + "step": 6014 + }, + { + "epoch": 1.6265548945375878, + "grad_norm": 1.2155940532684326, + "learning_rate": 1.6189450906464446e-05, + "loss": 1.9955, + "mean_token_accuracy": 0.5429189801216125, + "num_tokens": 3075718437.0, + "step": 6015 + }, + { + "epoch": 1.6268253109789075, + "grad_norm": 1.2766191959381104, + "learning_rate": 1.6188162799559855e-05, + "loss": 1.9418, + "mean_token_accuracy": 0.5596456527709961, + "num_tokens": 3076242691.0, + "step": 6016 + }, + { + "epoch": 1.6270957274202271, + "grad_norm": 1.5548943281173706, + "learning_rate": 1.618687453346717e-05, + "loss": 2.0546, + "mean_token_accuracy": 0.5361292362213135, + "num_tokens": 3076766959.0, + "step": 6017 + }, + { + "epoch": 1.6273661438615468, + "grad_norm": 1.5052789449691772, + "learning_rate": 1.6185586108225923e-05, + "loss": 2.0164, + "mean_token_accuracy": 0.5436661243438721, + "num_tokens": 3077291051.0, + "step": 6018 + }, + { + "epoch": 1.6276365603028664, + "grad_norm": 1.1428731679916382, + "learning_rate": 1.6184297523875643e-05, + "loss": 1.9559, + "mean_token_accuracy": 0.5490092039108276, + "num_tokens": 3077815333.0, + "step": 6019 + }, + { + "epoch": 1.627906976744186, + "grad_norm": 1.3827428817749023, + "learning_rate": 1.618300878045587e-05, + "loss": 1.9298, + "mean_token_accuracy": 0.5457119941711426, + "num_tokens": 3078339547.0, + "step": 6020 + }, + { + "epoch": 1.6281773931855057, + "grad_norm": 0.6612464785575867, + "learning_rate": 1.6181719878006147e-05, + "loss": 1.2968, + "mean_token_accuracy": 0.6599574089050293, + "num_tokens": 3078863753.0, + "step": 6021 + }, + { + "epoch": 1.6284478096268253, + "grad_norm": 1.8438044786453247, + "learning_rate": 1.6180430816566016e-05, + "loss": 2.0035, + "mean_token_accuracy": 0.5585952997207642, + "num_tokens": 3079356405.0, + "step": 6022 + }, + { + "epoch": 1.628718226068145, + "grad_norm": 1.5402854681015015, + "learning_rate": 1.6179141596175034e-05, + "loss": 1.9175, + "mean_token_accuracy": 0.5590812563896179, + "num_tokens": 3079880576.0, + "step": 6023 + }, + { + "epoch": 1.6289886425094646, + "grad_norm": 1.1613810062408447, + "learning_rate": 1.617785221687276e-05, + "loss": 1.9094, + "mean_token_accuracy": 0.5636332035064697, + "num_tokens": 3080404845.0, + "step": 6024 + }, + { + "epoch": 1.629259058950784, + "grad_norm": 1.2303252220153809, + "learning_rate": 1.617656267869875e-05, + "loss": 1.7838, + "mean_token_accuracy": 0.5895696878433228, + "num_tokens": 3080929001.0, + "step": 6025 + }, + { + "epoch": 1.6295294753921037, + "grad_norm": 1.5640546083450317, + "learning_rate": 1.6175272981692572e-05, + "loss": 2.007, + "mean_token_accuracy": 0.553025484085083, + "num_tokens": 3081394101.0, + "step": 6026 + }, + { + "epoch": 1.6297998918334233, + "grad_norm": 1.1074554920196533, + "learning_rate": 1.61739831258938e-05, + "loss": 1.8842, + "mean_token_accuracy": 0.5568281412124634, + "num_tokens": 3081918309.0, + "step": 6027 + }, + { + "epoch": 1.630070308274743, + "grad_norm": 1.4429192543029785, + "learning_rate": 1.6172693111342013e-05, + "loss": 1.9318, + "mean_token_accuracy": 0.5656901001930237, + "num_tokens": 3082442590.0, + "step": 6028 + }, + { + "epoch": 1.6303407247160626, + "grad_norm": 1.4782148599624634, + "learning_rate": 1.617140293807679e-05, + "loss": 1.9961, + "mean_token_accuracy": 0.563262939453125, + "num_tokens": 3082950206.0, + "step": 6029 + }, + { + "epoch": 1.6306111411573823, + "grad_norm": 1.3988970518112183, + "learning_rate": 1.6170112606137715e-05, + "loss": 2.0702, + "mean_token_accuracy": 0.5391584634780884, + "num_tokens": 3083474403.0, + "step": 6030 + }, + { + "epoch": 1.630881557598702, + "grad_norm": 1.4020581245422363, + "learning_rate": 1.6168822115564378e-05, + "loss": 2.0156, + "mean_token_accuracy": 0.535525918006897, + "num_tokens": 3083957346.0, + "step": 6031 + }, + { + "epoch": 1.6311519740400215, + "grad_norm": 1.5008920431137085, + "learning_rate": 1.616753146639638e-05, + "loss": 1.8522, + "mean_token_accuracy": 0.5732043981552124, + "num_tokens": 3084481505.0, + "step": 6032 + }, + { + "epoch": 1.6314223904813412, + "grad_norm": 1.3283647298812866, + "learning_rate": 1.6166240658673322e-05, + "loss": 1.9508, + "mean_token_accuracy": 0.554338812828064, + "num_tokens": 3084969748.0, + "step": 6033 + }, + { + "epoch": 1.6316928069226608, + "grad_norm": 1.5304465293884277, + "learning_rate": 1.6164949692434807e-05, + "loss": 2.0234, + "mean_token_accuracy": 0.5619301795959473, + "num_tokens": 3085452078.0, + "step": 6034 + }, + { + "epoch": 1.6319632233639805, + "grad_norm": 1.66959810256958, + "learning_rate": 1.6163658567720442e-05, + "loss": 2.0367, + "mean_token_accuracy": 0.5483509302139282, + "num_tokens": 3085976284.0, + "step": 6035 + }, + { + "epoch": 1.6322336398053001, + "grad_norm": 1.3424433469772339, + "learning_rate": 1.6162367284569852e-05, + "loss": 1.9178, + "mean_token_accuracy": 0.5774703025817871, + "num_tokens": 3086500561.0, + "step": 6036 + }, + { + "epoch": 1.6325040562466198, + "grad_norm": 1.7059662342071533, + "learning_rate": 1.6161075843022648e-05, + "loss": 2.0081, + "mean_token_accuracy": 0.5583330392837524, + "num_tokens": 3086989032.0, + "step": 6037 + }, + { + "epoch": 1.6327744726879394, + "grad_norm": 1.4500306844711304, + "learning_rate": 1.6159784243118463e-05, + "loss": 2.0194, + "mean_token_accuracy": 0.5634154081344604, + "num_tokens": 3087513194.0, + "step": 6038 + }, + { + "epoch": 1.633044889129259, + "grad_norm": 1.3847321271896362, + "learning_rate": 1.6158492484896922e-05, + "loss": 1.9936, + "mean_token_accuracy": 0.5454542636871338, + "num_tokens": 3088037369.0, + "step": 6039 + }, + { + "epoch": 1.6333153055705787, + "grad_norm": 1.2285767793655396, + "learning_rate": 1.6157200568397663e-05, + "loss": 1.9755, + "mean_token_accuracy": 0.5598486661911011, + "num_tokens": 3088551363.0, + "step": 6040 + }, + { + "epoch": 1.6335857220118983, + "grad_norm": 0.6297555565834045, + "learning_rate": 1.6155908493660323e-05, + "loss": 1.0799, + "mean_token_accuracy": 0.710594892501831, + "num_tokens": 3089028596.0, + "step": 6041 + }, + { + "epoch": 1.633856138453218, + "grad_norm": 2.203341484069824, + "learning_rate": 1.6154616260724547e-05, + "loss": 1.8293, + "mean_token_accuracy": 0.5691810846328735, + "num_tokens": 3089523633.0, + "step": 6042 + }, + { + "epoch": 1.6341265548945376, + "grad_norm": 1.5144778490066528, + "learning_rate": 1.615332386962999e-05, + "loss": 2.0425, + "mean_token_accuracy": 0.557906985282898, + "num_tokens": 3089972529.0, + "step": 6043 + }, + { + "epoch": 1.6343969713358573, + "grad_norm": 1.414108395576477, + "learning_rate": 1.6152031320416296e-05, + "loss": 2.1264, + "mean_token_accuracy": 0.5663483142852783, + "num_tokens": 3090464076.0, + "step": 6044 + }, + { + "epoch": 1.634667387777177, + "grad_norm": 1.0904041528701782, + "learning_rate": 1.615073861312313e-05, + "loss": 1.898, + "mean_token_accuracy": 0.5844554901123047, + "num_tokens": 3090988286.0, + "step": 6045 + }, + { + "epoch": 1.6349378042184965, + "grad_norm": 1.4017138481140137, + "learning_rate": 1.614944574779016e-05, + "loss": 2.0754, + "mean_token_accuracy": 0.5459470748901367, + "num_tokens": 3091476816.0, + "step": 6046 + }, + { + "epoch": 1.6352082206598162, + "grad_norm": 1.0961434841156006, + "learning_rate": 1.614815272445705e-05, + "loss": 1.9384, + "mean_token_accuracy": 0.5597234964370728, + "num_tokens": 3091999405.0, + "step": 6047 + }, + { + "epoch": 1.6354786371011358, + "grad_norm": 1.1913516521453857, + "learning_rate": 1.614685954316347e-05, + "loss": 1.914, + "mean_token_accuracy": 0.5689287781715393, + "num_tokens": 3092523529.0, + "step": 6048 + }, + { + "epoch": 1.6357490535424555, + "grad_norm": 1.3087373971939087, + "learning_rate": 1.6145566203949112e-05, + "loss": 1.9596, + "mean_token_accuracy": 0.5579735040664673, + "num_tokens": 3092998662.0, + "step": 6049 + }, + { + "epoch": 1.6360194699837751, + "grad_norm": 1.341375470161438, + "learning_rate": 1.6144272706853644e-05, + "loss": 2.0236, + "mean_token_accuracy": 0.557296097278595, + "num_tokens": 3093494472.0, + "step": 6050 + }, + { + "epoch": 1.6362898864250948, + "grad_norm": 1.4419465065002441, + "learning_rate": 1.614297905191676e-05, + "loss": 1.9511, + "mean_token_accuracy": 0.5721524953842163, + "num_tokens": 3093988651.0, + "step": 6051 + }, + { + "epoch": 1.6365603028664144, + "grad_norm": 1.1832600831985474, + "learning_rate": 1.6141685239178158e-05, + "loss": 1.9102, + "mean_token_accuracy": 0.564767599105835, + "num_tokens": 3094491400.0, + "step": 6052 + }, + { + "epoch": 1.636830719307734, + "grad_norm": 1.5908689498901367, + "learning_rate": 1.614039126867753e-05, + "loss": 1.9627, + "mean_token_accuracy": 0.5519359707832336, + "num_tokens": 3095015678.0, + "step": 6053 + }, + { + "epoch": 1.6371011357490537, + "grad_norm": 1.306416392326355, + "learning_rate": 1.613909714045458e-05, + "loss": 2.0584, + "mean_token_accuracy": 0.5600554347038269, + "num_tokens": 3095539896.0, + "step": 6054 + }, + { + "epoch": 1.6373715521903733, + "grad_norm": 1.2341386079788208, + "learning_rate": 1.613780285454902e-05, + "loss": 1.8282, + "mean_token_accuracy": 0.5617004632949829, + "num_tokens": 3096064088.0, + "step": 6055 + }, + { + "epoch": 1.6376419686316928, + "grad_norm": 1.4918709993362427, + "learning_rate": 1.6136508411000558e-05, + "loss": 1.8185, + "mean_token_accuracy": 0.5681713819503784, + "num_tokens": 3096588183.0, + "step": 6056 + }, + { + "epoch": 1.6379123850730124, + "grad_norm": 1.391065239906311, + "learning_rate": 1.6135213809848912e-05, + "loss": 2.0343, + "mean_token_accuracy": 0.5467379093170166, + "num_tokens": 3097112447.0, + "step": 6057 + }, + { + "epoch": 1.638182801514332, + "grad_norm": 1.3272901773452759, + "learning_rate": 1.6133919051133804e-05, + "loss": 2.1335, + "mean_token_accuracy": 0.5328609943389893, + "num_tokens": 3097599554.0, + "step": 6058 + }, + { + "epoch": 1.6384532179556517, + "grad_norm": 1.7200345993041992, + "learning_rate": 1.6132624134894965e-05, + "loss": 1.9885, + "mean_token_accuracy": 0.5574510097503662, + "num_tokens": 3098123803.0, + "step": 6059 + }, + { + "epoch": 1.6387236343969713, + "grad_norm": 1.2999991178512573, + "learning_rate": 1.613132906117212e-05, + "loss": 2.0148, + "mean_token_accuracy": 0.5609229803085327, + "num_tokens": 3098617055.0, + "step": 6060 + }, + { + "epoch": 1.638994050838291, + "grad_norm": 0.6690181493759155, + "learning_rate": 1.613003383000501e-05, + "loss": 1.1638, + "mean_token_accuracy": 0.6942268013954163, + "num_tokens": 3099141307.0, + "step": 6061 + }, + { + "epoch": 1.6392644672796106, + "grad_norm": 2.2537434101104736, + "learning_rate": 1.6128738441433376e-05, + "loss": 2.0762, + "mean_token_accuracy": 0.5344752073287964, + "num_tokens": 3099665527.0, + "step": 6062 + }, + { + "epoch": 1.6395348837209303, + "grad_norm": 1.569884181022644, + "learning_rate": 1.6127442895496966e-05, + "loss": 1.963, + "mean_token_accuracy": 0.5432929396629333, + "num_tokens": 3100189789.0, + "step": 6063 + }, + { + "epoch": 1.63980530016225, + "grad_norm": 1.148682951927185, + "learning_rate": 1.6126147192235527e-05, + "loss": 1.9823, + "mean_token_accuracy": 0.5324395895004272, + "num_tokens": 3100714004.0, + "step": 6064 + }, + { + "epoch": 1.6400757166035695, + "grad_norm": 1.5663318634033203, + "learning_rate": 1.6124851331688813e-05, + "loss": 1.9655, + "mean_token_accuracy": 0.5551180243492126, + "num_tokens": 3101238277.0, + "step": 6065 + }, + { + "epoch": 1.640346133044889, + "grad_norm": 1.7299154996871948, + "learning_rate": 1.6123555313896596e-05, + "loss": 2.0892, + "mean_token_accuracy": 0.5350254774093628, + "num_tokens": 3101735998.0, + "step": 6066 + }, + { + "epoch": 1.6406165494862086, + "grad_norm": 1.513298749923706, + "learning_rate": 1.6122259138898632e-05, + "loss": 1.9439, + "mean_token_accuracy": 0.5481142997741699, + "num_tokens": 3102260268.0, + "step": 6067 + }, + { + "epoch": 1.6408869659275283, + "grad_norm": 1.54441499710083, + "learning_rate": 1.612096280673469e-05, + "loss": 1.8656, + "mean_token_accuracy": 0.5929610133171082, + "num_tokens": 3102784406.0, + "step": 6068 + }, + { + "epoch": 1.641157382368848, + "grad_norm": 1.6029289960861206, + "learning_rate": 1.6119666317444555e-05, + "loss": 2.0568, + "mean_token_accuracy": 0.519033670425415, + "num_tokens": 3103308676.0, + "step": 6069 + }, + { + "epoch": 1.6414277988101675, + "grad_norm": 1.337062954902649, + "learning_rate": 1.6118369671068e-05, + "loss": 1.8638, + "mean_token_accuracy": 0.5584152936935425, + "num_tokens": 3103832893.0, + "step": 6070 + }, + { + "epoch": 1.6416982152514872, + "grad_norm": 1.2039108276367188, + "learning_rate": 1.6117072867644808e-05, + "loss": 1.9437, + "mean_token_accuracy": 0.543922483921051, + "num_tokens": 3104357119.0, + "step": 6071 + }, + { + "epoch": 1.6419686316928068, + "grad_norm": 1.2499117851257324, + "learning_rate": 1.6115775907214778e-05, + "loss": 1.9628, + "mean_token_accuracy": 0.5554778575897217, + "num_tokens": 3104804366.0, + "step": 6072 + }, + { + "epoch": 1.6422390481341265, + "grad_norm": 1.1353224515914917, + "learning_rate": 1.611447878981769e-05, + "loss": 1.9508, + "mean_token_accuracy": 0.5346243977546692, + "num_tokens": 3105303257.0, + "step": 6073 + }, + { + "epoch": 1.642509464575446, + "grad_norm": 1.3823866844177246, + "learning_rate": 1.611318151549336e-05, + "loss": 2.0577, + "mean_token_accuracy": 0.5394347906112671, + "num_tokens": 3105827530.0, + "step": 6074 + }, + { + "epoch": 1.6427798810167658, + "grad_norm": 1.0166152715682983, + "learning_rate": 1.611188408428158e-05, + "loss": 1.9018, + "mean_token_accuracy": 0.5675064325332642, + "num_tokens": 3106283975.0, + "step": 6075 + }, + { + "epoch": 1.6430502974580854, + "grad_norm": 1.151140570640564, + "learning_rate": 1.6110586496222166e-05, + "loss": 1.92, + "mean_token_accuracy": 0.5589881539344788, + "num_tokens": 3106808155.0, + "step": 6076 + }, + { + "epoch": 1.643320713899405, + "grad_norm": 1.327541470527649, + "learning_rate": 1.6109288751354927e-05, + "loss": 1.9777, + "mean_token_accuracy": 0.5384993553161621, + "num_tokens": 3107332361.0, + "step": 6077 + }, + { + "epoch": 1.6435911303407247, + "grad_norm": 1.2022398710250854, + "learning_rate": 1.6107990849719687e-05, + "loss": 1.8898, + "mean_token_accuracy": 0.565843939781189, + "num_tokens": 3107814724.0, + "step": 6078 + }, + { + "epoch": 1.6438615467820443, + "grad_norm": 1.3705722093582153, + "learning_rate": 1.6106692791356263e-05, + "loss": 1.9165, + "mean_token_accuracy": 0.5669114589691162, + "num_tokens": 3108338899.0, + "step": 6079 + }, + { + "epoch": 1.644131963223364, + "grad_norm": 1.1626996994018555, + "learning_rate": 1.6105394576304484e-05, + "loss": 1.8634, + "mean_token_accuracy": 0.5651081204414368, + "num_tokens": 3108827801.0, + "step": 6080 + }, + { + "epoch": 1.6444023796646836, + "grad_norm": 0.6838481426239014, + "learning_rate": 1.610409620460419e-05, + "loss": 1.0563, + "mean_token_accuracy": 0.7099723815917969, + "num_tokens": 3109289752.0, + "step": 6081 + }, + { + "epoch": 1.6446727961060033, + "grad_norm": 2.513273000717163, + "learning_rate": 1.610279767629521e-05, + "loss": 2.0122, + "mean_token_accuracy": 0.5507831573486328, + "num_tokens": 3109813906.0, + "step": 6082 + }, + { + "epoch": 1.644943212547323, + "grad_norm": 2.139198064804077, + "learning_rate": 1.6101498991417393e-05, + "loss": 2.0845, + "mean_token_accuracy": 0.5236844420433044, + "num_tokens": 3110338179.0, + "step": 6083 + }, + { + "epoch": 1.6452136289886425, + "grad_norm": 1.5240854024887085, + "learning_rate": 1.6100200150010587e-05, + "loss": 2.0101, + "mean_token_accuracy": 0.5528820753097534, + "num_tokens": 3110763929.0, + "step": 6084 + }, + { + "epoch": 1.6454840454299622, + "grad_norm": 1.66545569896698, + "learning_rate": 1.609890115211464e-05, + "loss": 2.0108, + "mean_token_accuracy": 0.5322906374931335, + "num_tokens": 3111288141.0, + "step": 6085 + }, + { + "epoch": 1.6457544618712818, + "grad_norm": 1.5324347019195557, + "learning_rate": 1.6097601997769415e-05, + "loss": 1.9563, + "mean_token_accuracy": 0.5548423528671265, + "num_tokens": 3111812350.0, + "step": 6086 + }, + { + "epoch": 1.6460248783126015, + "grad_norm": 1.3771651983261108, + "learning_rate": 1.6096302687014762e-05, + "loss": 1.7011, + "mean_token_accuracy": 0.6058837175369263, + "num_tokens": 3112336484.0, + "step": 6087 + }, + { + "epoch": 1.6462952947539211, + "grad_norm": 1.6087052822113037, + "learning_rate": 1.609500321989056e-05, + "loss": 1.9806, + "mean_token_accuracy": 0.5397462248802185, + "num_tokens": 3112860482.0, + "step": 6088 + }, + { + "epoch": 1.6465657111952408, + "grad_norm": 1.4693776369094849, + "learning_rate": 1.609370359643667e-05, + "loss": 1.9868, + "mean_token_accuracy": 0.55214524269104, + "num_tokens": 3113384671.0, + "step": 6089 + }, + { + "epoch": 1.6468361276365604, + "grad_norm": 1.1825809478759766, + "learning_rate": 1.609240381669298e-05, + "loss": 1.9301, + "mean_token_accuracy": 0.5615652203559875, + "num_tokens": 3113908877.0, + "step": 6090 + }, + { + "epoch": 1.64710654407788, + "grad_norm": 1.4524791240692139, + "learning_rate": 1.6091103880699363e-05, + "loss": 1.8899, + "mean_token_accuracy": 0.5670900344848633, + "num_tokens": 3114425928.0, + "step": 6091 + }, + { + "epoch": 1.6473769605191997, + "grad_norm": 1.4561271667480469, + "learning_rate": 1.608980378849571e-05, + "loss": 2.0051, + "mean_token_accuracy": 0.546281099319458, + "num_tokens": 3114950066.0, + "step": 6092 + }, + { + "epoch": 1.6476473769605193, + "grad_norm": 1.2610472440719604, + "learning_rate": 1.6088503540121904e-05, + "loss": 1.918, + "mean_token_accuracy": 0.5670897960662842, + "num_tokens": 3115462989.0, + "step": 6093 + }, + { + "epoch": 1.647917793401839, + "grad_norm": 1.267683982849121, + "learning_rate": 1.6087203135617844e-05, + "loss": 2.0658, + "mean_token_accuracy": 0.5399613976478577, + "num_tokens": 3115987245.0, + "step": 6094 + }, + { + "epoch": 1.6481882098431586, + "grad_norm": 1.1687854528427124, + "learning_rate": 1.608590257502344e-05, + "loss": 1.9933, + "mean_token_accuracy": 0.5326088070869446, + "num_tokens": 3116511348.0, + "step": 6095 + }, + { + "epoch": 1.6484586262844783, + "grad_norm": 1.3352750539779663, + "learning_rate": 1.6084601858378582e-05, + "loss": 1.7551, + "mean_token_accuracy": 0.603439211845398, + "num_tokens": 3116976515.0, + "step": 6096 + }, + { + "epoch": 1.6487290427257977, + "grad_norm": 1.5342493057250977, + "learning_rate": 1.6083300985723183e-05, + "loss": 1.8808, + "mean_token_accuracy": 0.5746046900749207, + "num_tokens": 3117500707.0, + "step": 6097 + }, + { + "epoch": 1.6489994591671173, + "grad_norm": 1.2813770771026611, + "learning_rate": 1.6081999957097164e-05, + "loss": 2.0151, + "mean_token_accuracy": 0.5346257090568542, + "num_tokens": 3118024986.0, + "step": 6098 + }, + { + "epoch": 1.649269875608437, + "grad_norm": 1.3640460968017578, + "learning_rate": 1.6080698772540446e-05, + "loss": 1.9935, + "mean_token_accuracy": 0.559702455997467, + "num_tokens": 3118543669.0, + "step": 6099 + }, + { + "epoch": 1.6495402920497566, + "grad_norm": 1.3722646236419678, + "learning_rate": 1.607939743209294e-05, + "loss": 2.0067, + "mean_token_accuracy": 0.5514389276504517, + "num_tokens": 3119067892.0, + "step": 6100 + }, + { + "epoch": 1.6498107084910762, + "grad_norm": 0.7503054738044739, + "learning_rate": 1.6078095935794588e-05, + "loss": 1.1274, + "mean_token_accuracy": 0.7078478336334229, + "num_tokens": 3119592100.0, + "step": 6101 + }, + { + "epoch": 1.650081124932396, + "grad_norm": 1.4895626306533813, + "learning_rate": 1.6076794283685322e-05, + "loss": 1.8067, + "mean_token_accuracy": 0.6019130945205688, + "num_tokens": 3120116318.0, + "step": 6102 + }, + { + "epoch": 1.6503515413737155, + "grad_norm": 1.9524896144866943, + "learning_rate": 1.607549247580507e-05, + "loss": 2.015, + "mean_token_accuracy": 0.557201087474823, + "num_tokens": 3120640579.0, + "step": 6103 + }, + { + "epoch": 1.6506219578150352, + "grad_norm": 1.485092282295227, + "learning_rate": 1.6074190512193788e-05, + "loss": 1.9363, + "mean_token_accuracy": 0.5784085988998413, + "num_tokens": 3121092259.0, + "step": 6104 + }, + { + "epoch": 1.6508923742563548, + "grad_norm": 1.531237244606018, + "learning_rate": 1.607288839289142e-05, + "loss": 2.0458, + "mean_token_accuracy": 0.5359058380126953, + "num_tokens": 3121616474.0, + "step": 6105 + }, + { + "epoch": 1.6511627906976745, + "grad_norm": 1.3102980852127075, + "learning_rate": 1.6071586117937915e-05, + "loss": 1.8598, + "mean_token_accuracy": 0.5558998584747314, + "num_tokens": 3122140717.0, + "step": 6106 + }, + { + "epoch": 1.6514332071389939, + "grad_norm": 1.5401835441589355, + "learning_rate": 1.607028368737323e-05, + "loss": 1.9222, + "mean_token_accuracy": 0.5618252754211426, + "num_tokens": 3122619284.0, + "step": 6107 + }, + { + "epoch": 1.6517036235803135, + "grad_norm": 1.46053946018219, + "learning_rate": 1.6068981101237337e-05, + "loss": 1.9955, + "mean_token_accuracy": 0.5577362775802612, + "num_tokens": 3123129372.0, + "step": 6108 + }, + { + "epoch": 1.6519740400216332, + "grad_norm": 1.2248753309249878, + "learning_rate": 1.6067678359570194e-05, + "loss": 1.8936, + "mean_token_accuracy": 0.5739295482635498, + "num_tokens": 3123593454.0, + "step": 6109 + }, + { + "epoch": 1.6522444564629528, + "grad_norm": 1.652673602104187, + "learning_rate": 1.6066375462411773e-05, + "loss": 2.0184, + "mean_token_accuracy": 0.5321192145347595, + "num_tokens": 3124117713.0, + "step": 6110 + }, + { + "epoch": 1.6525148729042725, + "grad_norm": 1.265497088432312, + "learning_rate": 1.6065072409802055e-05, + "loss": 1.8898, + "mean_token_accuracy": 0.5600152611732483, + "num_tokens": 3124641945.0, + "step": 6111 + }, + { + "epoch": 1.652785289345592, + "grad_norm": 1.6192152500152588, + "learning_rate": 1.6063769201781024e-05, + "loss": 2.0231, + "mean_token_accuracy": 0.5627652406692505, + "num_tokens": 3125129406.0, + "step": 6112 + }, + { + "epoch": 1.6530557057869117, + "grad_norm": 1.2507210969924927, + "learning_rate": 1.6062465838388663e-05, + "loss": 1.6537, + "mean_token_accuracy": 0.6378059387207031, + "num_tokens": 3125653652.0, + "step": 6113 + }, + { + "epoch": 1.6533261222282314, + "grad_norm": 1.213507890701294, + "learning_rate": 1.6061162319664956e-05, + "loss": 1.943, + "mean_token_accuracy": 0.557660698890686, + "num_tokens": 3126177829.0, + "step": 6114 + }, + { + "epoch": 1.653596538669551, + "grad_norm": 1.6519083976745605, + "learning_rate": 1.605985864564991e-05, + "loss": 1.9674, + "mean_token_accuracy": 0.5598106980323792, + "num_tokens": 3126644596.0, + "step": 6115 + }, + { + "epoch": 1.6538669551108707, + "grad_norm": 1.292945146560669, + "learning_rate": 1.605855481638352e-05, + "loss": 2.0163, + "mean_token_accuracy": 0.549682080745697, + "num_tokens": 3127168842.0, + "step": 6116 + }, + { + "epoch": 1.6541373715521903, + "grad_norm": 1.3554039001464844, + "learning_rate": 1.605725083190579e-05, + "loss": 2.1393, + "mean_token_accuracy": 0.5234279036521912, + "num_tokens": 3127693090.0, + "step": 6117 + }, + { + "epoch": 1.65440778799351, + "grad_norm": 1.073545217514038, + "learning_rate": 1.6055946692256733e-05, + "loss": 1.8557, + "mean_token_accuracy": 0.581385612487793, + "num_tokens": 3128217216.0, + "step": 6118 + }, + { + "epoch": 1.6546782044348296, + "grad_norm": 1.3527859449386597, + "learning_rate": 1.6054642397476362e-05, + "loss": 1.9101, + "mean_token_accuracy": 0.5605295300483704, + "num_tokens": 3128666014.0, + "step": 6119 + }, + { + "epoch": 1.6549486208761492, + "grad_norm": 1.2834967374801636, + "learning_rate": 1.60533379476047e-05, + "loss": 2.0421, + "mean_token_accuracy": 0.5398281216621399, + "num_tokens": 3129190260.0, + "step": 6120 + }, + { + "epoch": 1.6552190373174689, + "grad_norm": 0.6071980595588684, + "learning_rate": 1.6052033342681763e-05, + "loss": 1.1544, + "mean_token_accuracy": 0.6977059841156006, + "num_tokens": 3129714523.0, + "step": 6121 + }, + { + "epoch": 1.6554894537587885, + "grad_norm": 1.7807347774505615, + "learning_rate": 1.6050728582747594e-05, + "loss": 2.0815, + "mean_token_accuracy": 0.5341269969940186, + "num_tokens": 3130229123.0, + "step": 6122 + }, + { + "epoch": 1.6557598702001082, + "grad_norm": 1.4476466178894043, + "learning_rate": 1.6049423667842215e-05, + "loss": 1.9941, + "mean_token_accuracy": 0.5495807528495789, + "num_tokens": 3130753348.0, + "step": 6123 + }, + { + "epoch": 1.6560302866414278, + "grad_norm": 1.1150556802749634, + "learning_rate": 1.6048118598005666e-05, + "loss": 1.9633, + "mean_token_accuracy": 0.5416306257247925, + "num_tokens": 3131277603.0, + "step": 6124 + }, + { + "epoch": 1.6563007030827475, + "grad_norm": 1.3542872667312622, + "learning_rate": 1.6046813373278e-05, + "loss": 2.0603, + "mean_token_accuracy": 0.5621215105056763, + "num_tokens": 3131741677.0, + "step": 6125 + }, + { + "epoch": 1.656571119524067, + "grad_norm": 1.155775547027588, + "learning_rate": 1.6045507993699253e-05, + "loss": 1.8616, + "mean_token_accuracy": 0.5788471698760986, + "num_tokens": 3132265930.0, + "step": 6126 + }, + { + "epoch": 1.6568415359653867, + "grad_norm": 1.174955129623413, + "learning_rate": 1.604420245930948e-05, + "loss": 1.8919, + "mean_token_accuracy": 0.5633014440536499, + "num_tokens": 3132790114.0, + "step": 6127 + }, + { + "epoch": 1.6571119524067064, + "grad_norm": 1.196930170059204, + "learning_rate": 1.6042896770148747e-05, + "loss": 1.9152, + "mean_token_accuracy": 0.5529323816299438, + "num_tokens": 3133302475.0, + "step": 6128 + }, + { + "epoch": 1.657382368848026, + "grad_norm": 1.3462694883346558, + "learning_rate": 1.6041590926257106e-05, + "loss": 2.0061, + "mean_token_accuracy": 0.5272855758666992, + "num_tokens": 3133826750.0, + "step": 6129 + }, + { + "epoch": 1.6576527852893457, + "grad_norm": 1.2335376739501953, + "learning_rate": 1.6040284927674633e-05, + "loss": 2.024, + "mean_token_accuracy": 0.5584259033203125, + "num_tokens": 3134350905.0, + "step": 6130 + }, + { + "epoch": 1.6579232017306653, + "grad_norm": 1.2846674919128418, + "learning_rate": 1.6038978774441397e-05, + "loss": 1.9552, + "mean_token_accuracy": 0.5553537607192993, + "num_tokens": 3134875014.0, + "step": 6131 + }, + { + "epoch": 1.658193618171985, + "grad_norm": 1.2817447185516357, + "learning_rate": 1.603767246659747e-05, + "loss": 1.9797, + "mean_token_accuracy": 0.5550812482833862, + "num_tokens": 3135399215.0, + "step": 6132 + }, + { + "epoch": 1.6584640346133046, + "grad_norm": 1.2800315618515015, + "learning_rate": 1.6036366004182938e-05, + "loss": 1.8683, + "mean_token_accuracy": 0.5887551307678223, + "num_tokens": 3135860554.0, + "step": 6133 + }, + { + "epoch": 1.6587344510546242, + "grad_norm": 1.3937638998031616, + "learning_rate": 1.6035059387237886e-05, + "loss": 2.0059, + "mean_token_accuracy": 0.5328143835067749, + "num_tokens": 3136384585.0, + "step": 6134 + }, + { + "epoch": 1.659004867495944, + "grad_norm": 1.1434365510940552, + "learning_rate": 1.6033752615802406e-05, + "loss": 1.8937, + "mean_token_accuracy": 0.5563899278640747, + "num_tokens": 3136908859.0, + "step": 6135 + }, + { + "epoch": 1.6592752839372635, + "grad_norm": 1.2811899185180664, + "learning_rate": 1.6032445689916588e-05, + "loss": 1.9556, + "mean_token_accuracy": 0.5601332187652588, + "num_tokens": 3137413974.0, + "step": 6136 + }, + { + "epoch": 1.6595457003785832, + "grad_norm": 1.3604190349578857, + "learning_rate": 1.6031138609620546e-05, + "loss": 1.9341, + "mean_token_accuracy": 0.5524961948394775, + "num_tokens": 3137886562.0, + "step": 6137 + }, + { + "epoch": 1.6598161168199026, + "grad_norm": 1.3585739135742188, + "learning_rate": 1.602983137495437e-05, + "loss": 2.0294, + "mean_token_accuracy": 0.5341746211051941, + "num_tokens": 3138410721.0, + "step": 6138 + }, + { + "epoch": 1.6600865332612222, + "grad_norm": 1.289708137512207, + "learning_rate": 1.6028523985958172e-05, + "loss": 1.9801, + "mean_token_accuracy": 0.5476665496826172, + "num_tokens": 3138934793.0, + "step": 6139 + }, + { + "epoch": 1.6603569497025419, + "grad_norm": 1.436004400253296, + "learning_rate": 1.6027216442672075e-05, + "loss": 1.9566, + "mean_token_accuracy": 0.5722278952598572, + "num_tokens": 3139459063.0, + "step": 6140 + }, + { + "epoch": 1.6606273661438615, + "grad_norm": 0.6681404709815979, + "learning_rate": 1.6025908745136192e-05, + "loss": 1.1747, + "mean_token_accuracy": 0.6835170388221741, + "num_tokens": 3139983245.0, + "step": 6141 + }, + { + "epoch": 1.6608977825851812, + "grad_norm": 1.393254280090332, + "learning_rate": 1.602460089339065e-05, + "loss": 1.8866, + "mean_token_accuracy": 0.5549225807189941, + "num_tokens": 3140507391.0, + "step": 6142 + }, + { + "epoch": 1.6611681990265008, + "grad_norm": 1.630773663520813, + "learning_rate": 1.6023292887475573e-05, + "loss": 2.0496, + "mean_token_accuracy": 0.5397913455963135, + "num_tokens": 3141031590.0, + "step": 6143 + }, + { + "epoch": 1.6614386154678205, + "grad_norm": 1.2358239889144897, + "learning_rate": 1.6021984727431102e-05, + "loss": 1.9854, + "mean_token_accuracy": 0.5633264780044556, + "num_tokens": 3141525466.0, + "step": 6144 + }, + { + "epoch": 1.66170903190914, + "grad_norm": 1.4792520999908447, + "learning_rate": 1.602067641329737e-05, + "loss": 2.2358, + "mean_token_accuracy": 0.5076345801353455, + "num_tokens": 3142049699.0, + "step": 6145 + }, + { + "epoch": 1.6619794483504597, + "grad_norm": 1.5866210460662842, + "learning_rate": 1.601936794511452e-05, + "loss": 2.0038, + "mean_token_accuracy": 0.5565946102142334, + "num_tokens": 3142573855.0, + "step": 6146 + }, + { + "epoch": 1.6622498647917794, + "grad_norm": 1.2232080698013306, + "learning_rate": 1.6018059322922698e-05, + "loss": 1.9923, + "mean_token_accuracy": 0.5372769832611084, + "num_tokens": 3143098012.0, + "step": 6147 + }, + { + "epoch": 1.6625202812330988, + "grad_norm": 1.505012035369873, + "learning_rate": 1.601675054676206e-05, + "loss": 2.0185, + "mean_token_accuracy": 0.5447825193405151, + "num_tokens": 3143622290.0, + "step": 6148 + }, + { + "epoch": 1.6627906976744184, + "grad_norm": 1.477692723274231, + "learning_rate": 1.6015441616672764e-05, + "loss": 2.0615, + "mean_token_accuracy": 0.5563858151435852, + "num_tokens": 3144091817.0, + "step": 6149 + }, + { + "epoch": 1.663061114115738, + "grad_norm": 1.339576244354248, + "learning_rate": 1.6014132532694966e-05, + "loss": 1.8501, + "mean_token_accuracy": 0.5714770555496216, + "num_tokens": 3144597815.0, + "step": 6150 + }, + { + "epoch": 1.6633315305570577, + "grad_norm": 1.327560544013977, + "learning_rate": 1.6012823294868842e-05, + "loss": 1.9555, + "mean_token_accuracy": 0.5480698943138123, + "num_tokens": 3145122064.0, + "step": 6151 + }, + { + "epoch": 1.6636019469983774, + "grad_norm": 1.3005874156951904, + "learning_rate": 1.6011513903234552e-05, + "loss": 1.9313, + "mean_token_accuracy": 0.5603311061859131, + "num_tokens": 3145646183.0, + "step": 6152 + }, + { + "epoch": 1.663872363439697, + "grad_norm": 1.3005869388580322, + "learning_rate": 1.6010204357832283e-05, + "loss": 1.9489, + "mean_token_accuracy": 0.5670864582061768, + "num_tokens": 3146170426.0, + "step": 6153 + }, + { + "epoch": 1.6641427798810167, + "grad_norm": 1.4673081636428833, + "learning_rate": 1.6008894658702204e-05, + "loss": 1.947, + "mean_token_accuracy": 0.5576424598693848, + "num_tokens": 3146694583.0, + "step": 6154 + }, + { + "epoch": 1.6644131963223363, + "grad_norm": 1.2636150121688843, + "learning_rate": 1.6007584805884512e-05, + "loss": 2.0347, + "mean_token_accuracy": 0.5559728741645813, + "num_tokens": 3147168116.0, + "step": 6155 + }, + { + "epoch": 1.664683612763656, + "grad_norm": 1.47940993309021, + "learning_rate": 1.6006274799419387e-05, + "loss": 2.0159, + "mean_token_accuracy": 0.5441642999649048, + "num_tokens": 3147692249.0, + "step": 6156 + }, + { + "epoch": 1.6649540292049756, + "grad_norm": 1.213855266571045, + "learning_rate": 1.6004964639347033e-05, + "loss": 1.9789, + "mean_token_accuracy": 0.5539544820785522, + "num_tokens": 3148216448.0, + "step": 6157 + }, + { + "epoch": 1.6652244456462952, + "grad_norm": 1.1491221189498901, + "learning_rate": 1.6003654325707643e-05, + "loss": 1.947, + "mean_token_accuracy": 0.5618928670883179, + "num_tokens": 3148740651.0, + "step": 6158 + }, + { + "epoch": 1.6654948620876149, + "grad_norm": 1.164468765258789, + "learning_rate": 1.600234385854142e-05, + "loss": 1.9501, + "mean_token_accuracy": 0.5619829893112183, + "num_tokens": 3149264886.0, + "step": 6159 + }, + { + "epoch": 1.6657652785289345, + "grad_norm": 1.0010656118392944, + "learning_rate": 1.6001033237888585e-05, + "loss": 1.8482, + "mean_token_accuracy": 0.5838093757629395, + "num_tokens": 3149789063.0, + "step": 6160 + }, + { + "epoch": 1.6660356949702542, + "grad_norm": 0.5778952240943909, + "learning_rate": 1.599972246378934e-05, + "loss": 1.1099, + "mean_token_accuracy": 0.7099592089653015, + "num_tokens": 3150313342.0, + "step": 6161 + }, + { + "epoch": 1.6663061114115738, + "grad_norm": 1.6439770460128784, + "learning_rate": 1.5998411536283904e-05, + "loss": 2.0108, + "mean_token_accuracy": 0.5422900319099426, + "num_tokens": 3150837590.0, + "step": 6162 + }, + { + "epoch": 1.6665765278528935, + "grad_norm": 1.26849365234375, + "learning_rate": 1.59971004554125e-05, + "loss": 1.7592, + "mean_token_accuracy": 0.584392786026001, + "num_tokens": 3151361584.0, + "step": 6163 + }, + { + "epoch": 1.666846944294213, + "grad_norm": 1.3910070657730103, + "learning_rate": 1.5995789221215363e-05, + "loss": 1.9539, + "mean_token_accuracy": 0.5684413909912109, + "num_tokens": 3151885650.0, + "step": 6164 + }, + { + "epoch": 1.6671173607355327, + "grad_norm": 1.571486473083496, + "learning_rate": 1.5994477833732722e-05, + "loss": 2.0039, + "mean_token_accuracy": 0.5284659266471863, + "num_tokens": 3152409933.0, + "step": 6165 + }, + { + "epoch": 1.6673877771768524, + "grad_norm": 1.4755895137786865, + "learning_rate": 1.5993166293004812e-05, + "loss": 1.869, + "mean_token_accuracy": 0.5770761370658875, + "num_tokens": 3152934070.0, + "step": 6166 + }, + { + "epoch": 1.667658193618172, + "grad_norm": 22.501585006713867, + "learning_rate": 1.5991854599071874e-05, + "loss": 1.6983, + "mean_token_accuracy": 0.6008812785148621, + "num_tokens": 3153458255.0, + "step": 6167 + }, + { + "epoch": 1.6679286100594917, + "grad_norm": 1.7321940660476685, + "learning_rate": 1.599054275197416e-05, + "loss": 1.6303, + "mean_token_accuracy": 0.6401262283325195, + "num_tokens": 3153982498.0, + "step": 6168 + }, + { + "epoch": 1.6681990265008113, + "grad_norm": 2.0355937480926514, + "learning_rate": 1.5989230751751917e-05, + "loss": 2.0453, + "mean_token_accuracy": 0.5389676690101624, + "num_tokens": 3154506753.0, + "step": 6169 + }, + { + "epoch": 1.668469442942131, + "grad_norm": 1.3557499647140503, + "learning_rate": 1.59879185984454e-05, + "loss": 1.9058, + "mean_token_accuracy": 0.540864109992981, + "num_tokens": 3154993879.0, + "step": 6170 + }, + { + "epoch": 1.6687398593834506, + "grad_norm": 1.2276058197021484, + "learning_rate": 1.5986606292094874e-05, + "loss": 1.9611, + "mean_token_accuracy": 0.545746922492981, + "num_tokens": 3155518067.0, + "step": 6171 + }, + { + "epoch": 1.6690102758247702, + "grad_norm": 1.7368583679199219, + "learning_rate": 1.5985293832740606e-05, + "loss": 1.9741, + "mean_token_accuracy": 0.5494717359542847, + "num_tokens": 3156042267.0, + "step": 6172 + }, + { + "epoch": 1.6692806922660899, + "grad_norm": 1.4257384538650513, + "learning_rate": 1.5983981220422855e-05, + "loss": 1.8343, + "mean_token_accuracy": 0.5926693081855774, + "num_tokens": 3156566540.0, + "step": 6173 + }, + { + "epoch": 1.6695511087074095, + "grad_norm": 1.3935420513153076, + "learning_rate": 1.598266845518191e-05, + "loss": 1.9745, + "mean_token_accuracy": 0.5759787559509277, + "num_tokens": 3157055061.0, + "step": 6174 + }, + { + "epoch": 1.6698215251487292, + "grad_norm": 1.4546338319778442, + "learning_rate": 1.5981355537058038e-05, + "loss": 2.0591, + "mean_token_accuracy": 0.5437911152839661, + "num_tokens": 3157517893.0, + "step": 6175 + }, + { + "epoch": 1.6700919415900488, + "grad_norm": 1.7301762104034424, + "learning_rate": 1.5980042466091532e-05, + "loss": 2.0056, + "mean_token_accuracy": 0.550324022769928, + "num_tokens": 3157993781.0, + "step": 6176 + }, + { + "epoch": 1.6703623580313685, + "grad_norm": 4.03642463684082, + "learning_rate": 1.5978729242322675e-05, + "loss": 1.8702, + "mean_token_accuracy": 0.5573979616165161, + "num_tokens": 3158518034.0, + "step": 6177 + }, + { + "epoch": 1.670632774472688, + "grad_norm": 1.5003737211227417, + "learning_rate": 1.5977415865791767e-05, + "loss": 1.918, + "mean_token_accuracy": 0.5408490896224976, + "num_tokens": 3159030521.0, + "step": 6178 + }, + { + "epoch": 1.6709031909140075, + "grad_norm": 1.8221545219421387, + "learning_rate": 1.5976102336539102e-05, + "loss": 1.9593, + "mean_token_accuracy": 0.5838271379470825, + "num_tokens": 3159453455.0, + "step": 6179 + }, + { + "epoch": 1.6711736073553272, + "grad_norm": 1.3609278202056885, + "learning_rate": 1.597478865460498e-05, + "loss": 1.8801, + "mean_token_accuracy": 0.5788866281509399, + "num_tokens": 3159977612.0, + "step": 6180 + }, + { + "epoch": 1.6714440237966468, + "grad_norm": 0.8411844968795776, + "learning_rate": 1.5973474820029717e-05, + "loss": 1.1729, + "mean_token_accuracy": 0.7146915197372437, + "num_tokens": 3160473621.0, + "step": 6181 + }, + { + "epoch": 1.6717144402379664, + "grad_norm": 1.9968781471252441, + "learning_rate": 1.5972160832853616e-05, + "loss": 1.7916, + "mean_token_accuracy": 0.5770409107208252, + "num_tokens": 3160997902.0, + "step": 6182 + }, + { + "epoch": 1.671984856679286, + "grad_norm": 2.205026388168335, + "learning_rate": 1.5970846693117e-05, + "loss": 2.0612, + "mean_token_accuracy": 0.5421369075775146, + "num_tokens": 3161522112.0, + "step": 6183 + }, + { + "epoch": 1.6722552731206057, + "grad_norm": 1.340267539024353, + "learning_rate": 1.5969532400860193e-05, + "loss": 2.0819, + "mean_token_accuracy": 0.525048553943634, + "num_tokens": 3162046376.0, + "step": 6184 + }, + { + "epoch": 1.6725256895619254, + "grad_norm": 1.4618161916732788, + "learning_rate": 1.5968217956123514e-05, + "loss": 2.0397, + "mean_token_accuracy": 0.5386291146278381, + "num_tokens": 3162570627.0, + "step": 6185 + }, + { + "epoch": 1.672796106003245, + "grad_norm": 1.7851253747940063, + "learning_rate": 1.5966903358947297e-05, + "loss": 1.9538, + "mean_token_accuracy": 0.5643444657325745, + "num_tokens": 3163094851.0, + "step": 6186 + }, + { + "epoch": 1.6730665224445647, + "grad_norm": 1.240419864654541, + "learning_rate": 1.596558860937188e-05, + "loss": 1.9072, + "mean_token_accuracy": 0.5614607930183411, + "num_tokens": 3163619079.0, + "step": 6187 + }, + { + "epoch": 1.6733369388858843, + "grad_norm": 1.8721890449523926, + "learning_rate": 1.5964273707437603e-05, + "loss": 1.9381, + "mean_token_accuracy": 0.5659114718437195, + "num_tokens": 3164143338.0, + "step": 6188 + }, + { + "epoch": 1.673607355327204, + "grad_norm": 1.628234624862671, + "learning_rate": 1.5962958653184807e-05, + "loss": 1.9299, + "mean_token_accuracy": 0.5551167726516724, + "num_tokens": 3164637648.0, + "step": 6189 + }, + { + "epoch": 1.6738777717685234, + "grad_norm": 1.3720550537109375, + "learning_rate": 1.5961643446653843e-05, + "loss": 1.9057, + "mean_token_accuracy": 0.5776999592781067, + "num_tokens": 3165161927.0, + "step": 6190 + }, + { + "epoch": 1.674148188209843, + "grad_norm": 1.5002460479736328, + "learning_rate": 1.596032808788507e-05, + "loss": 1.9113, + "mean_token_accuracy": 0.5676292181015015, + "num_tokens": 3165686211.0, + "step": 6191 + }, + { + "epoch": 1.6744186046511627, + "grad_norm": 1.5996898412704468, + "learning_rate": 1.5959012576918843e-05, + "loss": 2.0175, + "mean_token_accuracy": 0.5636220574378967, + "num_tokens": 3166174050.0, + "step": 6192 + }, + { + "epoch": 1.6746890210924823, + "grad_norm": 1.7187198400497437, + "learning_rate": 1.5957696913795526e-05, + "loss": 2.1397, + "mean_token_accuracy": 0.49901077151298523, + "num_tokens": 3166698289.0, + "step": 6193 + }, + { + "epoch": 1.674959437533802, + "grad_norm": 1.2101707458496094, + "learning_rate": 1.595638109855549e-05, + "loss": 1.9219, + "mean_token_accuracy": 0.570014476776123, + "num_tokens": 3167195388.0, + "step": 6194 + }, + { + "epoch": 1.6752298539751216, + "grad_norm": 1.3981610536575317, + "learning_rate": 1.5955065131239102e-05, + "loss": 1.9262, + "mean_token_accuracy": 0.5648103952407837, + "num_tokens": 3167719505.0, + "step": 6195 + }, + { + "epoch": 1.6755002704164412, + "grad_norm": 1.1953389644622803, + "learning_rate": 1.5953749011886746e-05, + "loss": 2.0074, + "mean_token_accuracy": 0.5507001876831055, + "num_tokens": 3168243712.0, + "step": 6196 + }, + { + "epoch": 1.6757706868577609, + "grad_norm": 1.2966032028198242, + "learning_rate": 1.5952432740538802e-05, + "loss": 1.8331, + "mean_token_accuracy": 0.5811975002288818, + "num_tokens": 3168767900.0, + "step": 6197 + }, + { + "epoch": 1.6760411032990805, + "grad_norm": 1.3511747121810913, + "learning_rate": 1.595111631723566e-05, + "loss": 1.9931, + "mean_token_accuracy": 0.5623416900634766, + "num_tokens": 3169235578.0, + "step": 6198 + }, + { + "epoch": 1.6763115197404002, + "grad_norm": 1.2640479803085327, + "learning_rate": 1.5949799742017706e-05, + "loss": 1.8397, + "mean_token_accuracy": 0.5736485123634338, + "num_tokens": 3169759688.0, + "step": 6199 + }, + { + "epoch": 1.6765819361817198, + "grad_norm": 1.3524898290634155, + "learning_rate": 1.594848301492534e-05, + "loss": 1.9543, + "mean_token_accuracy": 0.5555471181869507, + "num_tokens": 3170283954.0, + "step": 6200 + }, + { + "epoch": 1.6768523526230394, + "grad_norm": 0.6635172367095947, + "learning_rate": 1.5947166135998965e-05, + "loss": 1.0523, + "mean_token_accuracy": 0.7121703028678894, + "num_tokens": 3170752137.0, + "step": 6201 + }, + { + "epoch": 1.677122769064359, + "grad_norm": 1.5405175685882568, + "learning_rate": 1.5945849105278982e-05, + "loss": 1.8495, + "mean_token_accuracy": 0.5600390434265137, + "num_tokens": 3171248285.0, + "step": 6202 + }, + { + "epoch": 1.6773931855056787, + "grad_norm": 1.3286762237548828, + "learning_rate": 1.5944531922805802e-05, + "loss": 1.9944, + "mean_token_accuracy": 0.5511953234672546, + "num_tokens": 3171770385.0, + "step": 6203 + }, + { + "epoch": 1.6776636019469984, + "grad_norm": 1.1691161394119263, + "learning_rate": 1.5943214588619843e-05, + "loss": 2.0252, + "mean_token_accuracy": 0.5406802892684937, + "num_tokens": 3172279684.0, + "step": 6204 + }, + { + "epoch": 1.677934018388318, + "grad_norm": 1.2470413446426392, + "learning_rate": 1.5941897102761525e-05, + "loss": 1.8544, + "mean_token_accuracy": 0.5788764953613281, + "num_tokens": 3172771207.0, + "step": 6205 + }, + { + "epoch": 1.6782044348296377, + "grad_norm": 1.4061307907104492, + "learning_rate": 1.594057946527127e-05, + "loss": 2.0169, + "mean_token_accuracy": 0.5531341433525085, + "num_tokens": 3173295366.0, + "step": 6206 + }, + { + "epoch": 1.6784748512709573, + "grad_norm": 1.4791316986083984, + "learning_rate": 1.5939261676189507e-05, + "loss": 2.1245, + "mean_token_accuracy": 0.5264788866043091, + "num_tokens": 3173819636.0, + "step": 6207 + }, + { + "epoch": 1.678745267712277, + "grad_norm": 1.3184698820114136, + "learning_rate": 1.5937943735556672e-05, + "loss": 1.9789, + "mean_token_accuracy": 0.5530359745025635, + "num_tokens": 3174343763.0, + "step": 6208 + }, + { + "epoch": 1.6790156841535966, + "grad_norm": 1.4749702215194702, + "learning_rate": 1.59366256434132e-05, + "loss": 2.0723, + "mean_token_accuracy": 0.5402615666389465, + "num_tokens": 3174867928.0, + "step": 6209 + }, + { + "epoch": 1.6792861005949162, + "grad_norm": 1.3019760847091675, + "learning_rate": 1.5935307399799536e-05, + "loss": 1.8349, + "mean_token_accuracy": 0.5760612487792969, + "num_tokens": 3175354353.0, + "step": 6210 + }, + { + "epoch": 1.6795565170362359, + "grad_norm": 1.4110075235366821, + "learning_rate": 1.593398900475613e-05, + "loss": 1.9709, + "mean_token_accuracy": 0.553367555141449, + "num_tokens": 3175815758.0, + "step": 6211 + }, + { + "epoch": 1.6798269334775555, + "grad_norm": 1.3189105987548828, + "learning_rate": 1.5932670458323428e-05, + "loss": 1.9833, + "mean_token_accuracy": 0.5440798997879028, + "num_tokens": 3176339946.0, + "step": 6212 + }, + { + "epoch": 1.6800973499188752, + "grad_norm": 1.356552004814148, + "learning_rate": 1.593135176054189e-05, + "loss": 1.8793, + "mean_token_accuracy": 0.5765812993049622, + "num_tokens": 3176794238.0, + "step": 6213 + }, + { + "epoch": 1.6803677663601948, + "grad_norm": 1.3569220304489136, + "learning_rate": 1.593003291145198e-05, + "loss": 1.9977, + "mean_token_accuracy": 0.5335115194320679, + "num_tokens": 3177318524.0, + "step": 6214 + }, + { + "epoch": 1.6806381828015144, + "grad_norm": 1.2165300846099854, + "learning_rate": 1.592871391109416e-05, + "loss": 1.9042, + "mean_token_accuracy": 0.567406177520752, + "num_tokens": 3177842673.0, + "step": 6215 + }, + { + "epoch": 1.680908599242834, + "grad_norm": 0.9786428213119507, + "learning_rate": 1.5927394759508904e-05, + "loss": 1.8933, + "mean_token_accuracy": 0.5724530220031738, + "num_tokens": 3178322054.0, + "step": 6216 + }, + { + "epoch": 1.6811790156841537, + "grad_norm": 1.3507484197616577, + "learning_rate": 1.5926075456736684e-05, + "loss": 2.0187, + "mean_token_accuracy": 0.5295912623405457, + "num_tokens": 3178846196.0, + "step": 6217 + }, + { + "epoch": 1.6814494321254734, + "grad_norm": 1.6585379838943481, + "learning_rate": 1.5924756002817987e-05, + "loss": 2.0728, + "mean_token_accuracy": 0.5500286817550659, + "num_tokens": 3179370429.0, + "step": 6218 + }, + { + "epoch": 1.681719848566793, + "grad_norm": 1.5151749849319458, + "learning_rate": 1.592343639779329e-05, + "loss": 2.0964, + "mean_token_accuracy": 0.535856306552887, + "num_tokens": 3179894694.0, + "step": 6219 + }, + { + "epoch": 1.6819902650081124, + "grad_norm": 1.5108484029769897, + "learning_rate": 1.5922116641703086e-05, + "loss": 1.9806, + "mean_token_accuracy": 0.5458459258079529, + "num_tokens": 3180367609.0, + "step": 6220 + }, + { + "epoch": 1.682260681449432, + "grad_norm": 0.5365947484970093, + "learning_rate": 1.5920796734587867e-05, + "loss": 1.1538, + "mean_token_accuracy": 0.6931661367416382, + "num_tokens": 3180891875.0, + "step": 6221 + }, + { + "epoch": 1.6825310978907517, + "grad_norm": 1.5210479497909546, + "learning_rate": 1.5919476676488135e-05, + "loss": 2.0024, + "mean_token_accuracy": 0.537366509437561, + "num_tokens": 3181416123.0, + "step": 6222 + }, + { + "epoch": 1.6828015143320714, + "grad_norm": 1.3760286569595337, + "learning_rate": 1.591815646744439e-05, + "loss": 1.9195, + "mean_token_accuracy": 0.5645919442176819, + "num_tokens": 3181940358.0, + "step": 6223 + }, + { + "epoch": 1.683071930773391, + "grad_norm": 1.0653252601623535, + "learning_rate": 1.5916836107497143e-05, + "loss": 2.0013, + "mean_token_accuracy": 0.5458613038063049, + "num_tokens": 3182464431.0, + "step": 6224 + }, + { + "epoch": 1.6833423472147107, + "grad_norm": 1.5220118761062622, + "learning_rate": 1.5915515596686903e-05, + "loss": 2.0446, + "mean_token_accuracy": 0.531571626663208, + "num_tokens": 3182988683.0, + "step": 6225 + }, + { + "epoch": 1.6836127636560303, + "grad_norm": 1.4488462209701538, + "learning_rate": 1.591419493505419e-05, + "loss": 2.0266, + "mean_token_accuracy": 0.5386644601821899, + "num_tokens": 3183512902.0, + "step": 6226 + }, + { + "epoch": 1.68388318009735, + "grad_norm": 1.2443910837173462, + "learning_rate": 1.5912874122639522e-05, + "loss": 2.054, + "mean_token_accuracy": 0.5393123030662537, + "num_tokens": 3184036977.0, + "step": 6227 + }, + { + "epoch": 1.6841535965386696, + "grad_norm": 1.2845958471298218, + "learning_rate": 1.5911553159483432e-05, + "loss": 2.0624, + "mean_token_accuracy": 0.5456528663635254, + "num_tokens": 3184561235.0, + "step": 6228 + }, + { + "epoch": 1.6844240129799892, + "grad_norm": 1.4723745584487915, + "learning_rate": 1.5910232045626446e-05, + "loss": 1.8302, + "mean_token_accuracy": 0.6050900220870972, + "num_tokens": 3185085398.0, + "step": 6229 + }, + { + "epoch": 1.6846944294213089, + "grad_norm": 1.1736421585083008, + "learning_rate": 1.5908910781109098e-05, + "loss": 1.9657, + "mean_token_accuracy": 0.5492295026779175, + "num_tokens": 3185609684.0, + "step": 6230 + }, + { + "epoch": 1.6849648458626283, + "grad_norm": 1.006795048713684, + "learning_rate": 1.5907589365971935e-05, + "loss": 1.996, + "mean_token_accuracy": 0.5286211371421814, + "num_tokens": 3186133924.0, + "step": 6231 + }, + { + "epoch": 1.685235262303948, + "grad_norm": 1.4645452499389648, + "learning_rate": 1.5906267800255495e-05, + "loss": 2.058, + "mean_token_accuracy": 0.5527195930480957, + "num_tokens": 3186510050.0, + "step": 6232 + }, + { + "epoch": 1.6855056787452676, + "grad_norm": 1.392899990081787, + "learning_rate": 1.5904946084000335e-05, + "loss": 2.0125, + "mean_token_accuracy": 0.5586923360824585, + "num_tokens": 3186971507.0, + "step": 6233 + }, + { + "epoch": 1.6857760951865872, + "grad_norm": 1.4339888095855713, + "learning_rate": 1.5903624217247e-05, + "loss": 1.9218, + "mean_token_accuracy": 0.5605602264404297, + "num_tokens": 3187495696.0, + "step": 6234 + }, + { + "epoch": 1.6860465116279069, + "grad_norm": 1.2161952257156372, + "learning_rate": 1.5902302200036054e-05, + "loss": 1.8808, + "mean_token_accuracy": 0.5676907300949097, + "num_tokens": 3187906105.0, + "step": 6235 + }, + { + "epoch": 1.6863169280692265, + "grad_norm": 1.3371466398239136, + "learning_rate": 1.5900980032408058e-05, + "loss": 2.043, + "mean_token_accuracy": 0.5428258180618286, + "num_tokens": 3188430333.0, + "step": 6236 + }, + { + "epoch": 1.6865873445105461, + "grad_norm": 1.35757577419281, + "learning_rate": 1.589965771440359e-05, + "loss": 1.8994, + "mean_token_accuracy": 0.5456447601318359, + "num_tokens": 3188942187.0, + "step": 6237 + }, + { + "epoch": 1.6868577609518658, + "grad_norm": 1.348811388015747, + "learning_rate": 1.5898335246063206e-05, + "loss": 1.9979, + "mean_token_accuracy": 0.5377616882324219, + "num_tokens": 3189439237.0, + "step": 6238 + }, + { + "epoch": 1.6871281773931854, + "grad_norm": 1.3653357028961182, + "learning_rate": 1.5897012627427492e-05, + "loss": 2.0164, + "mean_token_accuracy": 0.552169919013977, + "num_tokens": 3189942514.0, + "step": 6239 + }, + { + "epoch": 1.687398593834505, + "grad_norm": 1.310562252998352, + "learning_rate": 1.5895689858537035e-05, + "loss": 1.9753, + "mean_token_accuracy": 0.5425752401351929, + "num_tokens": 3190466695.0, + "step": 6240 + }, + { + "epoch": 1.6876690102758247, + "grad_norm": 0.768414318561554, + "learning_rate": 1.5894366939432412e-05, + "loss": 1.1496, + "mean_token_accuracy": 0.6894338130950928, + "num_tokens": 3190957007.0, + "step": 6241 + }, + { + "epoch": 1.6879394267171444, + "grad_norm": 2.3623790740966797, + "learning_rate": 1.5893043870154215e-05, + "loss": 2.031, + "mean_token_accuracy": 0.5640628337860107, + "num_tokens": 3191387526.0, + "step": 6242 + }, + { + "epoch": 1.688209843158464, + "grad_norm": 1.7669070959091187, + "learning_rate": 1.589172065074305e-05, + "loss": 2.05, + "mean_token_accuracy": 0.5505357980728149, + "num_tokens": 3191911769.0, + "step": 6243 + }, + { + "epoch": 1.6884802595997837, + "grad_norm": 1.4790445566177368, + "learning_rate": 1.5890397281239504e-05, + "loss": 2.0072, + "mean_token_accuracy": 0.5447678565979004, + "num_tokens": 3192435960.0, + "step": 6244 + }, + { + "epoch": 1.6887506760411033, + "grad_norm": 1.765041708946228, + "learning_rate": 1.588907376168419e-05, + "loss": 2.0433, + "mean_token_accuracy": 0.5487369894981384, + "num_tokens": 3192960139.0, + "step": 6245 + }, + { + "epoch": 1.689021092482423, + "grad_norm": 1.316029667854309, + "learning_rate": 1.5887750092117717e-05, + "loss": 1.8931, + "mean_token_accuracy": 0.5731081366539001, + "num_tokens": 3193458033.0, + "step": 6246 + }, + { + "epoch": 1.6892915089237426, + "grad_norm": 1.2609329223632812, + "learning_rate": 1.5886426272580693e-05, + "loss": 2.031, + "mean_token_accuracy": 0.5546361207962036, + "num_tokens": 3193982157.0, + "step": 6247 + }, + { + "epoch": 1.6895619253650622, + "grad_norm": 1.35007905960083, + "learning_rate": 1.5885102303113742e-05, + "loss": 1.9004, + "mean_token_accuracy": 0.5597995519638062, + "num_tokens": 3194506425.0, + "step": 6248 + }, + { + "epoch": 1.6898323418063819, + "grad_norm": 1.3905550241470337, + "learning_rate": 1.5883778183757488e-05, + "loss": 2.0302, + "mean_token_accuracy": 0.5457642078399658, + "num_tokens": 3195030630.0, + "step": 6249 + }, + { + "epoch": 1.6901027582477015, + "grad_norm": 1.303787112236023, + "learning_rate": 1.5882453914552556e-05, + "loss": 2.0627, + "mean_token_accuracy": 0.539228081703186, + "num_tokens": 3195506195.0, + "step": 6250 + }, + { + "epoch": 1.6903731746890212, + "grad_norm": 1.3879427909851074, + "learning_rate": 1.588112949553958e-05, + "loss": 1.8812, + "mean_token_accuracy": 0.5819821953773499, + "num_tokens": 3196030455.0, + "step": 6251 + }, + { + "epoch": 1.6906435911303408, + "grad_norm": 1.325277328491211, + "learning_rate": 1.5879804926759196e-05, + "loss": 1.9515, + "mean_token_accuracy": 0.5537893176078796, + "num_tokens": 3196554663.0, + "step": 6252 + }, + { + "epoch": 1.6909140075716604, + "grad_norm": 1.4121655225753784, + "learning_rate": 1.5878480208252048e-05, + "loss": 1.9685, + "mean_token_accuracy": 0.5537682175636292, + "num_tokens": 3197078898.0, + "step": 6253 + }, + { + "epoch": 1.69118442401298, + "grad_norm": 1.431918740272522, + "learning_rate": 1.5877155340058776e-05, + "loss": 1.9612, + "mean_token_accuracy": 0.5690641403198242, + "num_tokens": 3197597365.0, + "step": 6254 + }, + { + "epoch": 1.6914548404542997, + "grad_norm": 1.7161732912063599, + "learning_rate": 1.587583032222004e-05, + "loss": 2.0191, + "mean_token_accuracy": 0.5306062698364258, + "num_tokens": 3198121612.0, + "step": 6255 + }, + { + "epoch": 1.6917252568956194, + "grad_norm": 1.3210569620132446, + "learning_rate": 1.5874505154776486e-05, + "loss": 1.9941, + "mean_token_accuracy": 0.5457839965820312, + "num_tokens": 3198645857.0, + "step": 6256 + }, + { + "epoch": 1.691995673336939, + "grad_norm": 1.3667937517166138, + "learning_rate": 1.5873179837768784e-05, + "loss": 1.7841, + "mean_token_accuracy": 0.5786122679710388, + "num_tokens": 3199169879.0, + "step": 6257 + }, + { + "epoch": 1.6922660897782587, + "grad_norm": 1.852988839149475, + "learning_rate": 1.587185437123759e-05, + "loss": 2.0447, + "mean_token_accuracy": 0.5313015580177307, + "num_tokens": 3199694104.0, + "step": 6258 + }, + { + "epoch": 1.6925365062195783, + "grad_norm": 1.4904073476791382, + "learning_rate": 1.587052875522358e-05, + "loss": 2.0371, + "mean_token_accuracy": 0.5182283520698547, + "num_tokens": 3200218196.0, + "step": 6259 + }, + { + "epoch": 1.692806922660898, + "grad_norm": 1.4907748699188232, + "learning_rate": 1.5869202989767424e-05, + "loss": 2.045, + "mean_token_accuracy": 0.541318416595459, + "num_tokens": 3200742394.0, + "step": 6260 + }, + { + "epoch": 1.6930773391022174, + "grad_norm": 0.7328980565071106, + "learning_rate": 1.58678770749098e-05, + "loss": 1.0862, + "mean_token_accuracy": 0.7064278721809387, + "num_tokens": 3201266607.0, + "step": 6261 + }, + { + "epoch": 1.693347755543537, + "grad_norm": 2.159060478210449, + "learning_rate": 1.586655101069139e-05, + "loss": 1.9226, + "mean_token_accuracy": 0.5551311373710632, + "num_tokens": 3201790811.0, + "step": 6262 + }, + { + "epoch": 1.6936181719848566, + "grad_norm": 1.724767804145813, + "learning_rate": 1.586522479715289e-05, + "loss": 1.953, + "mean_token_accuracy": 0.5562260150909424, + "num_tokens": 3202314967.0, + "step": 6263 + }, + { + "epoch": 1.6938885884261763, + "grad_norm": 1.2490919828414917, + "learning_rate": 1.5863898434334975e-05, + "loss": 1.9567, + "mean_token_accuracy": 0.556833028793335, + "num_tokens": 3202839187.0, + "step": 6264 + }, + { + "epoch": 1.694159004867496, + "grad_norm": 1.5067307949066162, + "learning_rate": 1.5862571922278357e-05, + "loss": 1.9076, + "mean_token_accuracy": 0.5739458799362183, + "num_tokens": 3203363460.0, + "step": 6265 + }, + { + "epoch": 1.6944294213088156, + "grad_norm": 1.4167953729629517, + "learning_rate": 1.5861245261023736e-05, + "loss": 1.9475, + "mean_token_accuracy": 0.5609621405601501, + "num_tokens": 3203850856.0, + "step": 6266 + }, + { + "epoch": 1.6946998377501352, + "grad_norm": 1.940042495727539, + "learning_rate": 1.585991845061181e-05, + "loss": 1.9835, + "mean_token_accuracy": 0.5766966342926025, + "num_tokens": 3204297956.0, + "step": 6267 + }, + { + "epoch": 1.6949702541914549, + "grad_norm": 1.7349822521209717, + "learning_rate": 1.58585914910833e-05, + "loss": 1.9942, + "mean_token_accuracy": 0.5418190956115723, + "num_tokens": 3204822135.0, + "step": 6268 + }, + { + "epoch": 1.6952406706327745, + "grad_norm": 1.3575509786605835, + "learning_rate": 1.5857264382478906e-05, + "loss": 2.0153, + "mean_token_accuracy": 0.5549451112747192, + "num_tokens": 3205305170.0, + "step": 6269 + }, + { + "epoch": 1.6955110870740941, + "grad_norm": 1.398919701576233, + "learning_rate": 1.585593712483936e-05, + "loss": 2.0389, + "mean_token_accuracy": 0.5456642508506775, + "num_tokens": 3205829230.0, + "step": 6270 + }, + { + "epoch": 1.6957815035154138, + "grad_norm": 1.3606418371200562, + "learning_rate": 1.5854609718205382e-05, + "loss": 1.9583, + "mean_token_accuracy": 0.5540526509284973, + "num_tokens": 3206326361.0, + "step": 6271 + }, + { + "epoch": 1.6960519199567332, + "grad_norm": 1.2494549751281738, + "learning_rate": 1.5853282162617705e-05, + "loss": 1.8875, + "mean_token_accuracy": 0.5500410795211792, + "num_tokens": 3206850574.0, + "step": 6272 + }, + { + "epoch": 1.6963223363980529, + "grad_norm": 1.3792961835861206, + "learning_rate": 1.5851954458117055e-05, + "loss": 1.8867, + "mean_token_accuracy": 0.5761557817459106, + "num_tokens": 3207310973.0, + "step": 6273 + }, + { + "epoch": 1.6965927528393725, + "grad_norm": 1.640857458114624, + "learning_rate": 1.585062660474418e-05, + "loss": 1.994, + "mean_token_accuracy": 0.5485144257545471, + "num_tokens": 3207835213.0, + "step": 6274 + }, + { + "epoch": 1.6968631692806921, + "grad_norm": 1.9051802158355713, + "learning_rate": 1.584929860253981e-05, + "loss": 1.9398, + "mean_token_accuracy": 0.5604549646377563, + "num_tokens": 3208340649.0, + "step": 6275 + }, + { + "epoch": 1.6971335857220118, + "grad_norm": 1.2480757236480713, + "learning_rate": 1.5847970451544695e-05, + "loss": 1.9004, + "mean_token_accuracy": 0.5704308748245239, + "num_tokens": 3208864918.0, + "step": 6276 + }, + { + "epoch": 1.6974040021633314, + "grad_norm": 1.4163289070129395, + "learning_rate": 1.5846642151799597e-05, + "loss": 1.985, + "mean_token_accuracy": 0.551520049571991, + "num_tokens": 3209389198.0, + "step": 6277 + }, + { + "epoch": 1.697674418604651, + "grad_norm": 1.310711145401001, + "learning_rate": 1.584531370334526e-05, + "loss": 1.8806, + "mean_token_accuracy": 0.5711308121681213, + "num_tokens": 3209913460.0, + "step": 6278 + }, + { + "epoch": 1.6979448350459707, + "grad_norm": 1.3650871515274048, + "learning_rate": 1.584398510622245e-05, + "loss": 2.0318, + "mean_token_accuracy": 0.5313934683799744, + "num_tokens": 3210437680.0, + "step": 6279 + }, + { + "epoch": 1.6982152514872904, + "grad_norm": 1.7188032865524292, + "learning_rate": 1.5842656360471933e-05, + "loss": 2.0649, + "mean_token_accuracy": 0.5355713963508606, + "num_tokens": 3210954049.0, + "step": 6280 + }, + { + "epoch": 1.69848566792861, + "grad_norm": 0.5880394577980042, + "learning_rate": 1.5841327466134476e-05, + "loss": 1.0525, + "mean_token_accuracy": 0.7132312655448914, + "num_tokens": 3211434729.0, + "step": 6281 + }, + { + "epoch": 1.6987560843699296, + "grad_norm": 1.7127692699432373, + "learning_rate": 1.5839998423250856e-05, + "loss": 1.9087, + "mean_token_accuracy": 0.573052167892456, + "num_tokens": 3211925333.0, + "step": 6282 + }, + { + "epoch": 1.6990265008112493, + "grad_norm": 1.5163676738739014, + "learning_rate": 1.5838669231861852e-05, + "loss": 1.9718, + "mean_token_accuracy": 0.5502862930297852, + "num_tokens": 3212449569.0, + "step": 6283 + }, + { + "epoch": 1.699296917252569, + "grad_norm": 1.241490364074707, + "learning_rate": 1.5837339892008247e-05, + "loss": 1.9768, + "mean_token_accuracy": 0.5642791390419006, + "num_tokens": 3212973745.0, + "step": 6284 + }, + { + "epoch": 1.6995673336938886, + "grad_norm": 1.204668641090393, + "learning_rate": 1.5836010403730827e-05, + "loss": 1.8862, + "mean_token_accuracy": 0.5693271160125732, + "num_tokens": 3213459119.0, + "step": 6285 + }, + { + "epoch": 1.6998377501352082, + "grad_norm": 1.259186029434204, + "learning_rate": 1.5834680767070382e-05, + "loss": 2.0039, + "mean_token_accuracy": 0.5451575517654419, + "num_tokens": 3213958091.0, + "step": 6286 + }, + { + "epoch": 1.7001081665765279, + "grad_norm": 1.4038323163986206, + "learning_rate": 1.583335098206772e-05, + "loss": 2.0308, + "mean_token_accuracy": 0.5230611562728882, + "num_tokens": 3214482362.0, + "step": 6287 + }, + { + "epoch": 1.7003785830178475, + "grad_norm": 1.5324344635009766, + "learning_rate": 1.583202104876363e-05, + "loss": 1.9417, + "mean_token_accuracy": 0.5654879808425903, + "num_tokens": 3215006559.0, + "step": 6288 + }, + { + "epoch": 1.7006489994591671, + "grad_norm": 1.4175515174865723, + "learning_rate": 1.5830690967198926e-05, + "loss": 1.9755, + "mean_token_accuracy": 0.5470784902572632, + "num_tokens": 3215530838.0, + "step": 6289 + }, + { + "epoch": 1.7009194159004868, + "grad_norm": 1.4516913890838623, + "learning_rate": 1.582936073741442e-05, + "loss": 1.944, + "mean_token_accuracy": 0.5605630874633789, + "num_tokens": 3215999237.0, + "step": 6290 + }, + { + "epoch": 1.7011898323418064, + "grad_norm": 1.2209253311157227, + "learning_rate": 1.582803035945092e-05, + "loss": 2.0083, + "mean_token_accuracy": 0.5513678789138794, + "num_tokens": 3216523479.0, + "step": 6291 + }, + { + "epoch": 1.701460248783126, + "grad_norm": 1.1740515232086182, + "learning_rate": 1.5826699833349255e-05, + "loss": 1.8624, + "mean_token_accuracy": 0.5772693157196045, + "num_tokens": 3217047725.0, + "step": 6292 + }, + { + "epoch": 1.7017306652244457, + "grad_norm": 1.1274044513702393, + "learning_rate": 1.5825369159150245e-05, + "loss": 1.9329, + "mean_token_accuracy": 0.5545511841773987, + "num_tokens": 3217571880.0, + "step": 6293 + }, + { + "epoch": 1.7020010816657654, + "grad_norm": 1.0044370889663696, + "learning_rate": 1.582403833689471e-05, + "loss": 1.8562, + "mean_token_accuracy": 0.5556653738021851, + "num_tokens": 3218096084.0, + "step": 6294 + }, + { + "epoch": 1.702271498107085, + "grad_norm": 1.1765103340148926, + "learning_rate": 1.58227073666235e-05, + "loss": 1.9058, + "mean_token_accuracy": 0.5545556545257568, + "num_tokens": 3218620182.0, + "step": 6295 + }, + { + "epoch": 1.7025419145484046, + "grad_norm": 1.197130799293518, + "learning_rate": 1.5821376248377448e-05, + "loss": 1.936, + "mean_token_accuracy": 0.545699954032898, + "num_tokens": 3219144308.0, + "step": 6296 + }, + { + "epoch": 1.7028123309897243, + "grad_norm": 1.2862434387207031, + "learning_rate": 1.5820044982197388e-05, + "loss": 2.0223, + "mean_token_accuracy": 0.5361253023147583, + "num_tokens": 3219668364.0, + "step": 6297 + }, + { + "epoch": 1.703082747431044, + "grad_norm": 1.3886911869049072, + "learning_rate": 1.581871356812417e-05, + "loss": 1.8625, + "mean_token_accuracy": 0.5571904182434082, + "num_tokens": 3220192614.0, + "step": 6298 + }, + { + "epoch": 1.7033531638723636, + "grad_norm": 1.6462801694869995, + "learning_rate": 1.5817382006198658e-05, + "loss": 1.9897, + "mean_token_accuracy": 0.5484527349472046, + "num_tokens": 3220716887.0, + "step": 6299 + }, + { + "epoch": 1.7036235803136832, + "grad_norm": 1.4110764265060425, + "learning_rate": 1.5816050296461693e-05, + "loss": 2.0679, + "mean_token_accuracy": 0.5280715823173523, + "num_tokens": 3221192356.0, + "step": 6300 + }, + { + "epoch": 1.7038939967550029, + "grad_norm": 0.6222073435783386, + "learning_rate": 1.5814718438954143e-05, + "loss": 1.1158, + "mean_token_accuracy": 0.6986911296844482, + "num_tokens": 3221716533.0, + "step": 6301 + }, + { + "epoch": 1.7041644131963225, + "grad_norm": 1.3573544025421143, + "learning_rate": 1.5813386433716876e-05, + "loss": 1.9309, + "mean_token_accuracy": 0.5553004145622253, + "num_tokens": 3222240780.0, + "step": 6302 + }, + { + "epoch": 1.704434829637642, + "grad_norm": 1.471104383468628, + "learning_rate": 1.5812054280790753e-05, + "loss": 1.9341, + "mean_token_accuracy": 0.5736140012741089, + "num_tokens": 3222709655.0, + "step": 6303 + }, + { + "epoch": 1.7047052460789616, + "grad_norm": 1.1821978092193604, + "learning_rate": 1.5810721980216652e-05, + "loss": 2.0037, + "mean_token_accuracy": 0.5498438477516174, + "num_tokens": 3223233915.0, + "step": 6304 + }, + { + "epoch": 1.7049756625202812, + "grad_norm": 1.3154855966567993, + "learning_rate": 1.580938953203546e-05, + "loss": 1.9605, + "mean_token_accuracy": 0.5665625333786011, + "num_tokens": 3223757997.0, + "step": 6305 + }, + { + "epoch": 1.7052460789616009, + "grad_norm": 1.233382225036621, + "learning_rate": 1.5808056936288046e-05, + "loss": 1.9387, + "mean_token_accuracy": 0.5749732851982117, + "num_tokens": 3224282111.0, + "step": 6306 + }, + { + "epoch": 1.7055164954029205, + "grad_norm": 1.1663166284561157, + "learning_rate": 1.580672419301531e-05, + "loss": 2.007, + "mean_token_accuracy": 0.5491786003112793, + "num_tokens": 3224806372.0, + "step": 6307 + }, + { + "epoch": 1.7057869118442401, + "grad_norm": 1.2028061151504517, + "learning_rate": 1.5805391302258142e-05, + "loss": 1.8117, + "mean_token_accuracy": 0.5632381439208984, + "num_tokens": 3225330540.0, + "step": 6308 + }, + { + "epoch": 1.7060573282855598, + "grad_norm": 1.2988184690475464, + "learning_rate": 1.5804058264057434e-05, + "loss": 1.9262, + "mean_token_accuracy": 0.5646994113922119, + "num_tokens": 3225854655.0, + "step": 6309 + }, + { + "epoch": 1.7063277447268794, + "grad_norm": 1.0937706232070923, + "learning_rate": 1.580272507845409e-05, + "loss": 1.9713, + "mean_token_accuracy": 0.5528427958488464, + "num_tokens": 3226378917.0, + "step": 6310 + }, + { + "epoch": 1.706598161168199, + "grad_norm": 1.155580759048462, + "learning_rate": 1.5801391745489018e-05, + "loss": 1.8799, + "mean_token_accuracy": 0.5778712630271912, + "num_tokens": 3226903018.0, + "step": 6311 + }, + { + "epoch": 1.7068685776095187, + "grad_norm": 1.1320667266845703, + "learning_rate": 1.5800058265203126e-05, + "loss": 1.8987, + "mean_token_accuracy": 0.5506771802902222, + "num_tokens": 3227368301.0, + "step": 6312 + }, + { + "epoch": 1.7071389940508381, + "grad_norm": 1.0831902027130127, + "learning_rate": 1.5798724637637333e-05, + "loss": 1.8547, + "mean_token_accuracy": 0.5622420310974121, + "num_tokens": 3227892514.0, + "step": 6313 + }, + { + "epoch": 1.7074094104921578, + "grad_norm": 1.21109139919281, + "learning_rate": 1.5797390862832553e-05, + "loss": 1.9589, + "mean_token_accuracy": 0.5449814796447754, + "num_tokens": 3228416702.0, + "step": 6314 + }, + { + "epoch": 1.7076798269334774, + "grad_norm": 1.198935627937317, + "learning_rate": 1.579605694082972e-05, + "loss": 2.0148, + "mean_token_accuracy": 0.5495235323905945, + "num_tokens": 3228890891.0, + "step": 6315 + }, + { + "epoch": 1.707950243374797, + "grad_norm": 1.411885380744934, + "learning_rate": 1.5794722871669747e-05, + "loss": 1.991, + "mean_token_accuracy": 0.5586246252059937, + "num_tokens": 3229415000.0, + "step": 6316 + }, + { + "epoch": 1.7082206598161167, + "grad_norm": 1.2708125114440918, + "learning_rate": 1.5793388655393577e-05, + "loss": 1.9557, + "mean_token_accuracy": 0.5741263628005981, + "num_tokens": 3229907968.0, + "step": 6317 + }, + { + "epoch": 1.7084910762574363, + "grad_norm": 1.2558549642562866, + "learning_rate": 1.579205429204215e-05, + "loss": 1.9792, + "mean_token_accuracy": 0.5576300621032715, + "num_tokens": 3230352249.0, + "step": 6318 + }, + { + "epoch": 1.708761492698756, + "grad_norm": 1.5449975728988647, + "learning_rate": 1.5790719781656406e-05, + "loss": 2.029, + "mean_token_accuracy": 0.5343859195709229, + "num_tokens": 3230876325.0, + "step": 6319 + }, + { + "epoch": 1.7090319091400756, + "grad_norm": 1.3816248178482056, + "learning_rate": 1.5789385124277285e-05, + "loss": 1.9967, + "mean_token_accuracy": 0.5416587591171265, + "num_tokens": 3231400424.0, + "step": 6320 + }, + { + "epoch": 1.7093023255813953, + "grad_norm": 0.7674508690834045, + "learning_rate": 1.5788050319945746e-05, + "loss": 1.136, + "mean_token_accuracy": 0.6962451338768005, + "num_tokens": 3231924646.0, + "step": 6321 + }, + { + "epoch": 1.709572742022715, + "grad_norm": 2.2009646892547607, + "learning_rate": 1.5786715368702746e-05, + "loss": 1.9676, + "mean_token_accuracy": 0.564082145690918, + "num_tokens": 3232448879.0, + "step": 6322 + }, + { + "epoch": 1.7098431584640346, + "grad_norm": 1.7398438453674316, + "learning_rate": 1.578538027058924e-05, + "loss": 2.0494, + "mean_token_accuracy": 0.5376160144805908, + "num_tokens": 3232972972.0, + "step": 6323 + }, + { + "epoch": 1.7101135749053542, + "grad_norm": 1.211202621459961, + "learning_rate": 1.5784045025646192e-05, + "loss": 1.8959, + "mean_token_accuracy": 0.5603775978088379, + "num_tokens": 3233497119.0, + "step": 6324 + }, + { + "epoch": 1.7103839913466738, + "grad_norm": 1.581957459449768, + "learning_rate": 1.5782709633914574e-05, + "loss": 1.9641, + "mean_token_accuracy": 0.5608693361282349, + "num_tokens": 3234021288.0, + "step": 6325 + }, + { + "epoch": 1.7106544077879935, + "grad_norm": 1.3077938556671143, + "learning_rate": 1.578137409543536e-05, + "loss": 2.009, + "mean_token_accuracy": 0.5384908318519592, + "num_tokens": 3234493159.0, + "step": 6326 + }, + { + "epoch": 1.7109248242293131, + "grad_norm": 1.4172204732894897, + "learning_rate": 1.578003841024953e-05, + "loss": 1.842, + "mean_token_accuracy": 0.5442603230476379, + "num_tokens": 3234972381.0, + "step": 6327 + }, + { + "epoch": 1.7111952406706328, + "grad_norm": 1.384148359298706, + "learning_rate": 1.5778702578398065e-05, + "loss": 2.0352, + "mean_token_accuracy": 0.5431911945343018, + "num_tokens": 3235496603.0, + "step": 6328 + }, + { + "epoch": 1.7114656571119524, + "grad_norm": 1.3989214897155762, + "learning_rate": 1.5777366599921947e-05, + "loss": 2.0079, + "mean_token_accuracy": 0.5488949418067932, + "num_tokens": 3236020772.0, + "step": 6329 + }, + { + "epoch": 1.711736073553272, + "grad_norm": 1.4339755773544312, + "learning_rate": 1.5776030474862172e-05, + "loss": 1.9419, + "mean_token_accuracy": 0.5776782035827637, + "num_tokens": 3236489753.0, + "step": 6330 + }, + { + "epoch": 1.7120064899945917, + "grad_norm": 1.2661573886871338, + "learning_rate": 1.5774694203259742e-05, + "loss": 1.9436, + "mean_token_accuracy": 0.5547840595245361, + "num_tokens": 3237013993.0, + "step": 6331 + }, + { + "epoch": 1.7122769064359114, + "grad_norm": 1.2953033447265625, + "learning_rate": 1.577335778515565e-05, + "loss": 1.9486, + "mean_token_accuracy": 0.5603296160697937, + "num_tokens": 3237538211.0, + "step": 6332 + }, + { + "epoch": 1.712547322877231, + "grad_norm": 1.4476077556610107, + "learning_rate": 1.57720212205909e-05, + "loss": 2.0897, + "mean_token_accuracy": 0.5393206477165222, + "num_tokens": 3238062480.0, + "step": 6333 + }, + { + "epoch": 1.7128177393185506, + "grad_norm": 1.6803784370422363, + "learning_rate": 1.577068450960651e-05, + "loss": 2.0019, + "mean_token_accuracy": 0.568048357963562, + "num_tokens": 3238586725.0, + "step": 6334 + }, + { + "epoch": 1.7130881557598703, + "grad_norm": 1.3084625005722046, + "learning_rate": 1.576934765224349e-05, + "loss": 2.0161, + "mean_token_accuracy": 0.5554764270782471, + "num_tokens": 3239081600.0, + "step": 6335 + }, + { + "epoch": 1.71335857220119, + "grad_norm": 1.3508195877075195, + "learning_rate": 1.5768010648542854e-05, + "loss": 1.9872, + "mean_token_accuracy": 0.5574318170547485, + "num_tokens": 3239542029.0, + "step": 6336 + }, + { + "epoch": 1.7136289886425096, + "grad_norm": 1.704033374786377, + "learning_rate": 1.5766673498545634e-05, + "loss": 1.992, + "mean_token_accuracy": 0.5645766258239746, + "num_tokens": 3240008585.0, + "step": 6337 + }, + { + "epoch": 1.7138994050838292, + "grad_norm": 1.4123233556747437, + "learning_rate": 1.5765336202292852e-05, + "loss": 1.9427, + "mean_token_accuracy": 0.5624228715896606, + "num_tokens": 3240501507.0, + "step": 6338 + }, + { + "epoch": 1.7141698215251489, + "grad_norm": 1.107218861579895, + "learning_rate": 1.576399875982554e-05, + "loss": 2.0597, + "mean_token_accuracy": 0.5398229956626892, + "num_tokens": 3241025740.0, + "step": 6339 + }, + { + "epoch": 1.7144402379664685, + "grad_norm": 1.75760018825531, + "learning_rate": 1.5762661171184736e-05, + "loss": 2.009, + "mean_token_accuracy": 0.545873761177063, + "num_tokens": 3241549910.0, + "step": 6340 + }, + { + "epoch": 1.7147106544077881, + "grad_norm": 0.582858145236969, + "learning_rate": 1.5761323436411484e-05, + "loss": 1.1688, + "mean_token_accuracy": 0.6863195896148682, + "num_tokens": 3242074181.0, + "step": 6341 + }, + { + "epoch": 1.7149810708491078, + "grad_norm": 1.5184937715530396, + "learning_rate": 1.5759985555546823e-05, + "loss": 1.9591, + "mean_token_accuracy": 0.5510284900665283, + "num_tokens": 3242598324.0, + "step": 6342 + }, + { + "epoch": 1.7152514872904274, + "grad_norm": 1.1246311664581299, + "learning_rate": 1.5758647528631806e-05, + "loss": 1.9543, + "mean_token_accuracy": 0.5519868731498718, + "num_tokens": 3243098589.0, + "step": 6343 + }, + { + "epoch": 1.7155219037317468, + "grad_norm": 1.3157604932785034, + "learning_rate": 1.5757309355707492e-05, + "loss": 1.9173, + "mean_token_accuracy": 0.5677893757820129, + "num_tokens": 3243622870.0, + "step": 6344 + }, + { + "epoch": 1.7157923201730665, + "grad_norm": 1.2405955791473389, + "learning_rate": 1.5755971036814935e-05, + "loss": 1.8644, + "mean_token_accuracy": 0.5947370529174805, + "num_tokens": 3244147064.0, + "step": 6345 + }, + { + "epoch": 1.7160627366143861, + "grad_norm": 1.4025230407714844, + "learning_rate": 1.5754632571995204e-05, + "loss": 1.9878, + "mean_token_accuracy": 0.5716280937194824, + "num_tokens": 3244611224.0, + "step": 6346 + }, + { + "epoch": 1.7163331530557058, + "grad_norm": 1.5553927421569824, + "learning_rate": 1.575329396128936e-05, + "loss": 2.0565, + "mean_token_accuracy": 0.5445576310157776, + "num_tokens": 3245135424.0, + "step": 6347 + }, + { + "epoch": 1.7166035694970254, + "grad_norm": 1.1726492643356323, + "learning_rate": 1.575195520473848e-05, + "loss": 1.9773, + "mean_token_accuracy": 0.5441596508026123, + "num_tokens": 3245659605.0, + "step": 6348 + }, + { + "epoch": 1.716873985938345, + "grad_norm": 1.3941409587860107, + "learning_rate": 1.5750616302383635e-05, + "loss": 1.8904, + "mean_token_accuracy": 0.5648068785667419, + "num_tokens": 3246183796.0, + "step": 6349 + }, + { + "epoch": 1.7171444023796647, + "grad_norm": 1.3045097589492798, + "learning_rate": 1.5749277254265917e-05, + "loss": 2.126, + "mean_token_accuracy": 0.5333853960037231, + "num_tokens": 3246652175.0, + "step": 6350 + }, + { + "epoch": 1.7174148188209843, + "grad_norm": 1.5792768001556396, + "learning_rate": 1.5747938060426405e-05, + "loss": 1.9429, + "mean_token_accuracy": 0.5623663663864136, + "num_tokens": 3247176402.0, + "step": 6351 + }, + { + "epoch": 1.717685235262304, + "grad_norm": 1.4440128803253174, + "learning_rate": 1.574659872090619e-05, + "loss": 2.0784, + "mean_token_accuracy": 0.5289139747619629, + "num_tokens": 3247700675.0, + "step": 6352 + }, + { + "epoch": 1.7179556517036236, + "grad_norm": 1.3384413719177246, + "learning_rate": 1.5745259235746363e-05, + "loss": 1.9274, + "mean_token_accuracy": 0.5596624612808228, + "num_tokens": 3248224886.0, + "step": 6353 + }, + { + "epoch": 1.718226068144943, + "grad_norm": 1.1025550365447998, + "learning_rate": 1.5743919604988033e-05, + "loss": 1.7912, + "mean_token_accuracy": 0.5852786302566528, + "num_tokens": 3248749069.0, + "step": 6354 + }, + { + "epoch": 1.7184964845862627, + "grad_norm": 1.1715078353881836, + "learning_rate": 1.57425798286723e-05, + "loss": 1.8819, + "mean_token_accuracy": 0.5763522386550903, + "num_tokens": 3249273284.0, + "step": 6355 + }, + { + "epoch": 1.7187669010275823, + "grad_norm": 1.316128134727478, + "learning_rate": 1.5741239906840268e-05, + "loss": 1.9646, + "mean_token_accuracy": 0.5531936883926392, + "num_tokens": 3249797285.0, + "step": 6356 + }, + { + "epoch": 1.719037317468902, + "grad_norm": 1.2189080715179443, + "learning_rate": 1.5739899839533052e-05, + "loss": 1.8981, + "mean_token_accuracy": 0.5562366843223572, + "num_tokens": 3250321407.0, + "step": 6357 + }, + { + "epoch": 1.7193077339102216, + "grad_norm": 2.642972707748413, + "learning_rate": 1.573855962679177e-05, + "loss": 1.9479, + "mean_token_accuracy": 0.5537914037704468, + "num_tokens": 3250845602.0, + "step": 6358 + }, + { + "epoch": 1.7195781503515413, + "grad_norm": 1.8994414806365967, + "learning_rate": 1.5737219268657544e-05, + "loss": 2.0979, + "mean_token_accuracy": 0.5263271927833557, + "num_tokens": 3251369728.0, + "step": 6359 + }, + { + "epoch": 1.719848566792861, + "grad_norm": 1.3057434558868408, + "learning_rate": 1.5735878765171498e-05, + "loss": 2.0096, + "mean_token_accuracy": 0.5391589403152466, + "num_tokens": 3251894005.0, + "step": 6360 + }, + { + "epoch": 1.7201189832341806, + "grad_norm": 0.7810719609260559, + "learning_rate": 1.5734538116374765e-05, + "loss": 1.2176, + "mean_token_accuracy": 0.7035647034645081, + "num_tokens": 3252354680.0, + "step": 6361 + }, + { + "epoch": 1.7203893996755002, + "grad_norm": 3.436035633087158, + "learning_rate": 1.5733197322308483e-05, + "loss": 1.9318, + "mean_token_accuracy": 0.565603494644165, + "num_tokens": 3252866154.0, + "step": 6362 + }, + { + "epoch": 1.7206598161168198, + "grad_norm": 2.9385764598846436, + "learning_rate": 1.5731856383013785e-05, + "loss": 2.0063, + "mean_token_accuracy": 0.5670232176780701, + "num_tokens": 3253338927.0, + "step": 6363 + }, + { + "epoch": 1.7209302325581395, + "grad_norm": 1.5650503635406494, + "learning_rate": 1.5730515298531815e-05, + "loss": 2.0138, + "mean_token_accuracy": 0.5472648739814758, + "num_tokens": 3253863113.0, + "step": 6364 + }, + { + "epoch": 1.7212006489994591, + "grad_norm": 1.7824342250823975, + "learning_rate": 1.5729174068903725e-05, + "loss": 2.0162, + "mean_token_accuracy": 0.552182674407959, + "num_tokens": 3254374161.0, + "step": 6365 + }, + { + "epoch": 1.7214710654407788, + "grad_norm": 2.4868338108062744, + "learning_rate": 1.572783269417067e-05, + "loss": 2.0107, + "mean_token_accuracy": 0.5543878674507141, + "num_tokens": 3254898344.0, + "step": 6366 + }, + { + "epoch": 1.7217414818820984, + "grad_norm": 2.1596882343292236, + "learning_rate": 1.5726491174373802e-05, + "loss": 2.0395, + "mean_token_accuracy": 0.5484452843666077, + "num_tokens": 3255422618.0, + "step": 6367 + }, + { + "epoch": 1.722011898323418, + "grad_norm": 1.743983268737793, + "learning_rate": 1.5725149509554286e-05, + "loss": 1.9784, + "mean_token_accuracy": 0.5527456998825073, + "num_tokens": 3255946823.0, + "step": 6368 + }, + { + "epoch": 1.7222823147647377, + "grad_norm": 2.2216506004333496, + "learning_rate": 1.5723807699753287e-05, + "loss": 1.971, + "mean_token_accuracy": 0.5574718713760376, + "num_tokens": 3256426584.0, + "step": 6369 + }, + { + "epoch": 1.7225527312060573, + "grad_norm": 2.147789478302002, + "learning_rate": 1.5722465745011974e-05, + "loss": 2.0338, + "mean_token_accuracy": 0.5451235771179199, + "num_tokens": 3256929113.0, + "step": 6370 + }, + { + "epoch": 1.722823147647377, + "grad_norm": 1.5088826417922974, + "learning_rate": 1.5721123645371527e-05, + "loss": 1.8121, + "mean_token_accuracy": 0.5726592540740967, + "num_tokens": 3257453289.0, + "step": 6371 + }, + { + "epoch": 1.7230935640886966, + "grad_norm": 1.746704339981079, + "learning_rate": 1.5719781400873124e-05, + "loss": 2.0028, + "mean_token_accuracy": 0.538591742515564, + "num_tokens": 3257977556.0, + "step": 6372 + }, + { + "epoch": 1.7233639805300163, + "grad_norm": 1.4868764877319336, + "learning_rate": 1.5718439011557943e-05, + "loss": 1.9926, + "mean_token_accuracy": 0.5620369911193848, + "num_tokens": 3258463459.0, + "step": 6373 + }, + { + "epoch": 1.723634396971336, + "grad_norm": 1.3690240383148193, + "learning_rate": 1.5717096477467178e-05, + "loss": 1.9635, + "mean_token_accuracy": 0.5489159226417542, + "num_tokens": 3258987643.0, + "step": 6374 + }, + { + "epoch": 1.7239048134126556, + "grad_norm": 1.693527102470398, + "learning_rate": 1.5715753798642024e-05, + "loss": 1.9641, + "mean_token_accuracy": 0.5247623920440674, + "num_tokens": 3259511888.0, + "step": 6375 + }, + { + "epoch": 1.7241752298539752, + "grad_norm": 1.3221298456192017, + "learning_rate": 1.5714410975123672e-05, + "loss": 1.8185, + "mean_token_accuracy": 0.5884228944778442, + "num_tokens": 3260036149.0, + "step": 6376 + }, + { + "epoch": 1.7244456462952948, + "grad_norm": 1.1980012655258179, + "learning_rate": 1.571306800695333e-05, + "loss": 1.9761, + "mean_token_accuracy": 0.5621141791343689, + "num_tokens": 3260560383.0, + "step": 6377 + }, + { + "epoch": 1.7247160627366145, + "grad_norm": 1.321631908416748, + "learning_rate": 1.57117248941722e-05, + "loss": 1.9561, + "mean_token_accuracy": 0.5622777938842773, + "num_tokens": 3261084656.0, + "step": 6378 + }, + { + "epoch": 1.7249864791779341, + "grad_norm": 1.1970468759536743, + "learning_rate": 1.5710381636821493e-05, + "loss": 1.9586, + "mean_token_accuracy": 0.5613333582878113, + "num_tokens": 3261608934.0, + "step": 6379 + }, + { + "epoch": 1.7252568956192538, + "grad_norm": 1.25307035446167, + "learning_rate": 1.570903823494243e-05, + "loss": 2.0376, + "mean_token_accuracy": 0.542313814163208, + "num_tokens": 3262133057.0, + "step": 6380 + }, + { + "epoch": 1.7255273120605734, + "grad_norm": 0.5163004994392395, + "learning_rate": 1.5707694688576223e-05, + "loss": 1.15, + "mean_token_accuracy": 0.7050364017486572, + "num_tokens": 3262612063.0, + "step": 6381 + }, + { + "epoch": 1.725797728501893, + "grad_norm": 2.1319756507873535, + "learning_rate": 1.5706350997764092e-05, + "loss": 1.9472, + "mean_token_accuracy": 0.5614305138587952, + "num_tokens": 3263136227.0, + "step": 6382 + }, + { + "epoch": 1.7260681449432127, + "grad_norm": 1.561126470565796, + "learning_rate": 1.5705007162547278e-05, + "loss": 1.9292, + "mean_token_accuracy": 0.5642125606536865, + "num_tokens": 3263603359.0, + "step": 6383 + }, + { + "epoch": 1.7263385613845323, + "grad_norm": 1.4345462322235107, + "learning_rate": 1.570366318296701e-05, + "loss": 2.0634, + "mean_token_accuracy": 0.5499247312545776, + "num_tokens": 3264127565.0, + "step": 6384 + }, + { + "epoch": 1.7266089778258518, + "grad_norm": 1.3129912614822388, + "learning_rate": 1.5702319059064518e-05, + "loss": 1.8745, + "mean_token_accuracy": 0.5638939142227173, + "num_tokens": 3264602400.0, + "step": 6385 + }, + { + "epoch": 1.7268793942671714, + "grad_norm": 1.8240292072296143, + "learning_rate": 1.5700974790881053e-05, + "loss": 1.9335, + "mean_token_accuracy": 0.581411600112915, + "num_tokens": 3265053135.0, + "step": 6386 + }, + { + "epoch": 1.727149810708491, + "grad_norm": 1.2549784183502197, + "learning_rate": 1.569963037845785e-05, + "loss": 1.9792, + "mean_token_accuracy": 0.5586139559745789, + "num_tokens": 3265556146.0, + "step": 6387 + }, + { + "epoch": 1.7274202271498107, + "grad_norm": 1.18318510055542, + "learning_rate": 1.5698285821836174e-05, + "loss": 1.929, + "mean_token_accuracy": 0.5500830411911011, + "num_tokens": 3266080396.0, + "step": 6388 + }, + { + "epoch": 1.7276906435911303, + "grad_norm": 1.6120491027832031, + "learning_rate": 1.569694112105727e-05, + "loss": 1.953, + "mean_token_accuracy": 0.5518693923950195, + "num_tokens": 3266586447.0, + "step": 6389 + }, + { + "epoch": 1.72796106003245, + "grad_norm": 1.5410523414611816, + "learning_rate": 1.5695596276162392e-05, + "loss": 2.0022, + "mean_token_accuracy": 0.556137204170227, + "num_tokens": 3267110687.0, + "step": 6390 + }, + { + "epoch": 1.7282314764737696, + "grad_norm": 1.3818278312683105, + "learning_rate": 1.5694251287192822e-05, + "loss": 1.9668, + "mean_token_accuracy": 0.5682992339134216, + "num_tokens": 3267628779.0, + "step": 6391 + }, + { + "epoch": 1.7285018929150893, + "grad_norm": 1.556148648262024, + "learning_rate": 1.5692906154189812e-05, + "loss": 1.9653, + "mean_token_accuracy": 0.5606337785720825, + "num_tokens": 3268153054.0, + "step": 6392 + }, + { + "epoch": 1.728772309356409, + "grad_norm": 1.47574782371521, + "learning_rate": 1.5691560877194643e-05, + "loss": 1.888, + "mean_token_accuracy": 0.600426197052002, + "num_tokens": 3268613855.0, + "step": 6393 + }, + { + "epoch": 1.7290427257977286, + "grad_norm": 1.398276686668396, + "learning_rate": 1.569021545624859e-05, + "loss": 1.8657, + "mean_token_accuracy": 0.5742506980895996, + "num_tokens": 3269138107.0, + "step": 6394 + }, + { + "epoch": 1.729313142239048, + "grad_norm": 1.2967034578323364, + "learning_rate": 1.568886989139293e-05, + "loss": 1.961, + "mean_token_accuracy": 0.5373872518539429, + "num_tokens": 3269627720.0, + "step": 6395 + }, + { + "epoch": 1.7295835586803676, + "grad_norm": 2.4192261695861816, + "learning_rate": 1.5687524182668954e-05, + "loss": 1.5134, + "mean_token_accuracy": 0.6347968578338623, + "num_tokens": 3270073923.0, + "step": 6396 + }, + { + "epoch": 1.7298539751216873, + "grad_norm": 1.5745162963867188, + "learning_rate": 1.568617833011795e-05, + "loss": 1.862, + "mean_token_accuracy": 0.5722395181655884, + "num_tokens": 3270598087.0, + "step": 6397 + }, + { + "epoch": 1.730124391563007, + "grad_norm": 1.572127342224121, + "learning_rate": 1.5684832333781217e-05, + "loss": 2.14, + "mean_token_accuracy": 0.5323539972305298, + "num_tokens": 3271122211.0, + "step": 6398 + }, + { + "epoch": 1.7303948080043265, + "grad_norm": 1.7269225120544434, + "learning_rate": 1.5683486193700048e-05, + "loss": 1.8762, + "mean_token_accuracy": 0.5798466205596924, + "num_tokens": 3271611635.0, + "step": 6399 + }, + { + "epoch": 1.7306652244456462, + "grad_norm": 1.633575439453125, + "learning_rate": 1.568213990991575e-05, + "loss": 2.0795, + "mean_token_accuracy": 0.5357394218444824, + "num_tokens": 3272135914.0, + "step": 6400 + }, + { + "epoch": 1.7309356408869658, + "grad_norm": 0.5261335968971252, + "learning_rate": 1.5680793482469633e-05, + "loss": 1.0384, + "mean_token_accuracy": 0.7323030829429626, + "num_tokens": 3272660041.0, + "step": 6401 + }, + { + "epoch": 1.7312060573282855, + "grad_norm": 2.1403188705444336, + "learning_rate": 1.5679446911403006e-05, + "loss": 2.053, + "mean_token_accuracy": 0.5358263254165649, + "num_tokens": 3273184291.0, + "step": 6402 + }, + { + "epoch": 1.7314764737696051, + "grad_norm": 1.4222058057785034, + "learning_rate": 1.567810019675718e-05, + "loss": 1.9833, + "mean_token_accuracy": 0.5618778467178345, + "num_tokens": 3273670610.0, + "step": 6403 + }, + { + "epoch": 1.7317468902109248, + "grad_norm": 1.2635880708694458, + "learning_rate": 1.5676753338573486e-05, + "loss": 1.8128, + "mean_token_accuracy": 0.5767334699630737, + "num_tokens": 3274194762.0, + "step": 6404 + }, + { + "epoch": 1.7320173066522444, + "grad_norm": 1.7040270566940308, + "learning_rate": 1.567540633689325e-05, + "loss": 1.9825, + "mean_token_accuracy": 0.5533818602561951, + "num_tokens": 3274718992.0, + "step": 6405 + }, + { + "epoch": 1.732287723093564, + "grad_norm": 1.5024372339248657, + "learning_rate": 1.5674059191757794e-05, + "loss": 2.095, + "mean_token_accuracy": 0.5339517593383789, + "num_tokens": 3275243214.0, + "step": 6406 + }, + { + "epoch": 1.7325581395348837, + "grad_norm": 1.6954460144042969, + "learning_rate": 1.5672711903208457e-05, + "loss": 1.9186, + "mean_token_accuracy": 0.5684682130813599, + "num_tokens": 3275735558.0, + "step": 6407 + }, + { + "epoch": 1.7328285559762033, + "grad_norm": 1.6818667650222778, + "learning_rate": 1.567136447128658e-05, + "loss": 2.0158, + "mean_token_accuracy": 0.5588075518608093, + "num_tokens": 3276198655.0, + "step": 6408 + }, + { + "epoch": 1.733098972417523, + "grad_norm": 1.1557856798171997, + "learning_rate": 1.56700168960335e-05, + "loss": 1.8685, + "mean_token_accuracy": 0.5858684778213501, + "num_tokens": 3276722799.0, + "step": 6409 + }, + { + "epoch": 1.7333693888588426, + "grad_norm": 1.4935146570205688, + "learning_rate": 1.566866917749057e-05, + "loss": 1.9569, + "mean_token_accuracy": 0.5560110807418823, + "num_tokens": 3277213117.0, + "step": 6410 + }, + { + "epoch": 1.7336398053001623, + "grad_norm": 1.5002349615097046, + "learning_rate": 1.566732131569914e-05, + "loss": 1.9087, + "mean_token_accuracy": 0.5776665210723877, + "num_tokens": 3277720150.0, + "step": 6411 + }, + { + "epoch": 1.733910221741482, + "grad_norm": 1.3193029165267944, + "learning_rate": 1.5665973310700564e-05, + "loss": 1.9176, + "mean_token_accuracy": 0.5761339664459229, + "num_tokens": 3278244337.0, + "step": 6412 + }, + { + "epoch": 1.7341806381828015, + "grad_norm": 1.4085596799850464, + "learning_rate": 1.5664625162536208e-05, + "loss": 1.9021, + "mean_token_accuracy": 0.5303351879119873, + "num_tokens": 3278768507.0, + "step": 6413 + }, + { + "epoch": 1.7344510546241212, + "grad_norm": 1.221273422241211, + "learning_rate": 1.5663276871247435e-05, + "loss": 1.9149, + "mean_token_accuracy": 0.5461471676826477, + "num_tokens": 3279292656.0, + "step": 6414 + }, + { + "epoch": 1.7347214710654408, + "grad_norm": 1.45768404006958, + "learning_rate": 1.566192843687561e-05, + "loss": 1.9592, + "mean_token_accuracy": 0.568801999092102, + "num_tokens": 3279730527.0, + "step": 6415 + }, + { + "epoch": 1.7349918875067605, + "grad_norm": 1.502031683921814, + "learning_rate": 1.5660579859462112e-05, + "loss": 1.9362, + "mean_token_accuracy": 0.5482885837554932, + "num_tokens": 3280254720.0, + "step": 6416 + }, + { + "epoch": 1.7352623039480801, + "grad_norm": 1.6978462934494019, + "learning_rate": 1.5659231139048318e-05, + "loss": 1.9406, + "mean_token_accuracy": 0.5686721801757812, + "num_tokens": 3280747321.0, + "step": 6417 + }, + { + "epoch": 1.7355327203893998, + "grad_norm": 1.3123273849487305, + "learning_rate": 1.5657882275675613e-05, + "loss": 1.9574, + "mean_token_accuracy": 0.5636453032493591, + "num_tokens": 3281169363.0, + "step": 6418 + }, + { + "epoch": 1.7358031368307194, + "grad_norm": 1.2142969369888306, + "learning_rate": 1.565653326938538e-05, + "loss": 1.9879, + "mean_token_accuracy": 0.5563242435455322, + "num_tokens": 3281693497.0, + "step": 6419 + }, + { + "epoch": 1.736073553272039, + "grad_norm": 1.4475159645080566, + "learning_rate": 1.5655184120219015e-05, + "loss": 2.0548, + "mean_token_accuracy": 0.5334843397140503, + "num_tokens": 3282217760.0, + "step": 6420 + }, + { + "epoch": 1.7363439697133587, + "grad_norm": 0.676527738571167, + "learning_rate": 1.5653834828217905e-05, + "loss": 1.1727, + "mean_token_accuracy": 0.6802886128425598, + "num_tokens": 3282732144.0, + "step": 6421 + }, + { + "epoch": 1.7366143861546783, + "grad_norm": 1.454742670059204, + "learning_rate": 1.565248539342346e-05, + "loss": 2.0093, + "mean_token_accuracy": 0.5244318246841431, + "num_tokens": 3283256276.0, + "step": 6422 + }, + { + "epoch": 1.736884802595998, + "grad_norm": 1.5612839460372925, + "learning_rate": 1.5651135815877082e-05, + "loss": 1.9463, + "mean_token_accuracy": 0.5561865568161011, + "num_tokens": 3283780560.0, + "step": 6423 + }, + { + "epoch": 1.7371552190373176, + "grad_norm": 1.2990542650222778, + "learning_rate": 1.564978609562018e-05, + "loss": 1.8416, + "mean_token_accuracy": 0.5749425888061523, + "num_tokens": 3284240578.0, + "step": 6424 + }, + { + "epoch": 1.7374256354786373, + "grad_norm": 1.3389811515808105, + "learning_rate": 1.5648436232694163e-05, + "loss": 1.9324, + "mean_token_accuracy": 0.5447794795036316, + "num_tokens": 3284764830.0, + "step": 6425 + }, + { + "epoch": 1.7376960519199567, + "grad_norm": 1.567559838294983, + "learning_rate": 1.5647086227140456e-05, + "loss": 1.9427, + "mean_token_accuracy": 0.5493459105491638, + "num_tokens": 3285288934.0, + "step": 6426 + }, + { + "epoch": 1.7379664683612763, + "grad_norm": 1.2807364463806152, + "learning_rate": 1.5645736079000476e-05, + "loss": 1.9531, + "mean_token_accuracy": 0.5590131282806396, + "num_tokens": 3285762598.0, + "step": 6427 + }, + { + "epoch": 1.738236884802596, + "grad_norm": 1.8768174648284912, + "learning_rate": 1.564438578831565e-05, + "loss": 2.1746, + "mean_token_accuracy": 0.525440514087677, + "num_tokens": 3286286840.0, + "step": 6428 + }, + { + "epoch": 1.7385073012439156, + "grad_norm": 1.2251265048980713, + "learning_rate": 1.564303535512741e-05, + "loss": 2.0435, + "mean_token_accuracy": 0.5547465085983276, + "num_tokens": 3286784492.0, + "step": 6429 + }, + { + "epoch": 1.7387777176852353, + "grad_norm": 1.571946144104004, + "learning_rate": 1.5641684779477193e-05, + "loss": 2.0009, + "mean_token_accuracy": 0.5226592421531677, + "num_tokens": 3287308618.0, + "step": 6430 + }, + { + "epoch": 1.739048134126555, + "grad_norm": 1.7358571290969849, + "learning_rate": 1.5640334061406432e-05, + "loss": 1.9645, + "mean_token_accuracy": 0.5634129643440247, + "num_tokens": 3287832787.0, + "step": 6431 + }, + { + "epoch": 1.7393185505678745, + "grad_norm": 1.2208764553070068, + "learning_rate": 1.5638983200956575e-05, + "loss": 1.9888, + "mean_token_accuracy": 0.555168628692627, + "num_tokens": 3288357009.0, + "step": 6432 + }, + { + "epoch": 1.7395889670091942, + "grad_norm": 1.3675035238265991, + "learning_rate": 1.5637632198169076e-05, + "loss": 1.817, + "mean_token_accuracy": 0.6048808693885803, + "num_tokens": 3288881173.0, + "step": 6433 + }, + { + "epoch": 1.7398593834505138, + "grad_norm": 1.7167617082595825, + "learning_rate": 1.5636281053085384e-05, + "loss": 1.9565, + "mean_token_accuracy": 0.551673173904419, + "num_tokens": 3289342212.0, + "step": 6434 + }, + { + "epoch": 1.7401297998918335, + "grad_norm": 1.1852405071258545, + "learning_rate": 1.563492976574695e-05, + "loss": 1.9486, + "mean_token_accuracy": 0.5593696236610413, + "num_tokens": 3289743035.0, + "step": 6435 + }, + { + "epoch": 1.740400216333153, + "grad_norm": 1.3747340440750122, + "learning_rate": 1.5633578336195243e-05, + "loss": 1.9583, + "mean_token_accuracy": 0.5512697100639343, + "num_tokens": 3290267241.0, + "step": 6436 + }, + { + "epoch": 1.7406706327744725, + "grad_norm": 1.8798339366912842, + "learning_rate": 1.5632226764471728e-05, + "loss": 2.0965, + "mean_token_accuracy": 0.5363718271255493, + "num_tokens": 3290785401.0, + "step": 6437 + }, + { + "epoch": 1.7409410492157922, + "grad_norm": 1.3296399116516113, + "learning_rate": 1.563087505061787e-05, + "loss": 1.9427, + "mean_token_accuracy": 0.5507901906967163, + "num_tokens": 3291309638.0, + "step": 6438 + }, + { + "epoch": 1.7412114656571118, + "grad_norm": 1.418809175491333, + "learning_rate": 1.5629523194675147e-05, + "loss": 1.9322, + "mean_token_accuracy": 0.5681753158569336, + "num_tokens": 3291833884.0, + "step": 6439 + }, + { + "epoch": 1.7414818820984315, + "grad_norm": 1.429203748703003, + "learning_rate": 1.562817119668504e-05, + "loss": 1.8665, + "mean_token_accuracy": 0.5670656561851501, + "num_tokens": 3292358157.0, + "step": 6440 + }, + { + "epoch": 1.741752298539751, + "grad_norm": 0.5890718698501587, + "learning_rate": 1.5626819056689028e-05, + "loss": 1.1816, + "mean_token_accuracy": 0.686111330986023, + "num_tokens": 3292882423.0, + "step": 6441 + }, + { + "epoch": 1.7420227149810708, + "grad_norm": 1.9541527032852173, + "learning_rate": 1.56254667747286e-05, + "loss": 2.0089, + "mean_token_accuracy": 0.5372627973556519, + "num_tokens": 3293406561.0, + "step": 6442 + }, + { + "epoch": 1.7422931314223904, + "grad_norm": 1.9279752969741821, + "learning_rate": 1.562411435084525e-05, + "loss": 2.0485, + "mean_token_accuracy": 0.5419010519981384, + "num_tokens": 3293868785.0, + "step": 6443 + }, + { + "epoch": 1.74256354786371, + "grad_norm": 1.1435045003890991, + "learning_rate": 1.562276178508048e-05, + "loss": 1.9337, + "mean_token_accuracy": 0.55735182762146, + "num_tokens": 3294392959.0, + "step": 6444 + }, + { + "epoch": 1.7428339643050297, + "grad_norm": 1.483402132987976, + "learning_rate": 1.5621409077475773e-05, + "loss": 1.9556, + "mean_token_accuracy": 0.5400183200836182, + "num_tokens": 3294917216.0, + "step": 6445 + }, + { + "epoch": 1.7431043807463493, + "grad_norm": 1.4518214464187622, + "learning_rate": 1.562005622807265e-05, + "loss": 2.0225, + "mean_token_accuracy": 0.5365055203437805, + "num_tokens": 3295441490.0, + "step": 6446 + }, + { + "epoch": 1.743374797187669, + "grad_norm": 1.387169599533081, + "learning_rate": 1.5618703236912613e-05, + "loss": 2.0095, + "mean_token_accuracy": 0.5462402105331421, + "num_tokens": 3295965608.0, + "step": 6447 + }, + { + "epoch": 1.7436452136289886, + "grad_norm": 1.500577688217163, + "learning_rate": 1.5617350104037178e-05, + "loss": 1.9363, + "mean_token_accuracy": 0.5676818490028381, + "num_tokens": 3296471451.0, + "step": 6448 + }, + { + "epoch": 1.7439156300703083, + "grad_norm": 1.3076895475387573, + "learning_rate": 1.5615996829487866e-05, + "loss": 1.9278, + "mean_token_accuracy": 0.565207302570343, + "num_tokens": 3296964378.0, + "step": 6449 + }, + { + "epoch": 1.744186046511628, + "grad_norm": 1.7929847240447998, + "learning_rate": 1.5614643413306195e-05, + "loss": 1.974, + "mean_token_accuracy": 0.5650287866592407, + "num_tokens": 3297488565.0, + "step": 6450 + }, + { + "epoch": 1.7444564629529475, + "grad_norm": 1.4403389692306519, + "learning_rate": 1.5613289855533694e-05, + "loss": 2.0508, + "mean_token_accuracy": 0.5564384460449219, + "num_tokens": 3298012820.0, + "step": 6451 + }, + { + "epoch": 1.7447268793942672, + "grad_norm": 1.2496627569198608, + "learning_rate": 1.561193615621189e-05, + "loss": 1.7205, + "mean_token_accuracy": 0.5816426277160645, + "num_tokens": 3298536998.0, + "step": 6452 + }, + { + "epoch": 1.7449972958355868, + "grad_norm": 1.9327991008758545, + "learning_rate": 1.5610582315382323e-05, + "loss": 1.9397, + "mean_token_accuracy": 0.5641197562217712, + "num_tokens": 3299044884.0, + "step": 6453 + }, + { + "epoch": 1.7452677122769065, + "grad_norm": 1.5002256631851196, + "learning_rate": 1.5609228333086534e-05, + "loss": 2.0258, + "mean_token_accuracy": 0.5386658310890198, + "num_tokens": 3299569158.0, + "step": 6454 + }, + { + "epoch": 1.7455381287182261, + "grad_norm": 1.3392760753631592, + "learning_rate": 1.5607874209366066e-05, + "loss": 1.9801, + "mean_token_accuracy": 0.5537419319152832, + "num_tokens": 3300093328.0, + "step": 6455 + }, + { + "epoch": 1.7458085451595458, + "grad_norm": 1.7697378396987915, + "learning_rate": 1.5606519944262462e-05, + "loss": 2.0379, + "mean_token_accuracy": 0.5457826852798462, + "num_tokens": 3300617496.0, + "step": 6456 + }, + { + "epoch": 1.7460789616008654, + "grad_norm": 1.325001835823059, + "learning_rate": 1.5605165537817283e-05, + "loss": 1.9993, + "mean_token_accuracy": 0.5415347814559937, + "num_tokens": 3301141774.0, + "step": 6457 + }, + { + "epoch": 1.746349378042185, + "grad_norm": 1.594943881034851, + "learning_rate": 1.560381099007208e-05, + "loss": 2.1428, + "mean_token_accuracy": 0.5382645130157471, + "num_tokens": 3301652109.0, + "step": 6458 + }, + { + "epoch": 1.7466197944835047, + "grad_norm": 1.2952847480773926, + "learning_rate": 1.560245630106842e-05, + "loss": 1.8711, + "mean_token_accuracy": 0.5765931606292725, + "num_tokens": 3302176235.0, + "step": 6459 + }, + { + "epoch": 1.7468902109248243, + "grad_norm": 1.2165546417236328, + "learning_rate": 1.5601101470847863e-05, + "loss": 1.9372, + "mean_token_accuracy": 0.563865065574646, + "num_tokens": 3302700389.0, + "step": 6460 + }, + { + "epoch": 1.747160627366144, + "grad_norm": 0.7337183952331543, + "learning_rate": 1.5599746499451983e-05, + "loss": 1.1511, + "mean_token_accuracy": 0.6977883577346802, + "num_tokens": 3303224640.0, + "step": 6461 + }, + { + "epoch": 1.7474310438074636, + "grad_norm": 1.961845874786377, + "learning_rate": 1.559839138692235e-05, + "loss": 2.0058, + "mean_token_accuracy": 0.5386135578155518, + "num_tokens": 3303748893.0, + "step": 6462 + }, + { + "epoch": 1.7477014602487833, + "grad_norm": 1.5986065864562988, + "learning_rate": 1.5597036133300555e-05, + "loss": 1.9794, + "mean_token_accuracy": 0.5472148656845093, + "num_tokens": 3304273005.0, + "step": 6463 + }, + { + "epoch": 1.747971876690103, + "grad_norm": 1.2152351140975952, + "learning_rate": 1.5595680738628162e-05, + "loss": 1.905, + "mean_token_accuracy": 0.5757460594177246, + "num_tokens": 3304797212.0, + "step": 6464 + }, + { + "epoch": 1.7482422931314225, + "grad_norm": 1.8483792543411255, + "learning_rate": 1.5594325202946776e-05, + "loss": 2.0869, + "mean_token_accuracy": 0.5396796464920044, + "num_tokens": 3305321424.0, + "step": 6465 + }, + { + "epoch": 1.7485127095727422, + "grad_norm": 1.596883773803711, + "learning_rate": 1.5592969526297982e-05, + "loss": 2.0422, + "mean_token_accuracy": 0.5369826555252075, + "num_tokens": 3305845696.0, + "step": 6466 + }, + { + "epoch": 1.7487831260140616, + "grad_norm": 1.5046577453613281, + "learning_rate": 1.559161370872337e-05, + "loss": 1.947, + "mean_token_accuracy": 0.5466406345367432, + "num_tokens": 3306369974.0, + "step": 6467 + }, + { + "epoch": 1.7490535424553812, + "grad_norm": 2.312262773513794, + "learning_rate": 1.5590257750264547e-05, + "loss": 2.0196, + "mean_token_accuracy": 0.5729365944862366, + "num_tokens": 3306839959.0, + "step": 6468 + }, + { + "epoch": 1.749323958896701, + "grad_norm": 1.6749731302261353, + "learning_rate": 1.558890165096312e-05, + "loss": 2.0837, + "mean_token_accuracy": 0.548126220703125, + "num_tokens": 3307349934.0, + "step": 6469 + }, + { + "epoch": 1.7495943753380205, + "grad_norm": 1.775805950164795, + "learning_rate": 1.55875454108607e-05, + "loss": 2.0583, + "mean_token_accuracy": 0.5644516348838806, + "num_tokens": 3307811914.0, + "step": 6470 + }, + { + "epoch": 1.7498647917793402, + "grad_norm": 1.3352607488632202, + "learning_rate": 1.5586189029998888e-05, + "loss": 1.9843, + "mean_token_accuracy": 0.5318628549575806, + "num_tokens": 3308336150.0, + "step": 6471 + }, + { + "epoch": 1.7501352082206598, + "grad_norm": 1.4806257486343384, + "learning_rate": 1.5584832508419314e-05, + "loss": 1.8824, + "mean_token_accuracy": 0.5672562718391418, + "num_tokens": 3308800194.0, + "step": 6472 + }, + { + "epoch": 1.7504056246619795, + "grad_norm": 1.7548315525054932, + "learning_rate": 1.5583475846163592e-05, + "loss": 2.0361, + "mean_token_accuracy": 0.5576506853103638, + "num_tokens": 3309324317.0, + "step": 6473 + }, + { + "epoch": 1.750676041103299, + "grad_norm": 1.5841667652130127, + "learning_rate": 1.5582119043273354e-05, + "loss": 2.018, + "mean_token_accuracy": 0.5363650918006897, + "num_tokens": 3309848601.0, + "step": 6474 + }, + { + "epoch": 1.7509464575446188, + "grad_norm": 1.4064515829086304, + "learning_rate": 1.5580762099790226e-05, + "loss": 1.9002, + "mean_token_accuracy": 0.5611043572425842, + "num_tokens": 3310307675.0, + "step": 6475 + }, + { + "epoch": 1.7512168739859384, + "grad_norm": 1.3575297594070435, + "learning_rate": 1.5579405015755852e-05, + "loss": 1.9357, + "mean_token_accuracy": 0.6040538549423218, + "num_tokens": 3310766833.0, + "step": 6476 + }, + { + "epoch": 1.7514872904272578, + "grad_norm": 1.3951971530914307, + "learning_rate": 1.557804779121186e-05, + "loss": 1.9834, + "mean_token_accuracy": 0.5543187856674194, + "num_tokens": 3311291071.0, + "step": 6477 + }, + { + "epoch": 1.7517577068685775, + "grad_norm": 1.3203521966934204, + "learning_rate": 1.55766904261999e-05, + "loss": 1.8909, + "mean_token_accuracy": 0.5595167875289917, + "num_tokens": 3311815334.0, + "step": 6478 + }, + { + "epoch": 1.752028123309897, + "grad_norm": 1.3753103017807007, + "learning_rate": 1.5575332920761616e-05, + "loss": 1.9388, + "mean_token_accuracy": 0.5729598999023438, + "num_tokens": 3312298015.0, + "step": 6479 + }, + { + "epoch": 1.7522985397512167, + "grad_norm": 1.2770529985427856, + "learning_rate": 1.557397527493867e-05, + "loss": 2.0441, + "mean_token_accuracy": 0.5541958808898926, + "num_tokens": 3312792023.0, + "step": 6480 + }, + { + "epoch": 1.7525689561925364, + "grad_norm": 0.7374402284622192, + "learning_rate": 1.5572617488772702e-05, + "loss": 1.1345, + "mean_token_accuracy": 0.6972100734710693, + "num_tokens": 3313316222.0, + "step": 6481 + }, + { + "epoch": 1.752839372633856, + "grad_norm": 2.06199049949646, + "learning_rate": 1.5571259562305386e-05, + "loss": 2.0059, + "mean_token_accuracy": 0.54673171043396, + "num_tokens": 3313840314.0, + "step": 6482 + }, + { + "epoch": 1.7531097890751757, + "grad_norm": 1.632053256034851, + "learning_rate": 1.5569901495578385e-05, + "loss": 1.8736, + "mean_token_accuracy": 0.5625672936439514, + "num_tokens": 3314329334.0, + "step": 6483 + }, + { + "epoch": 1.7533802055164953, + "grad_norm": 1.3848090171813965, + "learning_rate": 1.5568543288633364e-05, + "loss": 1.9755, + "mean_token_accuracy": 0.5275540351867676, + "num_tokens": 3314853523.0, + "step": 6484 + }, + { + "epoch": 1.753650621957815, + "grad_norm": 1.7174828052520752, + "learning_rate": 1.5567184941512e-05, + "loss": 1.8973, + "mean_token_accuracy": 0.5748011469841003, + "num_tokens": 3315319045.0, + "step": 6485 + }, + { + "epoch": 1.7539210383991346, + "grad_norm": 1.8594999313354492, + "learning_rate": 1.5565826454255977e-05, + "loss": 2.0153, + "mean_token_accuracy": 0.5444234013557434, + "num_tokens": 3315843295.0, + "step": 6486 + }, + { + "epoch": 1.7541914548404542, + "grad_norm": 1.5447285175323486, + "learning_rate": 1.5564467826906966e-05, + "loss": 1.8543, + "mean_token_accuracy": 0.5795348882675171, + "num_tokens": 3316367490.0, + "step": 6487 + }, + { + "epoch": 1.7544618712817739, + "grad_norm": 1.6188068389892578, + "learning_rate": 1.556310905950666e-05, + "loss": 1.8793, + "mean_token_accuracy": 0.5670205354690552, + "num_tokens": 3316891763.0, + "step": 6488 + }, + { + "epoch": 1.7547322877230935, + "grad_norm": 1.6574689149856567, + "learning_rate": 1.5561750152096747e-05, + "loss": 2.0951, + "mean_token_accuracy": 0.523160457611084, + "num_tokens": 3317390103.0, + "step": 6489 + }, + { + "epoch": 1.7550027041644132, + "grad_norm": 1.4072811603546143, + "learning_rate": 1.5560391104718927e-05, + "loss": 2.0242, + "mean_token_accuracy": 0.5578269362449646, + "num_tokens": 3317914339.0, + "step": 6490 + }, + { + "epoch": 1.7552731206057328, + "grad_norm": 1.442939043045044, + "learning_rate": 1.555903191741489e-05, + "loss": 1.925, + "mean_token_accuracy": 0.5494909286499023, + "num_tokens": 3318438618.0, + "step": 6491 + }, + { + "epoch": 1.7555435370470525, + "grad_norm": 1.5322257280349731, + "learning_rate": 1.5557672590226353e-05, + "loss": 2.0204, + "mean_token_accuracy": 0.5408425331115723, + "num_tokens": 3318962885.0, + "step": 6492 + }, + { + "epoch": 1.755813953488372, + "grad_norm": 1.26796555519104, + "learning_rate": 1.5556313123195017e-05, + "loss": 2.0318, + "mean_token_accuracy": 0.5485491752624512, + "num_tokens": 3319487136.0, + "step": 6493 + }, + { + "epoch": 1.7560843699296917, + "grad_norm": 1.2514009475708008, + "learning_rate": 1.5554953516362595e-05, + "loss": 1.9955, + "mean_token_accuracy": 0.5844098329544067, + "num_tokens": 3319948882.0, + "step": 6494 + }, + { + "epoch": 1.7563547863710114, + "grad_norm": 1.445040225982666, + "learning_rate": 1.5553593769770804e-05, + "loss": 2.145, + "mean_token_accuracy": 0.5272637009620667, + "num_tokens": 3320473157.0, + "step": 6495 + }, + { + "epoch": 1.756625202812331, + "grad_norm": 1.203208088874817, + "learning_rate": 1.555223388346136e-05, + "loss": 1.7144, + "mean_token_accuracy": 0.6055452823638916, + "num_tokens": 3320973221.0, + "step": 6496 + }, + { + "epoch": 1.7568956192536507, + "grad_norm": 1.074616551399231, + "learning_rate": 1.5550873857476e-05, + "loss": 1.8394, + "mean_token_accuracy": 0.5778530836105347, + "num_tokens": 3321497482.0, + "step": 6497 + }, + { + "epoch": 1.7571660356949703, + "grad_norm": 1.5870106220245361, + "learning_rate": 1.5549513691856443e-05, + "loss": 1.9581, + "mean_token_accuracy": 0.556102991104126, + "num_tokens": 3322021627.0, + "step": 6498 + }, + { + "epoch": 1.75743645213629, + "grad_norm": 1.1181247234344482, + "learning_rate": 1.5548153386644427e-05, + "loss": 1.9706, + "mean_token_accuracy": 0.5477108955383301, + "num_tokens": 3322545754.0, + "step": 6499 + }, + { + "epoch": 1.7577068685776096, + "grad_norm": 1.2774178981781006, + "learning_rate": 1.554679294188169e-05, + "loss": 1.9606, + "mean_token_accuracy": 0.5610719919204712, + "num_tokens": 3323070019.0, + "step": 6500 + }, + { + "epoch": 1.7579772850189292, + "grad_norm": 0.6037439703941345, + "learning_rate": 1.5545432357609978e-05, + "loss": 1.1121, + "mean_token_accuracy": 0.7041168808937073, + "num_tokens": 3323594152.0, + "step": 6501 + }, + { + "epoch": 1.758247701460249, + "grad_norm": 1.602555274963379, + "learning_rate": 1.554407163387103e-05, + "loss": 1.9642, + "mean_token_accuracy": 0.5451672673225403, + "num_tokens": 3324062995.0, + "step": 6502 + }, + { + "epoch": 1.7585181179015685, + "grad_norm": 1.5384047031402588, + "learning_rate": 1.5542710770706602e-05, + "loss": 2.0037, + "mean_token_accuracy": 0.5629828572273254, + "num_tokens": 3324556632.0, + "step": 6503 + }, + { + "epoch": 1.7587885343428882, + "grad_norm": 1.1623530387878418, + "learning_rate": 1.5541349768158446e-05, + "loss": 1.8676, + "mean_token_accuracy": 0.5801160335540771, + "num_tokens": 3325080745.0, + "step": 6504 + }, + { + "epoch": 1.7590589507842078, + "grad_norm": 1.460267424583435, + "learning_rate": 1.553998862626833e-05, + "loss": 1.8694, + "mean_token_accuracy": 0.5798357725143433, + "num_tokens": 3325573697.0, + "step": 6505 + }, + { + "epoch": 1.7593293672255275, + "grad_norm": 1.3201773166656494, + "learning_rate": 1.5538627345078005e-05, + "loss": 1.9395, + "mean_token_accuracy": 0.5597196817398071, + "num_tokens": 3326081812.0, + "step": 6506 + }, + { + "epoch": 1.759599783666847, + "grad_norm": 1.141577959060669, + "learning_rate": 1.5537265924629248e-05, + "loss": 1.9207, + "mean_token_accuracy": 0.559298038482666, + "num_tokens": 3326605921.0, + "step": 6507 + }, + { + "epoch": 1.7598702001081665, + "grad_norm": 1.3943911790847778, + "learning_rate": 1.5535904364963833e-05, + "loss": 1.9853, + "mean_token_accuracy": 0.5583344101905823, + "num_tokens": 3327130178.0, + "step": 6508 + }, + { + "epoch": 1.7601406165494862, + "grad_norm": 1.4088051319122314, + "learning_rate": 1.5534542666123525e-05, + "loss": 2.0017, + "mean_token_accuracy": 0.5508885383605957, + "num_tokens": 3327654233.0, + "step": 6509 + }, + { + "epoch": 1.7604110329908058, + "grad_norm": 1.431142807006836, + "learning_rate": 1.5533180828150117e-05, + "loss": 2.0933, + "mean_token_accuracy": 0.5319037437438965, + "num_tokens": 3328178469.0, + "step": 6510 + }, + { + "epoch": 1.7606814494321255, + "grad_norm": 1.2588168382644653, + "learning_rate": 1.5531818851085395e-05, + "loss": 1.9789, + "mean_token_accuracy": 0.5581114292144775, + "num_tokens": 3328613666.0, + "step": 6511 + }, + { + "epoch": 1.760951865873445, + "grad_norm": 1.289731502532959, + "learning_rate": 1.5530456734971138e-05, + "loss": 1.9884, + "mean_token_accuracy": 0.5463324189186096, + "num_tokens": 3329137940.0, + "step": 6512 + }, + { + "epoch": 1.7612222823147647, + "grad_norm": 1.5078381299972534, + "learning_rate": 1.5529094479849146e-05, + "loss": 2.0196, + "mean_token_accuracy": 0.5413390398025513, + "num_tokens": 3329662191.0, + "step": 6513 + }, + { + "epoch": 1.7614926987560844, + "grad_norm": 1.4119430780410767, + "learning_rate": 1.5527732085761216e-05, + "loss": 1.8793, + "mean_token_accuracy": 0.5577214956283569, + "num_tokens": 3330186213.0, + "step": 6514 + }, + { + "epoch": 1.761763115197404, + "grad_norm": 1.2624003887176514, + "learning_rate": 1.552636955274915e-05, + "loss": 1.9386, + "mean_token_accuracy": 0.5663000345230103, + "num_tokens": 3330710358.0, + "step": 6515 + }, + { + "epoch": 1.7620335316387237, + "grad_norm": 1.3444575071334839, + "learning_rate": 1.5525006880854757e-05, + "loss": 2.0501, + "mean_token_accuracy": 0.5273593664169312, + "num_tokens": 3331234513.0, + "step": 6516 + }, + { + "epoch": 1.7623039480800433, + "grad_norm": 1.402236819267273, + "learning_rate": 1.5523644070119842e-05, + "loss": 1.8917, + "mean_token_accuracy": 0.5680180191993713, + "num_tokens": 3331758752.0, + "step": 6517 + }, + { + "epoch": 1.7625743645213627, + "grad_norm": 10.231820106506348, + "learning_rate": 1.5522281120586226e-05, + "loss": 1.7767, + "mean_token_accuracy": 0.5776467323303223, + "num_tokens": 3332282753.0, + "step": 6518 + }, + { + "epoch": 1.7628447809626824, + "grad_norm": 1.6319209337234497, + "learning_rate": 1.5520918032295728e-05, + "loss": 1.7808, + "mean_token_accuracy": 0.6056102514266968, + "num_tokens": 3332806902.0, + "step": 6519 + }, + { + "epoch": 1.763115197404002, + "grad_norm": 1.6557295322418213, + "learning_rate": 1.5519554805290168e-05, + "loss": 2.0631, + "mean_token_accuracy": 0.5362609624862671, + "num_tokens": 3333331006.0, + "step": 6520 + }, + { + "epoch": 1.7633856138453217, + "grad_norm": 0.7002341747283936, + "learning_rate": 1.5518191439611376e-05, + "loss": 1.2089, + "mean_token_accuracy": 0.6686712503433228, + "num_tokens": 3333855174.0, + "step": 6521 + }, + { + "epoch": 1.7636560302866413, + "grad_norm": 1.1921335458755493, + "learning_rate": 1.5516827935301184e-05, + "loss": 1.8453, + "mean_token_accuracy": 0.5813031196594238, + "num_tokens": 3334374062.0, + "step": 6522 + }, + { + "epoch": 1.763926446727961, + "grad_norm": 1.3086940050125122, + "learning_rate": 1.5515464292401424e-05, + "loss": 1.9793, + "mean_token_accuracy": 0.5293866395950317, + "num_tokens": 3334898317.0, + "step": 6523 + }, + { + "epoch": 1.7641968631692806, + "grad_norm": 1.019841194152832, + "learning_rate": 1.551410051095394e-05, + "loss": 1.9283, + "mean_token_accuracy": 0.5707463026046753, + "num_tokens": 3335422557.0, + "step": 6524 + }, + { + "epoch": 1.7644672796106002, + "grad_norm": 1.2274874448776245, + "learning_rate": 1.5512736591000583e-05, + "loss": 1.9454, + "mean_token_accuracy": 0.5661947727203369, + "num_tokens": 3335946790.0, + "step": 6525 + }, + { + "epoch": 1.7647376960519199, + "grad_norm": 1.1899333000183105, + "learning_rate": 1.5511372532583196e-05, + "loss": 2.0384, + "mean_token_accuracy": 0.5594350099563599, + "num_tokens": 3336471003.0, + "step": 6526 + }, + { + "epoch": 1.7650081124932395, + "grad_norm": 1.5965033769607544, + "learning_rate": 1.5510008335743627e-05, + "loss": 1.7973, + "mean_token_accuracy": 0.5981614589691162, + "num_tokens": 3336935712.0, + "step": 6527 + }, + { + "epoch": 1.7652785289345592, + "grad_norm": 1.3088483810424805, + "learning_rate": 1.5508644000523744e-05, + "loss": 1.9423, + "mean_token_accuracy": 0.5627470016479492, + "num_tokens": 3337459875.0, + "step": 6528 + }, + { + "epoch": 1.7655489453758788, + "grad_norm": 1.1051151752471924, + "learning_rate": 1.5507279526965403e-05, + "loss": 1.9963, + "mean_token_accuracy": 0.5408544540405273, + "num_tokens": 3337955223.0, + "step": 6529 + }, + { + "epoch": 1.7658193618171985, + "grad_norm": 1.168015956878662, + "learning_rate": 1.550591491511047e-05, + "loss": 1.9849, + "mean_token_accuracy": 0.5462763905525208, + "num_tokens": 3338479398.0, + "step": 6530 + }, + { + "epoch": 1.766089778258518, + "grad_norm": 1.1814672946929932, + "learning_rate": 1.5504550165000816e-05, + "loss": 2.0452, + "mean_token_accuracy": 0.5400367975234985, + "num_tokens": 3339003431.0, + "step": 6531 + }, + { + "epoch": 1.7663601946998377, + "grad_norm": 1.058707594871521, + "learning_rate": 1.5503185276678318e-05, + "loss": 1.9764, + "mean_token_accuracy": 0.5571587681770325, + "num_tokens": 3339486285.0, + "step": 6532 + }, + { + "epoch": 1.7666306111411574, + "grad_norm": 1.3666096925735474, + "learning_rate": 1.5501820250184848e-05, + "loss": 1.9503, + "mean_token_accuracy": 0.5602341890335083, + "num_tokens": 3340010429.0, + "step": 6533 + }, + { + "epoch": 1.766901027582477, + "grad_norm": 1.4539607763290405, + "learning_rate": 1.5500455085562297e-05, + "loss": 1.9368, + "mean_token_accuracy": 0.5598933100700378, + "num_tokens": 3340534637.0, + "step": 6534 + }, + { + "epoch": 1.7671714440237967, + "grad_norm": 1.251356601715088, + "learning_rate": 1.549908978285255e-05, + "loss": 2.0506, + "mean_token_accuracy": 0.539798378944397, + "num_tokens": 3341058871.0, + "step": 6535 + }, + { + "epoch": 1.7674418604651163, + "grad_norm": 1.3082265853881836, + "learning_rate": 1.5497724342097496e-05, + "loss": 2.0361, + "mean_token_accuracy": 0.5423625111579895, + "num_tokens": 3341583075.0, + "step": 6536 + }, + { + "epoch": 1.767712276906436, + "grad_norm": 1.2569563388824463, + "learning_rate": 1.5496358763339036e-05, + "loss": 1.8538, + "mean_token_accuracy": 0.5858319997787476, + "num_tokens": 3342107220.0, + "step": 6537 + }, + { + "epoch": 1.7679826933477556, + "grad_norm": 1.278234839439392, + "learning_rate": 1.549499304661906e-05, + "loss": 2.0267, + "mean_token_accuracy": 0.536017656326294, + "num_tokens": 3342575991.0, + "step": 6538 + }, + { + "epoch": 1.7682531097890752, + "grad_norm": 1.3556333780288696, + "learning_rate": 1.549362719197948e-05, + "loss": 1.8825, + "mean_token_accuracy": 0.563115119934082, + "num_tokens": 3343100267.0, + "step": 6539 + }, + { + "epoch": 1.7685235262303949, + "grad_norm": 1.2635490894317627, + "learning_rate": 1.5492261199462202e-05, + "loss": 1.8763, + "mean_token_accuracy": 0.5642042756080627, + "num_tokens": 3343624468.0, + "step": 6540 + }, + { + "epoch": 1.7687939426717145, + "grad_norm": 0.5530900359153748, + "learning_rate": 1.5490895069109144e-05, + "loss": 1.1088, + "mean_token_accuracy": 0.6991591453552246, + "num_tokens": 3344148746.0, + "step": 6541 + }, + { + "epoch": 1.7690643591130342, + "grad_norm": 1.4950402975082397, + "learning_rate": 1.5489528800962217e-05, + "loss": 1.829, + "mean_token_accuracy": 0.5739032030105591, + "num_tokens": 3344672981.0, + "step": 6542 + }, + { + "epoch": 1.7693347755543538, + "grad_norm": 1.6938238143920898, + "learning_rate": 1.5488162395063344e-05, + "loss": 2.0653, + "mean_token_accuracy": 0.526642918586731, + "num_tokens": 3345197105.0, + "step": 6543 + }, + { + "epoch": 1.7696051919956735, + "grad_norm": 1.3320109844207764, + "learning_rate": 1.5486795851454448e-05, + "loss": 1.9614, + "mean_token_accuracy": 0.5496504306793213, + "num_tokens": 3345721287.0, + "step": 6544 + }, + { + "epoch": 1.769875608436993, + "grad_norm": 1.587982416152954, + "learning_rate": 1.5485429170177462e-05, + "loss": 1.9777, + "mean_token_accuracy": 0.5571675300598145, + "num_tokens": 3346245484.0, + "step": 6545 + }, + { + "epoch": 1.7701460248783127, + "grad_norm": 1.4300353527069092, + "learning_rate": 1.5484062351274317e-05, + "loss": 1.9934, + "mean_token_accuracy": 0.5603511333465576, + "num_tokens": 3346769674.0, + "step": 6546 + }, + { + "epoch": 1.7704164413196324, + "grad_norm": 1.5030651092529297, + "learning_rate": 1.5482695394786952e-05, + "loss": 1.9333, + "mean_token_accuracy": 0.5644708871841431, + "num_tokens": 3347235656.0, + "step": 6547 + }, + { + "epoch": 1.770686857760952, + "grad_norm": 1.581692099571228, + "learning_rate": 1.5481328300757307e-05, + "loss": 1.9849, + "mean_token_accuracy": 0.5630044341087341, + "num_tokens": 3347751340.0, + "step": 6548 + }, + { + "epoch": 1.7709572742022714, + "grad_norm": 1.2311768531799316, + "learning_rate": 1.5479961069227336e-05, + "loss": 2.0299, + "mean_token_accuracy": 0.5550536513328552, + "num_tokens": 3348237148.0, + "step": 6549 + }, + { + "epoch": 1.771227690643591, + "grad_norm": 1.434154987335205, + "learning_rate": 1.5478593700238985e-05, + "loss": 1.9177, + "mean_token_accuracy": 0.5687692165374756, + "num_tokens": 3348761158.0, + "step": 6550 + }, + { + "epoch": 1.7714981070849107, + "grad_norm": 1.2028855085372925, + "learning_rate": 1.5477226193834206e-05, + "loss": 1.9248, + "mean_token_accuracy": 0.5587626695632935, + "num_tokens": 3349267896.0, + "step": 6551 + }, + { + "epoch": 1.7717685235262304, + "grad_norm": 1.1159725189208984, + "learning_rate": 1.547585855005496e-05, + "loss": 2.0273, + "mean_token_accuracy": 0.5433337688446045, + "num_tokens": 3349792113.0, + "step": 6552 + }, + { + "epoch": 1.77203893996755, + "grad_norm": 1.7839422225952148, + "learning_rate": 1.547449076894321e-05, + "loss": 1.8993, + "mean_token_accuracy": 0.5724575519561768, + "num_tokens": 3350316226.0, + "step": 6553 + }, + { + "epoch": 1.7723093564088697, + "grad_norm": 1.5733290910720825, + "learning_rate": 1.547312285054093e-05, + "loss": 2.0889, + "mean_token_accuracy": 0.5505926609039307, + "num_tokens": 3350781087.0, + "step": 6554 + }, + { + "epoch": 1.7725797728501893, + "grad_norm": 1.5931947231292725, + "learning_rate": 1.5471754794890084e-05, + "loss": 2.0912, + "mean_token_accuracy": 0.5321121215820312, + "num_tokens": 3351305336.0, + "step": 6555 + }, + { + "epoch": 1.772850189291509, + "grad_norm": 1.3871535062789917, + "learning_rate": 1.5470386602032647e-05, + "loss": 2.0059, + "mean_token_accuracy": 0.5414496660232544, + "num_tokens": 3351829600.0, + "step": 6556 + }, + { + "epoch": 1.7731206057328286, + "grad_norm": 1.299500584602356, + "learning_rate": 1.5469018272010607e-05, + "loss": 1.9012, + "mean_token_accuracy": 0.5561543703079224, + "num_tokens": 3352352724.0, + "step": 6557 + }, + { + "epoch": 1.7733910221741482, + "grad_norm": 1.2473186254501343, + "learning_rate": 1.5467649804865944e-05, + "loss": 1.9118, + "mean_token_accuracy": 0.5585243701934814, + "num_tokens": 3352876989.0, + "step": 6558 + }, + { + "epoch": 1.7736614386154677, + "grad_norm": 1.4055548906326294, + "learning_rate": 1.546628120064064e-05, + "loss": 1.9911, + "mean_token_accuracy": 0.5594956278800964, + "num_tokens": 3353401109.0, + "step": 6559 + }, + { + "epoch": 1.7739318550567873, + "grad_norm": 1.644737958908081, + "learning_rate": 1.54649124593767e-05, + "loss": 1.9969, + "mean_token_accuracy": 0.5484383702278137, + "num_tokens": 3353925307.0, + "step": 6560 + }, + { + "epoch": 1.774202271498107, + "grad_norm": 0.5865132808685303, + "learning_rate": 1.546354358111611e-05, + "loss": 1.1175, + "mean_token_accuracy": 0.6989141702651978, + "num_tokens": 3354449548.0, + "step": 6561 + }, + { + "epoch": 1.7744726879394266, + "grad_norm": 1.6878249645233154, + "learning_rate": 1.5462174565900884e-05, + "loss": 2.0605, + "mean_token_accuracy": 0.5550966262817383, + "num_tokens": 3354908673.0, + "step": 6562 + }, + { + "epoch": 1.7747431043807462, + "grad_norm": 1.4636707305908203, + "learning_rate": 1.5460805413773016e-05, + "loss": 1.9053, + "mean_token_accuracy": 0.5712231397628784, + "num_tokens": 3355432876.0, + "step": 6563 + }, + { + "epoch": 1.7750135208220659, + "grad_norm": 1.2323302030563354, + "learning_rate": 1.5459436124774518e-05, + "loss": 1.9444, + "mean_token_accuracy": 0.5628161430358887, + "num_tokens": 3355956864.0, + "step": 6564 + }, + { + "epoch": 1.7752839372633855, + "grad_norm": 1.0952988862991333, + "learning_rate": 1.545806669894741e-05, + "loss": 1.8963, + "mean_token_accuracy": 0.5738223791122437, + "num_tokens": 3356419948.0, + "step": 6565 + }, + { + "epoch": 1.7755543537047052, + "grad_norm": 1.1025409698486328, + "learning_rate": 1.5456697136333703e-05, + "loss": 1.7945, + "mean_token_accuracy": 0.5701251029968262, + "num_tokens": 3356944179.0, + "step": 6566 + }, + { + "epoch": 1.7758247701460248, + "grad_norm": 1.3365142345428467, + "learning_rate": 1.545532743697542e-05, + "loss": 1.9324, + "mean_token_accuracy": 0.563054621219635, + "num_tokens": 3357468416.0, + "step": 6567 + }, + { + "epoch": 1.7760951865873444, + "grad_norm": 1.2051657438278198, + "learning_rate": 1.5453957600914592e-05, + "loss": 2.0166, + "mean_token_accuracy": 0.5357259511947632, + "num_tokens": 3357992596.0, + "step": 6568 + }, + { + "epoch": 1.776365603028664, + "grad_norm": 1.117351770401001, + "learning_rate": 1.5452587628193243e-05, + "loss": 1.9748, + "mean_token_accuracy": 0.5637217164039612, + "num_tokens": 3358516832.0, + "step": 6569 + }, + { + "epoch": 1.7766360194699837, + "grad_norm": 1.3980273008346558, + "learning_rate": 1.5451217518853415e-05, + "loss": 1.9806, + "mean_token_accuracy": 0.5278580784797668, + "num_tokens": 3359041023.0, + "step": 6570 + }, + { + "epoch": 1.7769064359113034, + "grad_norm": 1.4360241889953613, + "learning_rate": 1.544984727293714e-05, + "loss": 2.0215, + "mean_token_accuracy": 0.5542970895767212, + "num_tokens": 3359557878.0, + "step": 6571 + }, + { + "epoch": 1.777176852352623, + "grad_norm": 1.0370882749557495, + "learning_rate": 1.5448476890486466e-05, + "loss": 1.9405, + "mean_token_accuracy": 0.5655244588851929, + "num_tokens": 3360082070.0, + "step": 6572 + }, + { + "epoch": 1.7774472687939427, + "grad_norm": 1.519578456878662, + "learning_rate": 1.544710637154344e-05, + "loss": 2.1004, + "mean_token_accuracy": 0.5463653802871704, + "num_tokens": 3360571808.0, + "step": 6573 + }, + { + "epoch": 1.7777176852352623, + "grad_norm": 1.2886556386947632, + "learning_rate": 1.544573571615011e-05, + "loss": 1.9139, + "mean_token_accuracy": 0.561591386795044, + "num_tokens": 3361095998.0, + "step": 6574 + }, + { + "epoch": 1.777988101676582, + "grad_norm": 1.0731933116912842, + "learning_rate": 1.5444364924348535e-05, + "loss": 1.9834, + "mean_token_accuracy": 0.539000391960144, + "num_tokens": 3361620230.0, + "step": 6575 + }, + { + "epoch": 1.7782585181179016, + "grad_norm": 1.5797183513641357, + "learning_rate": 1.5442993996180774e-05, + "loss": 1.8891, + "mean_token_accuracy": 0.5662053823471069, + "num_tokens": 3362114642.0, + "step": 6576 + }, + { + "epoch": 1.7785289345592212, + "grad_norm": 1.4727942943572998, + "learning_rate": 1.544162293168889e-05, + "loss": 1.9777, + "mean_token_accuracy": 0.5312116146087646, + "num_tokens": 3362638830.0, + "step": 6577 + }, + { + "epoch": 1.7787993510005409, + "grad_norm": 1.1372902393341064, + "learning_rate": 1.5440251730914957e-05, + "loss": 1.9595, + "mean_token_accuracy": 0.5535526871681213, + "num_tokens": 3363163073.0, + "step": 6578 + }, + { + "epoch": 1.7790697674418605, + "grad_norm": 1.254486322402954, + "learning_rate": 1.543888039390104e-05, + "loss": 1.8321, + "mean_token_accuracy": 0.5684984922409058, + "num_tokens": 3363651103.0, + "step": 6579 + }, + { + "epoch": 1.7793401838831802, + "grad_norm": 1.4521206617355347, + "learning_rate": 1.543750892068922e-05, + "loss": 1.8539, + "mean_token_accuracy": 0.57307368516922, + "num_tokens": 3364174305.0, + "step": 6580 + }, + { + "epoch": 1.7796106003244998, + "grad_norm": 0.6120886206626892, + "learning_rate": 1.5436137311321573e-05, + "loss": 1.1889, + "mean_token_accuracy": 0.6891819834709167, + "num_tokens": 3364698555.0, + "step": 6581 + }, + { + "epoch": 1.7798810167658194, + "grad_norm": 1.689375877380371, + "learning_rate": 1.5434765565840192e-05, + "loss": 1.9954, + "mean_token_accuracy": 0.5696951150894165, + "num_tokens": 3365222826.0, + "step": 6582 + }, + { + "epoch": 1.780151433207139, + "grad_norm": 1.392848014831543, + "learning_rate": 1.543339368428716e-05, + "loss": 1.9644, + "mean_token_accuracy": 0.5491414070129395, + "num_tokens": 3365747106.0, + "step": 6583 + }, + { + "epoch": 1.7804218496484587, + "grad_norm": 1.1627562046051025, + "learning_rate": 1.5432021666704567e-05, + "loss": 1.8736, + "mean_token_accuracy": 0.5885913372039795, + "num_tokens": 3366212013.0, + "step": 6584 + }, + { + "epoch": 1.7806922660897784, + "grad_norm": 1.6163045167922974, + "learning_rate": 1.5430649513134523e-05, + "loss": 2.0241, + "mean_token_accuracy": 0.5468019247055054, + "num_tokens": 3366736230.0, + "step": 6585 + }, + { + "epoch": 1.780962682531098, + "grad_norm": 1.3244212865829468, + "learning_rate": 1.542927722361912e-05, + "loss": 1.9724, + "mean_token_accuracy": 0.5520936846733093, + "num_tokens": 3367260492.0, + "step": 6586 + }, + { + "epoch": 1.7812330989724177, + "grad_norm": 1.3469825983047485, + "learning_rate": 1.542790479820046e-05, + "loss": 2.0277, + "mean_token_accuracy": 0.5527476072311401, + "num_tokens": 3367738666.0, + "step": 6587 + }, + { + "epoch": 1.7815035154137373, + "grad_norm": 1.3119608163833618, + "learning_rate": 1.5426532236920665e-05, + "loss": 1.8984, + "mean_token_accuracy": 0.5725516080856323, + "num_tokens": 3368262911.0, + "step": 6588 + }, + { + "epoch": 1.781773931855057, + "grad_norm": 1.2542002201080322, + "learning_rate": 1.5425159539821846e-05, + "loss": 1.9373, + "mean_token_accuracy": 0.5509768724441528, + "num_tokens": 3368787092.0, + "step": 6589 + }, + { + "epoch": 1.7820443482963764, + "grad_norm": 1.2887158393859863, + "learning_rate": 1.5423786706946113e-05, + "loss": 1.9706, + "mean_token_accuracy": 0.5688525438308716, + "num_tokens": 3369247844.0, + "step": 6590 + }, + { + "epoch": 1.782314764737696, + "grad_norm": 1.5281143188476562, + "learning_rate": 1.5422413738335597e-05, + "loss": 1.9302, + "mean_token_accuracy": 0.5579906702041626, + "num_tokens": 3369726127.0, + "step": 6591 + }, + { + "epoch": 1.7825851811790157, + "grad_norm": 1.8282870054244995, + "learning_rate": 1.5421040634032424e-05, + "loss": 1.8695, + "mean_token_accuracy": 0.6073547601699829, + "num_tokens": 3370247982.0, + "step": 6592 + }, + { + "epoch": 1.7828555976203353, + "grad_norm": 1.2983981370925903, + "learning_rate": 1.5419667394078718e-05, + "loss": 1.9896, + "mean_token_accuracy": 0.5620632171630859, + "num_tokens": 3370698769.0, + "step": 6593 + }, + { + "epoch": 1.783126014061655, + "grad_norm": 1.2406086921691895, + "learning_rate": 1.5418294018516623e-05, + "loss": 1.8539, + "mean_token_accuracy": 0.5735626816749573, + "num_tokens": 3371223000.0, + "step": 6594 + }, + { + "epoch": 1.7833964305029746, + "grad_norm": 1.591210126876831, + "learning_rate": 1.541692050738827e-05, + "loss": 1.8618, + "mean_token_accuracy": 0.5645388960838318, + "num_tokens": 3371709487.0, + "step": 6595 + }, + { + "epoch": 1.7836668469442942, + "grad_norm": 1.6898146867752075, + "learning_rate": 1.5415546860735813e-05, + "loss": 2.0349, + "mean_token_accuracy": 0.5440611839294434, + "num_tokens": 3372233673.0, + "step": 6596 + }, + { + "epoch": 1.7839372633856139, + "grad_norm": 1.3992964029312134, + "learning_rate": 1.541417307860139e-05, + "loss": 1.871, + "mean_token_accuracy": 0.5742173194885254, + "num_tokens": 3372706265.0, + "step": 6597 + }, + { + "epoch": 1.7842076798269335, + "grad_norm": 1.3318586349487305, + "learning_rate": 1.5412799161027153e-05, + "loss": 1.8484, + "mean_token_accuracy": 0.569159746170044, + "num_tokens": 3373230391.0, + "step": 6598 + }, + { + "epoch": 1.7844780962682532, + "grad_norm": 1.221846342086792, + "learning_rate": 1.5411425108055266e-05, + "loss": 1.8713, + "mean_token_accuracy": 0.5801805257797241, + "num_tokens": 3373754602.0, + "step": 6599 + }, + { + "epoch": 1.7847485127095726, + "grad_norm": 1.3345377445220947, + "learning_rate": 1.541005091972788e-05, + "loss": 2.0094, + "mean_token_accuracy": 0.5513837337493896, + "num_tokens": 3374278855.0, + "step": 6600 + }, + { + "epoch": 1.7850189291508922, + "grad_norm": 0.7235909700393677, + "learning_rate": 1.5408676596087163e-05, + "loss": 1.1331, + "mean_token_accuracy": 0.7007335424423218, + "num_tokens": 3374802936.0, + "step": 6601 + }, + { + "epoch": 1.7852893455922119, + "grad_norm": 1.8934024572372437, + "learning_rate": 1.540730213717528e-05, + "loss": 1.8866, + "mean_token_accuracy": 0.5625733733177185, + "num_tokens": 3375327164.0, + "step": 6602 + }, + { + "epoch": 1.7855597620335315, + "grad_norm": 1.6124995946884155, + "learning_rate": 1.540592754303441e-05, + "loss": 1.9364, + "mean_token_accuracy": 0.556983232498169, + "num_tokens": 3375818097.0, + "step": 6603 + }, + { + "epoch": 1.7858301784748511, + "grad_norm": 1.3333390951156616, + "learning_rate": 1.5404552813706728e-05, + "loss": 1.9713, + "mean_token_accuracy": 0.5623637437820435, + "num_tokens": 3376263630.0, + "step": 6604 + }, + { + "epoch": 1.7861005949161708, + "grad_norm": 1.5082249641418457, + "learning_rate": 1.540317794923441e-05, + "loss": 2.0621, + "mean_token_accuracy": 0.5391088724136353, + "num_tokens": 3376787808.0, + "step": 6605 + }, + { + "epoch": 1.7863710113574904, + "grad_norm": 1.3960065841674805, + "learning_rate": 1.540180294965964e-05, + "loss": 1.8984, + "mean_token_accuracy": 0.5523195862770081, + "num_tokens": 3377311895.0, + "step": 6606 + }, + { + "epoch": 1.78664142779881, + "grad_norm": 1.4586657285690308, + "learning_rate": 1.5400427815024615e-05, + "loss": 1.8596, + "mean_token_accuracy": 0.5985826253890991, + "num_tokens": 3377836169.0, + "step": 6607 + }, + { + "epoch": 1.7869118442401297, + "grad_norm": 1.734322190284729, + "learning_rate": 1.5399052545371523e-05, + "loss": 2.1135, + "mean_token_accuracy": 0.5410147905349731, + "num_tokens": 3378360409.0, + "step": 6608 + }, + { + "epoch": 1.7871822606814494, + "grad_norm": 1.3746562004089355, + "learning_rate": 1.5397677140742565e-05, + "loss": 1.989, + "mean_token_accuracy": 0.543473482131958, + "num_tokens": 3378884692.0, + "step": 6609 + }, + { + "epoch": 1.787452677122769, + "grad_norm": 1.2441086769104004, + "learning_rate": 1.5396301601179937e-05, + "loss": 2.0334, + "mean_token_accuracy": 0.5525943636894226, + "num_tokens": 3379408967.0, + "step": 6610 + }, + { + "epoch": 1.7877230935640886, + "grad_norm": 1.36886465549469, + "learning_rate": 1.5394925926725845e-05, + "loss": 1.927, + "mean_token_accuracy": 0.5514781475067139, + "num_tokens": 3379933216.0, + "step": 6611 + }, + { + "epoch": 1.7879935100054083, + "grad_norm": 1.1554560661315918, + "learning_rate": 1.53935501174225e-05, + "loss": 1.9363, + "mean_token_accuracy": 0.5760505199432373, + "num_tokens": 3380412728.0, + "step": 6612 + }, + { + "epoch": 1.788263926446728, + "grad_norm": 1.2821321487426758, + "learning_rate": 1.5392174173312123e-05, + "loss": 2.0603, + "mean_token_accuracy": 0.5506448745727539, + "num_tokens": 3380936999.0, + "step": 6613 + }, + { + "epoch": 1.7885343428880476, + "grad_norm": 1.3801076412200928, + "learning_rate": 1.5390798094436923e-05, + "loss": 2.0093, + "mean_token_accuracy": 0.5430852174758911, + "num_tokens": 3381425359.0, + "step": 6614 + }, + { + "epoch": 1.7888047593293672, + "grad_norm": 1.127273678779602, + "learning_rate": 1.5389421880839123e-05, + "loss": 1.9489, + "mean_token_accuracy": 0.5690441131591797, + "num_tokens": 3381949637.0, + "step": 6615 + }, + { + "epoch": 1.7890751757706869, + "grad_norm": 1.311392068862915, + "learning_rate": 1.5388045532560953e-05, + "loss": 1.9876, + "mean_token_accuracy": 0.5630598068237305, + "num_tokens": 3382444938.0, + "step": 6616 + }, + { + "epoch": 1.7893455922120065, + "grad_norm": 1.1786532402038574, + "learning_rate": 1.5386669049644643e-05, + "loss": 1.8473, + "mean_token_accuracy": 0.5653701424598694, + "num_tokens": 3382969122.0, + "step": 6617 + }, + { + "epoch": 1.7896160086533262, + "grad_norm": 1.221138596534729, + "learning_rate": 1.5385292432132423e-05, + "loss": 2.0178, + "mean_token_accuracy": 0.5386872291564941, + "num_tokens": 3383493042.0, + "step": 6618 + }, + { + "epoch": 1.7898864250946458, + "grad_norm": 1.62802255153656, + "learning_rate": 1.5383915680066536e-05, + "loss": 2.0789, + "mean_token_accuracy": 0.5651520490646362, + "num_tokens": 3383996057.0, + "step": 6619 + }, + { + "epoch": 1.7901568415359654, + "grad_norm": 1.2875947952270508, + "learning_rate": 1.5382538793489224e-05, + "loss": 1.9721, + "mean_token_accuracy": 0.5652768611907959, + "num_tokens": 3384465992.0, + "step": 6620 + }, + { + "epoch": 1.790427257977285, + "grad_norm": 0.744225025177002, + "learning_rate": 1.5381161772442736e-05, + "loss": 1.1228, + "mean_token_accuracy": 0.7061904072761536, + "num_tokens": 3384892050.0, + "step": 6621 + }, + { + "epoch": 1.7906976744186047, + "grad_norm": 2.493149995803833, + "learning_rate": 1.537978461696932e-05, + "loss": 1.8754, + "mean_token_accuracy": 0.5749199390411377, + "num_tokens": 3385366330.0, + "step": 6622 + }, + { + "epoch": 1.7909680908599244, + "grad_norm": 2.376643419265747, + "learning_rate": 1.537840732711123e-05, + "loss": 1.9028, + "mean_token_accuracy": 0.5701552629470825, + "num_tokens": 3385869002.0, + "step": 6623 + }, + { + "epoch": 1.791238507301244, + "grad_norm": 1.5569138526916504, + "learning_rate": 1.5377029902910726e-05, + "loss": 1.7741, + "mean_token_accuracy": 0.5986963510513306, + "num_tokens": 3386393284.0, + "step": 6624 + }, + { + "epoch": 1.7915089237425637, + "grad_norm": 1.4973005056381226, + "learning_rate": 1.5375652344410076e-05, + "loss": 1.998, + "mean_token_accuracy": 0.5443413257598877, + "num_tokens": 3386917531.0, + "step": 6625 + }, + { + "epoch": 1.7917793401838833, + "grad_norm": 1.7996680736541748, + "learning_rate": 1.5374274651651542e-05, + "loss": 1.9441, + "mean_token_accuracy": 0.5578396320343018, + "num_tokens": 3387441547.0, + "step": 6626 + }, + { + "epoch": 1.792049756625203, + "grad_norm": 1.5784940719604492, + "learning_rate": 1.5372896824677395e-05, + "loss": 1.9683, + "mean_token_accuracy": 0.5528031587600708, + "num_tokens": 3387965788.0, + "step": 6627 + }, + { + "epoch": 1.7923201730665226, + "grad_norm": 1.3903369903564453, + "learning_rate": 1.537151886352992e-05, + "loss": 2.0446, + "mean_token_accuracy": 0.552521824836731, + "num_tokens": 3388456326.0, + "step": 6628 + }, + { + "epoch": 1.7925905895078422, + "grad_norm": 1.9775943756103516, + "learning_rate": 1.5370140768251385e-05, + "loss": 1.9826, + "mean_token_accuracy": 0.5557991862297058, + "num_tokens": 3388973382.0, + "step": 6629 + }, + { + "epoch": 1.7928610059491619, + "grad_norm": 1.7286629676818848, + "learning_rate": 1.5368762538884083e-05, + "loss": 1.9038, + "mean_token_accuracy": 0.5631037950515747, + "num_tokens": 3389497539.0, + "step": 6630 + }, + { + "epoch": 1.7931314223904813, + "grad_norm": 1.3947598934173584, + "learning_rate": 1.5367384175470296e-05, + "loss": 1.8786, + "mean_token_accuracy": 0.5581676959991455, + "num_tokens": 3390021799.0, + "step": 6631 + }, + { + "epoch": 1.793401838831801, + "grad_norm": 1.705522894859314, + "learning_rate": 1.536600567805232e-05, + "loss": 1.9345, + "mean_token_accuracy": 0.5466996431350708, + "num_tokens": 3390545969.0, + "step": 6632 + }, + { + "epoch": 1.7936722552731206, + "grad_norm": 1.7593882083892822, + "learning_rate": 1.536462704667245e-05, + "loss": 2.0161, + "mean_token_accuracy": 0.5639675259590149, + "num_tokens": 3391008394.0, + "step": 6633 + }, + { + "epoch": 1.7939426717144402, + "grad_norm": 1.2687853574752808, + "learning_rate": 1.5363248281372988e-05, + "loss": 2.0702, + "mean_token_accuracy": 0.53194659948349, + "num_tokens": 3391532677.0, + "step": 6634 + }, + { + "epoch": 1.7942130881557599, + "grad_norm": 1.5622448921203613, + "learning_rate": 1.5361869382196236e-05, + "loss": 1.982, + "mean_token_accuracy": 0.5434001684188843, + "num_tokens": 3392056802.0, + "step": 6635 + }, + { + "epoch": 1.7944835045970795, + "grad_norm": 1.6807548999786377, + "learning_rate": 1.5360490349184503e-05, + "loss": 1.993, + "mean_token_accuracy": 0.5558863282203674, + "num_tokens": 3392581056.0, + "step": 6636 + }, + { + "epoch": 1.7947539210383991, + "grad_norm": 1.1336511373519897, + "learning_rate": 1.5359111182380105e-05, + "loss": 2.009, + "mean_token_accuracy": 0.5862270593643188, + "num_tokens": 3393040717.0, + "step": 6637 + }, + { + "epoch": 1.7950243374797188, + "grad_norm": 1.2571619749069214, + "learning_rate": 1.5357731881825353e-05, + "loss": 1.8541, + "mean_token_accuracy": 0.5576276779174805, + "num_tokens": 3393527511.0, + "step": 6638 + }, + { + "epoch": 1.7952947539210384, + "grad_norm": 1.7193318605422974, + "learning_rate": 1.5356352447562573e-05, + "loss": 1.9635, + "mean_token_accuracy": 0.549762487411499, + "num_tokens": 3394051623.0, + "step": 6639 + }, + { + "epoch": 1.795565170362358, + "grad_norm": 1.5130845308303833, + "learning_rate": 1.535497287963409e-05, + "loss": 2.0104, + "mean_token_accuracy": 0.5470789670944214, + "num_tokens": 3394575776.0, + "step": 6640 + }, + { + "epoch": 1.7958355868036775, + "grad_norm": 0.6783888339996338, + "learning_rate": 1.5353593178082232e-05, + "loss": 1.134, + "mean_token_accuracy": 0.7025745511054993, + "num_tokens": 3395099959.0, + "step": 6641 + }, + { + "epoch": 1.7961060032449971, + "grad_norm": 1.6916266679763794, + "learning_rate": 1.5352213342949333e-05, + "loss": 1.8466, + "mean_token_accuracy": 0.5871495008468628, + "num_tokens": 3395624219.0, + "step": 6642 + }, + { + "epoch": 1.7963764196863168, + "grad_norm": 1.4784412384033203, + "learning_rate": 1.535083337427773e-05, + "loss": 1.9177, + "mean_token_accuracy": 0.5616098046302795, + "num_tokens": 3396120686.0, + "step": 6643 + }, + { + "epoch": 1.7966468361276364, + "grad_norm": 1.2395987510681152, + "learning_rate": 1.534945327210976e-05, + "loss": 2.0381, + "mean_token_accuracy": 0.5414080619812012, + "num_tokens": 3396644877.0, + "step": 6644 + }, + { + "epoch": 1.796917252568956, + "grad_norm": 1.457541823387146, + "learning_rate": 1.5348073036487773e-05, + "loss": 1.9868, + "mean_token_accuracy": 0.5593281984329224, + "num_tokens": 3397169155.0, + "step": 6645 + }, + { + "epoch": 1.7971876690102757, + "grad_norm": 1.1443276405334473, + "learning_rate": 1.534669266745412e-05, + "loss": 1.8945, + "mean_token_accuracy": 0.5544084310531616, + "num_tokens": 3397693257.0, + "step": 6646 + }, + { + "epoch": 1.7974580854515954, + "grad_norm": 1.235564112663269, + "learning_rate": 1.534531216505116e-05, + "loss": 1.8983, + "mean_token_accuracy": 0.5628345012664795, + "num_tokens": 3398211930.0, + "step": 6647 + }, + { + "epoch": 1.797728501892915, + "grad_norm": 1.109215259552002, + "learning_rate": 1.5343931529321238e-05, + "loss": 1.8913, + "mean_token_accuracy": 0.5535395741462708, + "num_tokens": 3398735995.0, + "step": 6648 + }, + { + "epoch": 1.7979989183342346, + "grad_norm": 1.1447654962539673, + "learning_rate": 1.534255076030672e-05, + "loss": 1.985, + "mean_token_accuracy": 0.5507221817970276, + "num_tokens": 3399260244.0, + "step": 6649 + }, + { + "epoch": 1.7982693347755543, + "grad_norm": 1.1418590545654297, + "learning_rate": 1.5341169858049976e-05, + "loss": 1.8439, + "mean_token_accuracy": 0.5628135800361633, + "num_tokens": 3399784528.0, + "step": 6650 + }, + { + "epoch": 1.798539751216874, + "grad_norm": 1.371317744255066, + "learning_rate": 1.5339788822593376e-05, + "loss": 2.0149, + "mean_token_accuracy": 0.5564104914665222, + "num_tokens": 3400308715.0, + "step": 6651 + }, + { + "epoch": 1.7988101676581936, + "grad_norm": 1.2331428527832031, + "learning_rate": 1.5338407653979286e-05, + "loss": 1.8613, + "mean_token_accuracy": 0.565223753452301, + "num_tokens": 3400832934.0, + "step": 6652 + }, + { + "epoch": 1.7990805840995132, + "grad_norm": 1.552372694015503, + "learning_rate": 1.5337026352250096e-05, + "loss": 2.0441, + "mean_token_accuracy": 0.5372267365455627, + "num_tokens": 3401357206.0, + "step": 6653 + }, + { + "epoch": 1.7993510005408329, + "grad_norm": 1.3753507137298584, + "learning_rate": 1.5335644917448185e-05, + "loss": 1.9773, + "mean_token_accuracy": 0.5547335743904114, + "num_tokens": 3401856628.0, + "step": 6654 + }, + { + "epoch": 1.7996214169821525, + "grad_norm": 1.4657903909683228, + "learning_rate": 1.533426334961593e-05, + "loss": 2.0082, + "mean_token_accuracy": 0.5326486825942993, + "num_tokens": 3402380904.0, + "step": 6655 + }, + { + "epoch": 1.7998918334234721, + "grad_norm": 1.1446560621261597, + "learning_rate": 1.5332881648795736e-05, + "loss": 1.9607, + "mean_token_accuracy": 0.565934419631958, + "num_tokens": 3402841121.0, + "step": 6656 + }, + { + "epoch": 1.8001622498647918, + "grad_norm": 1.1947020292282104, + "learning_rate": 1.533149981502999e-05, + "loss": 1.98, + "mean_token_accuracy": 0.5544584393501282, + "num_tokens": 3403365210.0, + "step": 6657 + }, + { + "epoch": 1.8004326663061114, + "grad_norm": 1.7020463943481445, + "learning_rate": 1.5330117848361092e-05, + "loss": 1.9946, + "mean_token_accuracy": 0.5630838871002197, + "num_tokens": 3403828069.0, + "step": 6658 + }, + { + "epoch": 1.800703082747431, + "grad_norm": 1.5176365375518799, + "learning_rate": 1.5328735748831444e-05, + "loss": 1.8831, + "mean_token_accuracy": 0.5694385170936584, + "num_tokens": 3404339714.0, + "step": 6659 + }, + { + "epoch": 1.8009734991887507, + "grad_norm": 1.344157338142395, + "learning_rate": 1.532735351648345e-05, + "loss": 1.9788, + "mean_token_accuracy": 0.5686874389648438, + "num_tokens": 3404856274.0, + "step": 6660 + }, + { + "epoch": 1.8012439156300704, + "grad_norm": 0.6679907441139221, + "learning_rate": 1.532597115135953e-05, + "loss": 1.1068, + "mean_token_accuracy": 0.7077267169952393, + "num_tokens": 3405380561.0, + "step": 6661 + }, + { + "epoch": 1.80151433207139, + "grad_norm": 2.4575939178466797, + "learning_rate": 1.5324588653502086e-05, + "loss": 1.9465, + "mean_token_accuracy": 0.535947322845459, + "num_tokens": 3405904805.0, + "step": 6662 + }, + { + "epoch": 1.8017847485127096, + "grad_norm": 1.5657470226287842, + "learning_rate": 1.532320602295355e-05, + "loss": 1.7916, + "mean_token_accuracy": 0.5917340517044067, + "num_tokens": 3406429074.0, + "step": 6663 + }, + { + "epoch": 1.8020551649540293, + "grad_norm": 1.468220591545105, + "learning_rate": 1.5321823259756337e-05, + "loss": 2.0116, + "mean_token_accuracy": 0.5274105072021484, + "num_tokens": 3406953337.0, + "step": 6664 + }, + { + "epoch": 1.802325581395349, + "grad_norm": 2.1045823097229004, + "learning_rate": 1.532044036395288e-05, + "loss": 2.0085, + "mean_token_accuracy": 0.548606812953949, + "num_tokens": 3407477590.0, + "step": 6665 + }, + { + "epoch": 1.8025959978366686, + "grad_norm": 1.2169594764709473, + "learning_rate": 1.53190573355856e-05, + "loss": 1.8674, + "mean_token_accuracy": 0.5654997825622559, + "num_tokens": 3408001773.0, + "step": 6666 + }, + { + "epoch": 1.8028664142779882, + "grad_norm": 1.6662214994430542, + "learning_rate": 1.5317674174696943e-05, + "loss": 1.9911, + "mean_token_accuracy": 0.5488111972808838, + "num_tokens": 3408526043.0, + "step": 6667 + }, + { + "epoch": 1.8031368307193079, + "grad_norm": 1.6373119354248047, + "learning_rate": 1.5316290881329346e-05, + "loss": 1.877, + "mean_token_accuracy": 0.5567160248756409, + "num_tokens": 3409050260.0, + "step": 6668 + }, + { + "epoch": 1.8034072471606275, + "grad_norm": 1.3141156435012817, + "learning_rate": 1.5314907455525253e-05, + "loss": 1.9247, + "mean_token_accuracy": 0.5511348843574524, + "num_tokens": 3409574521.0, + "step": 6669 + }, + { + "epoch": 1.8036776636019471, + "grad_norm": 1.4796867370605469, + "learning_rate": 1.531352389732711e-05, + "loss": 1.9503, + "mean_token_accuracy": 0.5463719367980957, + "num_tokens": 3410098619.0, + "step": 6670 + }, + { + "epoch": 1.8039480800432668, + "grad_norm": 1.697352409362793, + "learning_rate": 1.5312140206777363e-05, + "loss": 1.9786, + "mean_token_accuracy": 0.5497203469276428, + "num_tokens": 3410622874.0, + "step": 6671 + }, + { + "epoch": 1.8042184964845862, + "grad_norm": 1.134531855583191, + "learning_rate": 1.531075638391848e-05, + "loss": 1.9162, + "mean_token_accuracy": 0.5543289184570312, + "num_tokens": 3411147032.0, + "step": 6672 + }, + { + "epoch": 1.8044889129259059, + "grad_norm": 1.4691601991653442, + "learning_rate": 1.530937242879291e-05, + "loss": 1.8507, + "mean_token_accuracy": 0.5713980793952942, + "num_tokens": 3411671189.0, + "step": 6673 + }, + { + "epoch": 1.8047593293672255, + "grad_norm": 1.6405770778656006, + "learning_rate": 1.5307988341443118e-05, + "loss": 1.9348, + "mean_token_accuracy": 0.5595271587371826, + "num_tokens": 3412195302.0, + "step": 6674 + }, + { + "epoch": 1.8050297458085451, + "grad_norm": 1.4324893951416016, + "learning_rate": 1.530660412191158e-05, + "loss": 1.9712, + "mean_token_accuracy": 0.5535769462585449, + "num_tokens": 3412719583.0, + "step": 6675 + }, + { + "epoch": 1.8053001622498648, + "grad_norm": 1.5916122198104858, + "learning_rate": 1.530521977024076e-05, + "loss": 1.9162, + "mean_token_accuracy": 0.5517085790634155, + "num_tokens": 3413182067.0, + "step": 6676 + }, + { + "epoch": 1.8055705786911844, + "grad_norm": 1.4638046026229858, + "learning_rate": 1.5303835286473143e-05, + "loss": 2.0996, + "mean_token_accuracy": 0.52280193567276, + "num_tokens": 3413703599.0, + "step": 6677 + }, + { + "epoch": 1.805840995132504, + "grad_norm": 1.2951264381408691, + "learning_rate": 1.5302450670651192e-05, + "loss": 1.9677, + "mean_token_accuracy": 0.5523781776428223, + "num_tokens": 3414227609.0, + "step": 6678 + }, + { + "epoch": 1.8061114115738237, + "grad_norm": 1.296879768371582, + "learning_rate": 1.5301065922817408e-05, + "loss": 1.8984, + "mean_token_accuracy": 0.5670668482780457, + "num_tokens": 3414751848.0, + "step": 6679 + }, + { + "epoch": 1.8063818280151434, + "grad_norm": 1.212553858757019, + "learning_rate": 1.529968104301427e-05, + "loss": 1.9575, + "mean_token_accuracy": 0.5663514137268066, + "num_tokens": 3415275972.0, + "step": 6680 + }, + { + "epoch": 1.806652244456463, + "grad_norm": 0.609880805015564, + "learning_rate": 1.5298296031284276e-05, + "loss": 1.1015, + "mean_token_accuracy": 0.7005093097686768, + "num_tokens": 3415766018.0, + "step": 6681 + }, + { + "epoch": 1.8069226608977826, + "grad_norm": 1.5058015584945679, + "learning_rate": 1.529691088766992e-05, + "loss": 1.9239, + "mean_token_accuracy": 0.5546180009841919, + "num_tokens": 3416290215.0, + "step": 6682 + }, + { + "epoch": 1.807193077339102, + "grad_norm": 1.3020517826080322, + "learning_rate": 1.52955256122137e-05, + "loss": 1.9244, + "mean_token_accuracy": 0.5597575306892395, + "num_tokens": 3416771053.0, + "step": 6683 + }, + { + "epoch": 1.8074634937804217, + "grad_norm": 1.1580289602279663, + "learning_rate": 1.529414020495812e-05, + "loss": 1.9642, + "mean_token_accuracy": 0.5453406572341919, + "num_tokens": 3417295229.0, + "step": 6684 + }, + { + "epoch": 1.8077339102217413, + "grad_norm": 1.1403549909591675, + "learning_rate": 1.5292754665945693e-05, + "loss": 1.911, + "mean_token_accuracy": 0.5512226819992065, + "num_tokens": 3417819466.0, + "step": 6685 + }, + { + "epoch": 1.808004326663061, + "grad_norm": 1.163050651550293, + "learning_rate": 1.529136899521893e-05, + "loss": 1.9855, + "mean_token_accuracy": 0.5579782724380493, + "num_tokens": 3418343610.0, + "step": 6686 + }, + { + "epoch": 1.8082747431043806, + "grad_norm": 1.2660486698150635, + "learning_rate": 1.528998319282034e-05, + "loss": 1.9795, + "mean_token_accuracy": 0.5722583532333374, + "num_tokens": 3418822658.0, + "step": 6687 + }, + { + "epoch": 1.8085451595457003, + "grad_norm": 1.2098371982574463, + "learning_rate": 1.5288597258792455e-05, + "loss": 2.0318, + "mean_token_accuracy": 0.5453784465789795, + "num_tokens": 3419346867.0, + "step": 6688 + }, + { + "epoch": 1.80881557598702, + "grad_norm": 1.2950048446655273, + "learning_rate": 1.528721119317779e-05, + "loss": 1.9295, + "mean_token_accuracy": 0.5411256551742554, + "num_tokens": 3419871138.0, + "step": 6689 + }, + { + "epoch": 1.8090859924283396, + "grad_norm": 1.7488713264465332, + "learning_rate": 1.5285824996018884e-05, + "loss": 1.9416, + "mean_token_accuracy": 0.5747355818748474, + "num_tokens": 3420395406.0, + "step": 6690 + }, + { + "epoch": 1.8093564088696592, + "grad_norm": 1.303682804107666, + "learning_rate": 1.5284438667358258e-05, + "loss": 2.0493, + "mean_token_accuracy": 0.5486354827880859, + "num_tokens": 3420919680.0, + "step": 6691 + }, + { + "epoch": 1.8096268253109788, + "grad_norm": 1.4503130912780762, + "learning_rate": 1.5283052207238457e-05, + "loss": 2.0275, + "mean_token_accuracy": 0.544325053691864, + "num_tokens": 3421408456.0, + "step": 6692 + }, + { + "epoch": 1.8098972417522985, + "grad_norm": 1.127783179283142, + "learning_rate": 1.5281665615702016e-05, + "loss": 1.9432, + "mean_token_accuracy": 0.5715624690055847, + "num_tokens": 3421932684.0, + "step": 6693 + }, + { + "epoch": 1.8101676581936181, + "grad_norm": 1.415488839149475, + "learning_rate": 1.5280278892791486e-05, + "loss": 1.9421, + "mean_token_accuracy": 0.5723055601119995, + "num_tokens": 3422456839.0, + "step": 6694 + }, + { + "epoch": 1.8104380746349378, + "grad_norm": 1.3778533935546875, + "learning_rate": 1.5278892038549406e-05, + "loss": 1.9682, + "mean_token_accuracy": 0.5477184057235718, + "num_tokens": 3422981047.0, + "step": 6695 + }, + { + "epoch": 1.8107084910762574, + "grad_norm": 1.205043911933899, + "learning_rate": 1.527750505301834e-05, + "loss": 1.9877, + "mean_token_accuracy": 0.5609285831451416, + "num_tokens": 3423505327.0, + "step": 6696 + }, + { + "epoch": 1.810978907517577, + "grad_norm": 1.302608609199524, + "learning_rate": 1.527611793624084e-05, + "loss": 1.9182, + "mean_token_accuracy": 0.5436022877693176, + "num_tokens": 3424029268.0, + "step": 6697 + }, + { + "epoch": 1.8112493239588967, + "grad_norm": 1.3092342615127563, + "learning_rate": 1.5274730688259464e-05, + "loss": 1.9293, + "mean_token_accuracy": 0.5505125522613525, + "num_tokens": 3424553444.0, + "step": 6698 + }, + { + "epoch": 1.8115197404002163, + "grad_norm": 1.3854893445968628, + "learning_rate": 1.527334330911678e-05, + "loss": 1.9407, + "mean_token_accuracy": 0.5570471286773682, + "num_tokens": 3425077633.0, + "step": 6699 + }, + { + "epoch": 1.811790156841536, + "grad_norm": 1.2174495458602905, + "learning_rate": 1.527195579885536e-05, + "loss": 1.8627, + "mean_token_accuracy": 0.5761343240737915, + "num_tokens": 3425544375.0, + "step": 6700 + }, + { + "epoch": 1.8120605732828556, + "grad_norm": 0.5783001184463501, + "learning_rate": 1.5270568157517765e-05, + "loss": 1.0723, + "mean_token_accuracy": 0.6957076191902161, + "num_tokens": 3426045076.0, + "step": 6701 + }, + { + "epoch": 1.8123309897241753, + "grad_norm": 1.7687650918960571, + "learning_rate": 1.5269180385146584e-05, + "loss": 1.9869, + "mean_token_accuracy": 0.5503151416778564, + "num_tokens": 3426569349.0, + "step": 6702 + }, + { + "epoch": 1.812601406165495, + "grad_norm": 1.6120038032531738, + "learning_rate": 1.5267792481784393e-05, + "loss": 2.0043, + "mean_token_accuracy": 0.5421730279922485, + "num_tokens": 3427093624.0, + "step": 6703 + }, + { + "epoch": 1.8128718226068146, + "grad_norm": 1.2232329845428467, + "learning_rate": 1.5266404447473782e-05, + "loss": 1.9801, + "mean_token_accuracy": 0.5566583871841431, + "num_tokens": 3427534385.0, + "step": 6704 + }, + { + "epoch": 1.8131422390481342, + "grad_norm": 1.2456780672073364, + "learning_rate": 1.526501628225733e-05, + "loss": 1.8989, + "mean_token_accuracy": 0.5450969338417053, + "num_tokens": 3428058552.0, + "step": 6705 + }, + { + "epoch": 1.8134126554894539, + "grad_norm": 1.7907580137252808, + "learning_rate": 1.5263627986177643e-05, + "loss": 2.06, + "mean_token_accuracy": 0.5099383592605591, + "num_tokens": 3428582705.0, + "step": 6706 + }, + { + "epoch": 1.8136830719307735, + "grad_norm": 1.2665367126464844, + "learning_rate": 1.5262239559277306e-05, + "loss": 1.8594, + "mean_token_accuracy": 0.5586192607879639, + "num_tokens": 3429099192.0, + "step": 6707 + }, + { + "epoch": 1.8139534883720931, + "grad_norm": 1.179809808731079, + "learning_rate": 1.5260851001598925e-05, + "loss": 1.899, + "mean_token_accuracy": 0.5544856786727905, + "num_tokens": 3429623445.0, + "step": 6708 + }, + { + "epoch": 1.8142239048134128, + "grad_norm": 1.444108247756958, + "learning_rate": 1.5259462313185104e-05, + "loss": 1.9654, + "mean_token_accuracy": 0.518900990486145, + "num_tokens": 3430147551.0, + "step": 6709 + }, + { + "epoch": 1.8144943212547324, + "grad_norm": 2.7137844562530518, + "learning_rate": 1.5258073494078455e-05, + "loss": 1.7839, + "mean_token_accuracy": 0.6259006261825562, + "num_tokens": 3430608039.0, + "step": 6710 + }, + { + "epoch": 1.814764737696052, + "grad_norm": 1.9628978967666626, + "learning_rate": 1.5256684544321589e-05, + "loss": 1.7475, + "mean_token_accuracy": 0.627405047416687, + "num_tokens": 3431070497.0, + "step": 6711 + }, + { + "epoch": 1.8150351541373717, + "grad_norm": 1.584155797958374, + "learning_rate": 1.5255295463957122e-05, + "loss": 1.9848, + "mean_token_accuracy": 0.546619176864624, + "num_tokens": 3431594591.0, + "step": 6712 + }, + { + "epoch": 1.8153055705786911, + "grad_norm": 1.3065986633300781, + "learning_rate": 1.5253906253027679e-05, + "loss": 1.943, + "mean_token_accuracy": 0.5572855472564697, + "num_tokens": 3432102548.0, + "step": 6713 + }, + { + "epoch": 1.8155759870200108, + "grad_norm": 1.7041833400726318, + "learning_rate": 1.525251691157588e-05, + "loss": 1.867, + "mean_token_accuracy": 0.5862419605255127, + "num_tokens": 3432573386.0, + "step": 6714 + }, + { + "epoch": 1.8158464034613304, + "grad_norm": 1.689671516418457, + "learning_rate": 1.5251127439644355e-05, + "loss": 1.9338, + "mean_token_accuracy": 0.5542185306549072, + "num_tokens": 3433097646.0, + "step": 6715 + }, + { + "epoch": 1.81611681990265, + "grad_norm": 1.4210108518600464, + "learning_rate": 1.524973783727574e-05, + "loss": 1.9453, + "mean_token_accuracy": 0.5627436637878418, + "num_tokens": 3433621848.0, + "step": 6716 + }, + { + "epoch": 1.8163872363439697, + "grad_norm": 1.5964252948760986, + "learning_rate": 1.5248348104512669e-05, + "loss": 1.8656, + "mean_token_accuracy": 0.5826646089553833, + "num_tokens": 3434140369.0, + "step": 6717 + }, + { + "epoch": 1.8166576527852893, + "grad_norm": 1.5187100172042847, + "learning_rate": 1.5246958241397786e-05, + "loss": 1.949, + "mean_token_accuracy": 0.569949209690094, + "num_tokens": 3434641158.0, + "step": 6718 + }, + { + "epoch": 1.816928069226609, + "grad_norm": 1.276477336883545, + "learning_rate": 1.5245568247973736e-05, + "loss": 1.897, + "mean_token_accuracy": 0.5607820153236389, + "num_tokens": 3435165239.0, + "step": 6719 + }, + { + "epoch": 1.8171984856679286, + "grad_norm": 1.8929983377456665, + "learning_rate": 1.5244178124283166e-05, + "loss": 1.8967, + "mean_token_accuracy": 0.5615382194519043, + "num_tokens": 3435642717.0, + "step": 6720 + }, + { + "epoch": 1.8174689021092483, + "grad_norm": 0.7473350167274475, + "learning_rate": 1.5242787870368727e-05, + "loss": 1.0505, + "mean_token_accuracy": 0.7175149321556091, + "num_tokens": 3436143448.0, + "step": 6721 + }, + { + "epoch": 1.817739318550568, + "grad_norm": 1.6723670959472656, + "learning_rate": 1.5241397486273083e-05, + "loss": 1.9872, + "mean_token_accuracy": 0.5660688281059265, + "num_tokens": 3436667672.0, + "step": 6722 + }, + { + "epoch": 1.8180097349918876, + "grad_norm": 1.388738751411438, + "learning_rate": 1.5240006972038887e-05, + "loss": 1.8379, + "mean_token_accuracy": 0.5746105909347534, + "num_tokens": 3437191870.0, + "step": 6723 + }, + { + "epoch": 1.818280151433207, + "grad_norm": 1.476675271987915, + "learning_rate": 1.5238616327708811e-05, + "loss": 1.989, + "mean_token_accuracy": 0.562404453754425, + "num_tokens": 3437655815.0, + "step": 6724 + }, + { + "epoch": 1.8185505678745266, + "grad_norm": 1.4317001104354858, + "learning_rate": 1.5237225553325518e-05, + "loss": 1.9837, + "mean_token_accuracy": 0.539795994758606, + "num_tokens": 3438179883.0, + "step": 6725 + }, + { + "epoch": 1.8188209843158463, + "grad_norm": 1.339922308921814, + "learning_rate": 1.5235834648931689e-05, + "loss": 2.0134, + "mean_token_accuracy": 0.5406326055526733, + "num_tokens": 3438704063.0, + "step": 6726 + }, + { + "epoch": 1.819091400757166, + "grad_norm": 1.761108160018921, + "learning_rate": 1.5234443614569993e-05, + "loss": 1.8959, + "mean_token_accuracy": 0.5692716240882874, + "num_tokens": 3439228260.0, + "step": 6727 + }, + { + "epoch": 1.8193618171984856, + "grad_norm": 1.411346197128296, + "learning_rate": 1.5233052450283115e-05, + "loss": 1.8875, + "mean_token_accuracy": 0.5848532915115356, + "num_tokens": 3439752428.0, + "step": 6728 + }, + { + "epoch": 1.8196322336398052, + "grad_norm": 1.4136998653411865, + "learning_rate": 1.5231661156113739e-05, + "loss": 1.9977, + "mean_token_accuracy": 0.5448623895645142, + "num_tokens": 3440276559.0, + "step": 6729 + }, + { + "epoch": 1.8199026500811248, + "grad_norm": 1.6489636898040771, + "learning_rate": 1.5230269732104553e-05, + "loss": 1.9245, + "mean_token_accuracy": 0.5618475675582886, + "num_tokens": 3440800759.0, + "step": 6730 + }, + { + "epoch": 1.8201730665224445, + "grad_norm": 1.112708330154419, + "learning_rate": 1.5228878178298252e-05, + "loss": 2.0041, + "mean_token_accuracy": 0.5343603491783142, + "num_tokens": 3441301227.0, + "step": 6731 + }, + { + "epoch": 1.8204434829637641, + "grad_norm": 1.214543104171753, + "learning_rate": 1.5227486494737533e-05, + "loss": 1.9222, + "mean_token_accuracy": 0.5641568303108215, + "num_tokens": 3441825214.0, + "step": 6732 + }, + { + "epoch": 1.8207138994050838, + "grad_norm": 1.2093104124069214, + "learning_rate": 1.5226094681465098e-05, + "loss": 1.9899, + "mean_token_accuracy": 0.5626780986785889, + "num_tokens": 3442349416.0, + "step": 6733 + }, + { + "epoch": 1.8209843158464034, + "grad_norm": 1.261415958404541, + "learning_rate": 1.5224702738523646e-05, + "loss": 2.0736, + "mean_token_accuracy": 0.5303828120231628, + "num_tokens": 3442873601.0, + "step": 6734 + }, + { + "epoch": 1.821254732287723, + "grad_norm": 1.1653746366500854, + "learning_rate": 1.522331066595589e-05, + "loss": 1.9727, + "mean_token_accuracy": 0.5663626194000244, + "num_tokens": 3443343952.0, + "step": 6735 + }, + { + "epoch": 1.8215251487290427, + "grad_norm": 1.393947958946228, + "learning_rate": 1.5221918463804549e-05, + "loss": 1.9151, + "mean_token_accuracy": 0.5717036724090576, + "num_tokens": 3443868227.0, + "step": 6736 + }, + { + "epoch": 1.8217955651703623, + "grad_norm": 1.1327791213989258, + "learning_rate": 1.5220526132112326e-05, + "loss": 1.8964, + "mean_token_accuracy": 0.5694760680198669, + "num_tokens": 3444339880.0, + "step": 6737 + }, + { + "epoch": 1.822065981611682, + "grad_norm": 1.1832226514816284, + "learning_rate": 1.5219133670921953e-05, + "loss": 1.9276, + "mean_token_accuracy": 0.5509992241859436, + "num_tokens": 3444824046.0, + "step": 6738 + }, + { + "epoch": 1.8223363980530016, + "grad_norm": 1.6526366472244263, + "learning_rate": 1.5217741080276152e-05, + "loss": 1.7974, + "mean_token_accuracy": 0.595531702041626, + "num_tokens": 3445348223.0, + "step": 6739 + }, + { + "epoch": 1.8226068144943213, + "grad_norm": 1.8865468502044678, + "learning_rate": 1.5216348360217647e-05, + "loss": 1.9815, + "mean_token_accuracy": 0.5599432587623596, + "num_tokens": 3445835974.0, + "step": 6740 + }, + { + "epoch": 1.822877230935641, + "grad_norm": 0.6372512578964233, + "learning_rate": 1.521495551078918e-05, + "loss": 1.1794, + "mean_token_accuracy": 0.6869944334030151, + "num_tokens": 3446354008.0, + "step": 6741 + }, + { + "epoch": 1.8231476473769606, + "grad_norm": 1.4991275072097778, + "learning_rate": 1.5213562532033482e-05, + "loss": 1.9231, + "mean_token_accuracy": 0.5709801912307739, + "num_tokens": 3446832771.0, + "step": 6742 + }, + { + "epoch": 1.8234180638182802, + "grad_norm": 1.3144673109054565, + "learning_rate": 1.5212169423993292e-05, + "loss": 1.9111, + "mean_token_accuracy": 0.555796205997467, + "num_tokens": 3447314699.0, + "step": 6743 + }, + { + "epoch": 1.8236884802595998, + "grad_norm": 1.1532515287399292, + "learning_rate": 1.5210776186711355e-05, + "loss": 1.9312, + "mean_token_accuracy": 0.546796441078186, + "num_tokens": 3447838744.0, + "step": 6744 + }, + { + "epoch": 1.8239588967009195, + "grad_norm": 1.5026193857192993, + "learning_rate": 1.5209382820230426e-05, + "loss": 2.0184, + "mean_token_accuracy": 0.5490090847015381, + "num_tokens": 3448363017.0, + "step": 6745 + }, + { + "epoch": 1.8242293131422391, + "grad_norm": 1.282352089881897, + "learning_rate": 1.5207989324593251e-05, + "loss": 1.8717, + "mean_token_accuracy": 0.5721824765205383, + "num_tokens": 3448887294.0, + "step": 6746 + }, + { + "epoch": 1.8244997295835588, + "grad_norm": 1.269330382347107, + "learning_rate": 1.5206595699842588e-05, + "loss": 1.9173, + "mean_token_accuracy": 0.5647233724594116, + "num_tokens": 3449411572.0, + "step": 6747 + }, + { + "epoch": 1.8247701460248784, + "grad_norm": 1.402055025100708, + "learning_rate": 1.5205201946021198e-05, + "loss": 1.8339, + "mean_token_accuracy": 0.584703266620636, + "num_tokens": 3449935727.0, + "step": 6748 + }, + { + "epoch": 1.825040562466198, + "grad_norm": 1.3850364685058594, + "learning_rate": 1.520380806317185e-05, + "loss": 1.732, + "mean_token_accuracy": 0.583457887172699, + "num_tokens": 3450459992.0, + "step": 6749 + }, + { + "epoch": 1.8253109789075177, + "grad_norm": 1.467666506767273, + "learning_rate": 1.5202414051337304e-05, + "loss": 1.9344, + "mean_token_accuracy": 0.5636218786239624, + "num_tokens": 3450937126.0, + "step": 6750 + }, + { + "epoch": 1.8255813953488373, + "grad_norm": 1.34380304813385, + "learning_rate": 1.5201019910560335e-05, + "loss": 1.9149, + "mean_token_accuracy": 0.5495551824569702, + "num_tokens": 3451461302.0, + "step": 6751 + }, + { + "epoch": 1.825851811790157, + "grad_norm": 1.325922966003418, + "learning_rate": 1.5199625640883725e-05, + "loss": 1.8108, + "mean_token_accuracy": 0.5666273832321167, + "num_tokens": 3451959022.0, + "step": 6752 + }, + { + "epoch": 1.8261222282314766, + "grad_norm": 1.2233668565750122, + "learning_rate": 1.5198231242350246e-05, + "loss": 1.794, + "mean_token_accuracy": 0.570797860622406, + "num_tokens": 3452483221.0, + "step": 6753 + }, + { + "epoch": 1.826392644672796, + "grad_norm": 1.2871675491333008, + "learning_rate": 1.5196836715002688e-05, + "loss": 1.7835, + "mean_token_accuracy": 0.5921470522880554, + "num_tokens": 3452947689.0, + "step": 6754 + }, + { + "epoch": 1.8266630611141157, + "grad_norm": 1.5360640287399292, + "learning_rate": 1.5195442058883838e-05, + "loss": 2.0143, + "mean_token_accuracy": 0.5375738739967346, + "num_tokens": 3453471744.0, + "step": 6755 + }, + { + "epoch": 1.8269334775554353, + "grad_norm": 1.6090320348739624, + "learning_rate": 1.5194047274036486e-05, + "loss": 1.9877, + "mean_token_accuracy": 0.5523556470870972, + "num_tokens": 3453995955.0, + "step": 6756 + }, + { + "epoch": 1.827203893996755, + "grad_norm": 1.4578455686569214, + "learning_rate": 1.5192652360503431e-05, + "loss": 1.8589, + "mean_token_accuracy": 0.5632544755935669, + "num_tokens": 3454470216.0, + "step": 6757 + }, + { + "epoch": 1.8274743104380746, + "grad_norm": 1.3659878969192505, + "learning_rate": 1.5191257318327472e-05, + "loss": 1.9339, + "mean_token_accuracy": 0.5635255575180054, + "num_tokens": 3454994262.0, + "step": 6758 + }, + { + "epoch": 1.8277447268793943, + "grad_norm": 1.5487606525421143, + "learning_rate": 1.5189862147551413e-05, + "loss": 2.1332, + "mean_token_accuracy": 0.5232113599777222, + "num_tokens": 3455517743.0, + "step": 6759 + }, + { + "epoch": 1.828015143320714, + "grad_norm": 1.4532936811447144, + "learning_rate": 1.518846684821806e-05, + "loss": 1.9448, + "mean_token_accuracy": 0.5522847175598145, + "num_tokens": 3456041961.0, + "step": 6760 + }, + { + "epoch": 1.8282855597620336, + "grad_norm": 0.6319822072982788, + "learning_rate": 1.5187071420370227e-05, + "loss": 1.1841, + "mean_token_accuracy": 0.6969537734985352, + "num_tokens": 3456539253.0, + "step": 6761 + }, + { + "epoch": 1.8285559762033532, + "grad_norm": 1.942525863647461, + "learning_rate": 1.5185675864050733e-05, + "loss": 2.0004, + "mean_token_accuracy": 0.5597649216651917, + "num_tokens": 3457063540.0, + "step": 6762 + }, + { + "epoch": 1.8288263926446728, + "grad_norm": 1.4877132177352905, + "learning_rate": 1.518428017930239e-05, + "loss": 1.8653, + "mean_token_accuracy": 0.5526032447814941, + "num_tokens": 3457587776.0, + "step": 6763 + }, + { + "epoch": 1.8290968090859925, + "grad_norm": 1.1223785877227783, + "learning_rate": 1.5182884366168028e-05, + "loss": 1.9392, + "mean_token_accuracy": 0.5505954027175903, + "num_tokens": 3458111912.0, + "step": 6764 + }, + { + "epoch": 1.829367225527312, + "grad_norm": 1.2498725652694702, + "learning_rate": 1.5181488424690472e-05, + "loss": 1.9112, + "mean_token_accuracy": 0.5638540387153625, + "num_tokens": 3458636127.0, + "step": 6765 + }, + { + "epoch": 1.8296376419686315, + "grad_norm": 1.246161699295044, + "learning_rate": 1.5180092354912552e-05, + "loss": 1.8608, + "mean_token_accuracy": 0.5707422494888306, + "num_tokens": 3459160290.0, + "step": 6766 + }, + { + "epoch": 1.8299080584099512, + "grad_norm": 1.1788710355758667, + "learning_rate": 1.5178696156877109e-05, + "loss": 1.9969, + "mean_token_accuracy": 0.5417159795761108, + "num_tokens": 3459684273.0, + "step": 6767 + }, + { + "epoch": 1.8301784748512708, + "grad_norm": 1.4587441682815552, + "learning_rate": 1.5177299830626976e-05, + "loss": 1.8931, + "mean_token_accuracy": 0.5914044976234436, + "num_tokens": 3460145166.0, + "step": 6768 + }, + { + "epoch": 1.8304488912925905, + "grad_norm": 1.4824355840682983, + "learning_rate": 1.5175903376205004e-05, + "loss": 1.971, + "mean_token_accuracy": 0.5354406833648682, + "num_tokens": 3460669447.0, + "step": 6769 + }, + { + "epoch": 1.8307193077339101, + "grad_norm": 1.291306495666504, + "learning_rate": 1.5174506793654034e-05, + "loss": 1.8462, + "mean_token_accuracy": 0.5714425444602966, + "num_tokens": 3461193575.0, + "step": 6770 + }, + { + "epoch": 1.8309897241752298, + "grad_norm": 1.2622472047805786, + "learning_rate": 1.517311008301692e-05, + "loss": 1.9219, + "mean_token_accuracy": 0.5610756874084473, + "num_tokens": 3461717826.0, + "step": 6771 + }, + { + "epoch": 1.8312601406165494, + "grad_norm": 1.2430461645126343, + "learning_rate": 1.5171713244336513e-05, + "loss": 1.8174, + "mean_token_accuracy": 0.5740689635276794, + "num_tokens": 3462242060.0, + "step": 6772 + }, + { + "epoch": 1.831530557057869, + "grad_norm": 1.0024597644805908, + "learning_rate": 1.5170316277655677e-05, + "loss": 1.8829, + "mean_token_accuracy": 0.5814820528030396, + "num_tokens": 3462715374.0, + "step": 6773 + }, + { + "epoch": 1.8318009734991887, + "grad_norm": 1.3385311365127563, + "learning_rate": 1.5168919183017272e-05, + "loss": 1.9538, + "mean_token_accuracy": 0.5684288740158081, + "num_tokens": 3463194965.0, + "step": 6774 + }, + { + "epoch": 1.8320713899405083, + "grad_norm": 1.4106179475784302, + "learning_rate": 1.5167521960464173e-05, + "loss": 2.1386, + "mean_token_accuracy": 0.5138658285140991, + "num_tokens": 3463719234.0, + "step": 6775 + }, + { + "epoch": 1.832341806381828, + "grad_norm": 1.354740858078003, + "learning_rate": 1.5166124610039238e-05, + "loss": 1.8899, + "mean_token_accuracy": 0.5661326050758362, + "num_tokens": 3464243352.0, + "step": 6776 + }, + { + "epoch": 1.8326122228231476, + "grad_norm": 1.2168757915496826, + "learning_rate": 1.5164727131785348e-05, + "loss": 1.9191, + "mean_token_accuracy": 0.5550516247749329, + "num_tokens": 3464767582.0, + "step": 6777 + }, + { + "epoch": 1.8328826392644673, + "grad_norm": 1.0254358053207397, + "learning_rate": 1.5163329525745387e-05, + "loss": 1.948, + "mean_token_accuracy": 0.5632820129394531, + "num_tokens": 3465291785.0, + "step": 6778 + }, + { + "epoch": 1.833153055705787, + "grad_norm": 1.1694117784500122, + "learning_rate": 1.516193179196223e-05, + "loss": 1.9774, + "mean_token_accuracy": 0.5620251893997192, + "num_tokens": 3465758012.0, + "step": 6779 + }, + { + "epoch": 1.8334234721471065, + "grad_norm": 1.3571192026138306, + "learning_rate": 1.5160533930478764e-05, + "loss": 2.0191, + "mean_token_accuracy": 0.5537238121032715, + "num_tokens": 3466230604.0, + "step": 6780 + }, + { + "epoch": 1.8336938885884262, + "grad_norm": 0.5444556474685669, + "learning_rate": 1.5159135941337882e-05, + "loss": 1.186, + "mean_token_accuracy": 0.680575966835022, + "num_tokens": 3466754827.0, + "step": 6781 + }, + { + "epoch": 1.8339643050297458, + "grad_norm": 2.0626070499420166, + "learning_rate": 1.5157737824582481e-05, + "loss": 1.9902, + "mean_token_accuracy": 0.5495246648788452, + "num_tokens": 3467279106.0, + "step": 6782 + }, + { + "epoch": 1.8342347214710655, + "grad_norm": 1.301961898803711, + "learning_rate": 1.5156339580255454e-05, + "loss": 1.8922, + "mean_token_accuracy": 0.562525749206543, + "num_tokens": 3467803352.0, + "step": 6783 + }, + { + "epoch": 1.8345051379123851, + "grad_norm": 1.4648065567016602, + "learning_rate": 1.5154941208399707e-05, + "loss": 2.0341, + "mean_token_accuracy": 0.549904465675354, + "num_tokens": 3468327612.0, + "step": 6784 + }, + { + "epoch": 1.8347755543537048, + "grad_norm": 1.4920564889907837, + "learning_rate": 1.5153542709058144e-05, + "loss": 1.988, + "mean_token_accuracy": 0.5578081607818604, + "num_tokens": 3468829013.0, + "step": 6785 + }, + { + "epoch": 1.8350459707950244, + "grad_norm": 1.1647875308990479, + "learning_rate": 1.5152144082273676e-05, + "loss": 1.9358, + "mean_token_accuracy": 0.5569813847541809, + "num_tokens": 3469353154.0, + "step": 6786 + }, + { + "epoch": 1.835316387236344, + "grad_norm": 1.6049230098724365, + "learning_rate": 1.5150745328089219e-05, + "loss": 1.923, + "mean_token_accuracy": 0.5491018295288086, + "num_tokens": 3469877359.0, + "step": 6787 + }, + { + "epoch": 1.8355868036776637, + "grad_norm": 1.5739208459854126, + "learning_rate": 1.5149346446547686e-05, + "loss": 1.9731, + "mean_token_accuracy": 0.582390308380127, + "num_tokens": 3470278188.0, + "step": 6788 + }, + { + "epoch": 1.8358572201189833, + "grad_norm": 1.2297632694244385, + "learning_rate": 1.5147947437692003e-05, + "loss": 1.8806, + "mean_token_accuracy": 0.543688952922821, + "num_tokens": 3470802466.0, + "step": 6789 + }, + { + "epoch": 1.836127636560303, + "grad_norm": 1.7994650602340698, + "learning_rate": 1.5146548301565094e-05, + "loss": 1.9312, + "mean_token_accuracy": 0.555479884147644, + "num_tokens": 3471326740.0, + "step": 6790 + }, + { + "epoch": 1.8363980530016226, + "grad_norm": 1.6879903078079224, + "learning_rate": 1.5145149038209887e-05, + "loss": 2.0704, + "mean_token_accuracy": 0.5265139937400818, + "num_tokens": 3471850985.0, + "step": 6791 + }, + { + "epoch": 1.8366684694429423, + "grad_norm": 1.125454306602478, + "learning_rate": 1.5143749647669321e-05, + "loss": 1.8313, + "mean_token_accuracy": 0.580880343914032, + "num_tokens": 3472375263.0, + "step": 6792 + }, + { + "epoch": 1.836938885884262, + "grad_norm": 1.4173060655593872, + "learning_rate": 1.5142350129986327e-05, + "loss": 1.9815, + "mean_token_accuracy": 0.563782811164856, + "num_tokens": 3472899383.0, + "step": 6793 + }, + { + "epoch": 1.8372093023255816, + "grad_norm": 1.5553560256958008, + "learning_rate": 1.5140950485203851e-05, + "loss": 1.926, + "mean_token_accuracy": 0.5646538734436035, + "num_tokens": 3473399779.0, + "step": 6794 + }, + { + "epoch": 1.8374797187669012, + "grad_norm": 1.2761098146438599, + "learning_rate": 1.5139550713364834e-05, + "loss": 1.9588, + "mean_token_accuracy": 0.546947717666626, + "num_tokens": 3473924032.0, + "step": 6795 + }, + { + "epoch": 1.8377501352082206, + "grad_norm": 1.2765601873397827, + "learning_rate": 1.5138150814512232e-05, + "loss": 1.9969, + "mean_token_accuracy": 0.5371780395507812, + "num_tokens": 3474448159.0, + "step": 6796 + }, + { + "epoch": 1.8380205516495403, + "grad_norm": 1.4137033224105835, + "learning_rate": 1.513675078868899e-05, + "loss": 2.0127, + "mean_token_accuracy": 0.5669501423835754, + "num_tokens": 3474911850.0, + "step": 6797 + }, + { + "epoch": 1.83829096809086, + "grad_norm": 1.628718376159668, + "learning_rate": 1.5135350635938068e-05, + "loss": 1.9809, + "mean_token_accuracy": 0.5609036684036255, + "num_tokens": 3475408898.0, + "step": 6798 + }, + { + "epoch": 1.8385613845321795, + "grad_norm": 1.5946623086929321, + "learning_rate": 1.513395035630243e-05, + "loss": 1.993, + "mean_token_accuracy": 0.5575897097587585, + "num_tokens": 3475933027.0, + "step": 6799 + }, + { + "epoch": 1.8388318009734992, + "grad_norm": 1.113085389137268, + "learning_rate": 1.5132549949825036e-05, + "loss": 1.8625, + "mean_token_accuracy": 0.5556750893592834, + "num_tokens": 3476457302.0, + "step": 6800 + }, + { + "epoch": 1.8391022174148188, + "grad_norm": 0.5438410043716431, + "learning_rate": 1.5131149416548856e-05, + "loss": 1.0722, + "mean_token_accuracy": 0.7087407112121582, + "num_tokens": 3476973679.0, + "step": 6801 + }, + { + "epoch": 1.8393726338561385, + "grad_norm": 2.520899534225464, + "learning_rate": 1.5129748756516862e-05, + "loss": 1.9069, + "mean_token_accuracy": 0.5504640340805054, + "num_tokens": 3477497871.0, + "step": 6802 + }, + { + "epoch": 1.8396430502974581, + "grad_norm": 2.266136407852173, + "learning_rate": 1.5128347969772032e-05, + "loss": 1.9154, + "mean_token_accuracy": 0.5678913593292236, + "num_tokens": 3478022122.0, + "step": 6803 + }, + { + "epoch": 1.8399134667387778, + "grad_norm": 1.4990389347076416, + "learning_rate": 1.5126947056357348e-05, + "loss": 2.0899, + "mean_token_accuracy": 0.5255005359649658, + "num_tokens": 3478546373.0, + "step": 6804 + }, + { + "epoch": 1.8401838831800974, + "grad_norm": 1.6814161539077759, + "learning_rate": 1.5125546016315791e-05, + "loss": 1.9685, + "mean_token_accuracy": 0.5489696264266968, + "num_tokens": 3479070541.0, + "step": 6805 + }, + { + "epoch": 1.8404542996214168, + "grad_norm": 1.8696869611740112, + "learning_rate": 1.512414484969035e-05, + "loss": 1.8463, + "mean_token_accuracy": 0.5630572438240051, + "num_tokens": 3479594634.0, + "step": 6806 + }, + { + "epoch": 1.8407247160627365, + "grad_norm": 1.6444861888885498, + "learning_rate": 1.5122743556524015e-05, + "loss": 2.0246, + "mean_token_accuracy": 0.5558795928955078, + "num_tokens": 3480066815.0, + "step": 6807 + }, + { + "epoch": 1.840995132504056, + "grad_norm": 1.596391201019287, + "learning_rate": 1.5121342136859784e-05, + "loss": 2.0847, + "mean_token_accuracy": 0.5430182218551636, + "num_tokens": 3480591087.0, + "step": 6808 + }, + { + "epoch": 1.8412655489453758, + "grad_norm": 1.6698318719863892, + "learning_rate": 1.5119940590740656e-05, + "loss": 2.0616, + "mean_token_accuracy": 0.5338460803031921, + "num_tokens": 3481115370.0, + "step": 6809 + }, + { + "epoch": 1.8415359653866954, + "grad_norm": 1.2357362508773804, + "learning_rate": 1.5118538918209636e-05, + "loss": 2.094, + "mean_token_accuracy": 0.5542418360710144, + "num_tokens": 3481603000.0, + "step": 6810 + }, + { + "epoch": 1.841806381828015, + "grad_norm": 1.5850368738174438, + "learning_rate": 1.5117137119309728e-05, + "loss": 1.9312, + "mean_token_accuracy": 0.5704238414764404, + "num_tokens": 3482127075.0, + "step": 6811 + }, + { + "epoch": 1.8420767982693347, + "grad_norm": 1.297728180885315, + "learning_rate": 1.511573519408395e-05, + "loss": 1.9226, + "mean_token_accuracy": 0.549263596534729, + "num_tokens": 3482651294.0, + "step": 6812 + }, + { + "epoch": 1.8423472147106543, + "grad_norm": 1.2043120861053467, + "learning_rate": 1.5114333142575305e-05, + "loss": 2.002, + "mean_token_accuracy": 0.5580328702926636, + "num_tokens": 3483129285.0, + "step": 6813 + }, + { + "epoch": 1.842617631151974, + "grad_norm": 1.3111085891723633, + "learning_rate": 1.5112930964826827e-05, + "loss": 1.944, + "mean_token_accuracy": 0.5589865446090698, + "num_tokens": 3483653462.0, + "step": 6814 + }, + { + "epoch": 1.8428880475932936, + "grad_norm": 1.1931722164154053, + "learning_rate": 1.5111528660881527e-05, + "loss": 1.8621, + "mean_token_accuracy": 0.5565172433853149, + "num_tokens": 3484177741.0, + "step": 6815 + }, + { + "epoch": 1.8431584640346133, + "grad_norm": 1.296600341796875, + "learning_rate": 1.5110126230782438e-05, + "loss": 1.9762, + "mean_token_accuracy": 0.5483323335647583, + "num_tokens": 3484702020.0, + "step": 6816 + }, + { + "epoch": 1.843428880475933, + "grad_norm": 1.9400169849395752, + "learning_rate": 1.510872367457259e-05, + "loss": 1.9986, + "mean_token_accuracy": 0.5512548685073853, + "num_tokens": 3485226071.0, + "step": 6817 + }, + { + "epoch": 1.8436992969172525, + "grad_norm": 1.533858060836792, + "learning_rate": 1.5107320992295017e-05, + "loss": 1.9589, + "mean_token_accuracy": 0.560856282711029, + "num_tokens": 3485750223.0, + "step": 6818 + }, + { + "epoch": 1.8439697133585722, + "grad_norm": 1.353176236152649, + "learning_rate": 1.5105918183992756e-05, + "loss": 1.8631, + "mean_token_accuracy": 0.5819352269172668, + "num_tokens": 3486252696.0, + "step": 6819 + }, + { + "epoch": 1.8442401297998918, + "grad_norm": 1.294075846672058, + "learning_rate": 1.5104515249708849e-05, + "loss": 1.999, + "mean_token_accuracy": 0.5225633382797241, + "num_tokens": 3486776877.0, + "step": 6820 + }, + { + "epoch": 1.8445105462412115, + "grad_norm": 0.6474079489707947, + "learning_rate": 1.5103112189486346e-05, + "loss": 1.0868, + "mean_token_accuracy": 0.7104814052581787, + "num_tokens": 3487226637.0, + "step": 6821 + }, + { + "epoch": 1.8447809626825311, + "grad_norm": 2.1941304206848145, + "learning_rate": 1.5101709003368292e-05, + "loss": 2.0034, + "mean_token_accuracy": 0.5500897169113159, + "num_tokens": 3487750840.0, + "step": 6822 + }, + { + "epoch": 1.8450513791238508, + "grad_norm": 1.860428810119629, + "learning_rate": 1.5100305691397745e-05, + "loss": 1.9661, + "mean_token_accuracy": 0.5610705614089966, + "num_tokens": 3488274917.0, + "step": 6823 + }, + { + "epoch": 1.8453217955651704, + "grad_norm": 1.382016658782959, + "learning_rate": 1.5098902253617764e-05, + "loss": 1.9989, + "mean_token_accuracy": 0.5395535230636597, + "num_tokens": 3488799123.0, + "step": 6824 + }, + { + "epoch": 1.84559221200649, + "grad_norm": 1.4118423461914062, + "learning_rate": 1.5097498690071403e-05, + "loss": 1.958, + "mean_token_accuracy": 0.5551233291625977, + "num_tokens": 3489212610.0, + "step": 6825 + }, + { + "epoch": 1.8458626284478097, + "grad_norm": 1.4455939531326294, + "learning_rate": 1.5096095000801736e-05, + "loss": 1.9328, + "mean_token_accuracy": 0.535651445388794, + "num_tokens": 3489736709.0, + "step": 6826 + }, + { + "epoch": 1.8461330448891293, + "grad_norm": 1.8250250816345215, + "learning_rate": 1.5094691185851826e-05, + "loss": 1.9949, + "mean_token_accuracy": 0.5355782508850098, + "num_tokens": 3490257072.0, + "step": 6827 + }, + { + "epoch": 1.846403461330449, + "grad_norm": 1.5408656597137451, + "learning_rate": 1.5093287245264752e-05, + "loss": 2.0102, + "mean_token_accuracy": 0.5355244874954224, + "num_tokens": 3490781252.0, + "step": 6828 + }, + { + "epoch": 1.8466738777717686, + "grad_norm": 1.5571651458740234, + "learning_rate": 1.5091883179083586e-05, + "loss": 1.8449, + "mean_token_accuracy": 0.5797861814498901, + "num_tokens": 3491305527.0, + "step": 6829 + }, + { + "epoch": 1.8469442942130883, + "grad_norm": 1.8684064149856567, + "learning_rate": 1.5090478987351408e-05, + "loss": 2.0485, + "mean_token_accuracy": 0.5482125282287598, + "num_tokens": 3491829705.0, + "step": 6830 + }, + { + "epoch": 1.847214710654408, + "grad_norm": 1.5372419357299805, + "learning_rate": 1.5089074670111305e-05, + "loss": 1.9007, + "mean_token_accuracy": 0.5778763294219971, + "num_tokens": 3492268253.0, + "step": 6831 + }, + { + "epoch": 1.8474851270957275, + "grad_norm": 1.633302927017212, + "learning_rate": 1.508767022740637e-05, + "loss": 1.9223, + "mean_token_accuracy": 0.5750507116317749, + "num_tokens": 3492751135.0, + "step": 6832 + }, + { + "epoch": 1.8477555435370472, + "grad_norm": 1.4793058633804321, + "learning_rate": 1.508626565927969e-05, + "loss": 2.0002, + "mean_token_accuracy": 0.5466455221176147, + "num_tokens": 3493220255.0, + "step": 6833 + }, + { + "epoch": 1.8480259599783668, + "grad_norm": 1.4894893169403076, + "learning_rate": 1.5084860965774363e-05, + "loss": 2.0426, + "mean_token_accuracy": 0.5431020259857178, + "num_tokens": 3493744507.0, + "step": 6834 + }, + { + "epoch": 1.8482963764196865, + "grad_norm": 1.6276715993881226, + "learning_rate": 1.5083456146933488e-05, + "loss": 1.9913, + "mean_token_accuracy": 0.5533828735351562, + "num_tokens": 3494268687.0, + "step": 6835 + }, + { + "epoch": 1.8485667928610061, + "grad_norm": 1.2383177280426025, + "learning_rate": 1.5082051202800167e-05, + "loss": 1.9551, + "mean_token_accuracy": 0.5538519620895386, + "num_tokens": 3494745094.0, + "step": 6836 + }, + { + "epoch": 1.8488372093023255, + "grad_norm": 1.1153258085250854, + "learning_rate": 1.5080646133417513e-05, + "loss": 1.9729, + "mean_token_accuracy": 0.5525996685028076, + "num_tokens": 3495269316.0, + "step": 6837 + }, + { + "epoch": 1.8491076257436452, + "grad_norm": 1.2980656623840332, + "learning_rate": 1.5079240938828636e-05, + "loss": 2.0119, + "mean_token_accuracy": 0.5416423082351685, + "num_tokens": 3495793566.0, + "step": 6838 + }, + { + "epoch": 1.8493780421849648, + "grad_norm": 1.393811583518982, + "learning_rate": 1.5077835619076651e-05, + "loss": 2.0284, + "mean_token_accuracy": 0.5406802892684937, + "num_tokens": 3496317714.0, + "step": 6839 + }, + { + "epoch": 1.8496484586262845, + "grad_norm": 1.2606767416000366, + "learning_rate": 1.5076430174204678e-05, + "loss": 1.8253, + "mean_token_accuracy": 0.553896427154541, + "num_tokens": 3496841922.0, + "step": 6840 + }, + { + "epoch": 1.849918875067604, + "grad_norm": 0.6537465453147888, + "learning_rate": 1.5075024604255837e-05, + "loss": 1.138, + "mean_token_accuracy": 0.6906136870384216, + "num_tokens": 3497366130.0, + "step": 6841 + }, + { + "epoch": 1.8501892915089238, + "grad_norm": 1.9165351390838623, + "learning_rate": 1.5073618909273257e-05, + "loss": 1.8808, + "mean_token_accuracy": 0.5594539642333984, + "num_tokens": 3497879013.0, + "step": 6842 + }, + { + "epoch": 1.8504597079502434, + "grad_norm": 1.942073106765747, + "learning_rate": 1.5072213089300071e-05, + "loss": 1.8684, + "mean_token_accuracy": 0.5789210796356201, + "num_tokens": 3498403140.0, + "step": 6843 + }, + { + "epoch": 1.850730124391563, + "grad_norm": 1.107758641242981, + "learning_rate": 1.5070807144379411e-05, + "loss": 1.8846, + "mean_token_accuracy": 0.5563961267471313, + "num_tokens": 3498927280.0, + "step": 6844 + }, + { + "epoch": 1.8510005408328827, + "grad_norm": 1.266229510307312, + "learning_rate": 1.5069401074554415e-05, + "loss": 1.8517, + "mean_token_accuracy": 0.5696811676025391, + "num_tokens": 3499392942.0, + "step": 6845 + }, + { + "epoch": 1.8512709572742023, + "grad_norm": 1.7074004411697388, + "learning_rate": 1.506799487986823e-05, + "loss": 1.9403, + "mean_token_accuracy": 0.5734933614730835, + "num_tokens": 3499877019.0, + "step": 6846 + }, + { + "epoch": 1.8515413737155217, + "grad_norm": 1.243564486503601, + "learning_rate": 1.5066588560363997e-05, + "loss": 1.8501, + "mean_token_accuracy": 0.5729786157608032, + "num_tokens": 3500401236.0, + "step": 6847 + }, + { + "epoch": 1.8518117901568414, + "grad_norm": 1.5076336860656738, + "learning_rate": 1.5065182116084866e-05, + "loss": 1.7472, + "mean_token_accuracy": 0.6005789041519165, + "num_tokens": 3500925520.0, + "step": 6848 + }, + { + "epoch": 1.852082206598161, + "grad_norm": 1.3217017650604248, + "learning_rate": 1.5063775547073996e-05, + "loss": 1.8657, + "mean_token_accuracy": 0.5707358121871948, + "num_tokens": 3501449627.0, + "step": 6849 + }, + { + "epoch": 1.8523526230394807, + "grad_norm": 1.3479732275009155, + "learning_rate": 1.5062368853374538e-05, + "loss": 1.987, + "mean_token_accuracy": 0.558001697063446, + "num_tokens": 3501973908.0, + "step": 6850 + }, + { + "epoch": 1.8526230394808003, + "grad_norm": 1.2817485332489014, + "learning_rate": 1.506096203502966e-05, + "loss": 1.9797, + "mean_token_accuracy": 0.538915753364563, + "num_tokens": 3502498174.0, + "step": 6851 + }, + { + "epoch": 1.85289345592212, + "grad_norm": 1.303996205329895, + "learning_rate": 1.5059555092082523e-05, + "loss": 1.8962, + "mean_token_accuracy": 0.5642435550689697, + "num_tokens": 3503022310.0, + "step": 6852 + }, + { + "epoch": 1.8531638723634396, + "grad_norm": 1.4426007270812988, + "learning_rate": 1.5058148024576295e-05, + "loss": 1.9996, + "mean_token_accuracy": 0.5640999674797058, + "num_tokens": 3503546284.0, + "step": 6853 + }, + { + "epoch": 1.8534342888047592, + "grad_norm": 1.6303112506866455, + "learning_rate": 1.5056740832554158e-05, + "loss": 1.9711, + "mean_token_accuracy": 0.5367711782455444, + "num_tokens": 3504070532.0, + "step": 6854 + }, + { + "epoch": 1.8537047052460789, + "grad_norm": 1.364090919494629, + "learning_rate": 1.5055333516059278e-05, + "loss": 1.8933, + "mean_token_accuracy": 0.5698724389076233, + "num_tokens": 3504574015.0, + "step": 6855 + }, + { + "epoch": 1.8539751216873985, + "grad_norm": 1.3294413089752197, + "learning_rate": 1.5053926075134837e-05, + "loss": 1.9225, + "mean_token_accuracy": 0.5662297010421753, + "num_tokens": 3505098251.0, + "step": 6856 + }, + { + "epoch": 1.8542455381287182, + "grad_norm": 1.4651052951812744, + "learning_rate": 1.5052518509824026e-05, + "loss": 1.9402, + "mean_token_accuracy": 0.5578876733779907, + "num_tokens": 3505622482.0, + "step": 6857 + }, + { + "epoch": 1.8545159545700378, + "grad_norm": 1.2865732908248901, + "learning_rate": 1.5051110820170027e-05, + "loss": 1.9541, + "mean_token_accuracy": 0.5681184530258179, + "num_tokens": 3506086357.0, + "step": 6858 + }, + { + "epoch": 1.8547863710113575, + "grad_norm": 1.3464843034744263, + "learning_rate": 1.5049703006216037e-05, + "loss": 1.9259, + "mean_token_accuracy": 0.5633269548416138, + "num_tokens": 3506610579.0, + "step": 6859 + }, + { + "epoch": 1.855056787452677, + "grad_norm": 1.579123616218567, + "learning_rate": 1.5048295068005247e-05, + "loss": 1.9901, + "mean_token_accuracy": 0.5515402555465698, + "num_tokens": 3507134738.0, + "step": 6860 + }, + { + "epoch": 1.8553272038939967, + "grad_norm": 0.5642276406288147, + "learning_rate": 1.504688700558086e-05, + "loss": 1.1614, + "mean_token_accuracy": 0.6809876561164856, + "num_tokens": 3507658983.0, + "step": 6861 + }, + { + "epoch": 1.8555976203353164, + "grad_norm": 1.7566382884979248, + "learning_rate": 1.504547881898608e-05, + "loss": 2.024, + "mean_token_accuracy": 0.5485221147537231, + "num_tokens": 3508183249.0, + "step": 6862 + }, + { + "epoch": 1.855868036776636, + "grad_norm": 1.5154489278793335, + "learning_rate": 1.5044070508264112e-05, + "loss": 1.9562, + "mean_token_accuracy": 0.5603542327880859, + "num_tokens": 3508704117.0, + "step": 6863 + }, + { + "epoch": 1.8561384532179557, + "grad_norm": 1.2344609498977661, + "learning_rate": 1.504266207345817e-05, + "loss": 1.8507, + "mean_token_accuracy": 0.5741130709648132, + "num_tokens": 3509170450.0, + "step": 6864 + }, + { + "epoch": 1.8564088696592753, + "grad_norm": 1.7094064950942993, + "learning_rate": 1.5041253514611463e-05, + "loss": 2.0022, + "mean_token_accuracy": 0.5403956174850464, + "num_tokens": 3509694626.0, + "step": 6865 + }, + { + "epoch": 1.856679286100595, + "grad_norm": 1.3139715194702148, + "learning_rate": 1.5039844831767215e-05, + "loss": 1.904, + "mean_token_accuracy": 0.5507228374481201, + "num_tokens": 3510218791.0, + "step": 6866 + }, + { + "epoch": 1.8569497025419146, + "grad_norm": 1.672628402709961, + "learning_rate": 1.5038436024968648e-05, + "loss": 2.0395, + "mean_token_accuracy": 0.5436317920684814, + "num_tokens": 3510742955.0, + "step": 6867 + }, + { + "epoch": 1.8572201189832342, + "grad_norm": 1.863006591796875, + "learning_rate": 1.5037027094258988e-05, + "loss": 1.924, + "mean_token_accuracy": 0.5763252973556519, + "num_tokens": 3511220489.0, + "step": 6868 + }, + { + "epoch": 1.857490535424554, + "grad_norm": 1.4609862565994263, + "learning_rate": 1.503561803968146e-05, + "loss": 2.0258, + "mean_token_accuracy": 0.5341219305992126, + "num_tokens": 3511744744.0, + "step": 6869 + }, + { + "epoch": 1.8577609518658735, + "grad_norm": 1.7058089971542358, + "learning_rate": 1.5034208861279309e-05, + "loss": 2.0981, + "mean_token_accuracy": 0.5257420539855957, + "num_tokens": 3512269020.0, + "step": 6870 + }, + { + "epoch": 1.8580313683071932, + "grad_norm": 2.0160622596740723, + "learning_rate": 1.5032799559095764e-05, + "loss": 1.9558, + "mean_token_accuracy": 0.5549141764640808, + "num_tokens": 3512793202.0, + "step": 6871 + }, + { + "epoch": 1.8583017847485128, + "grad_norm": 1.3458069562911987, + "learning_rate": 1.5031390133174065e-05, + "loss": 1.7697, + "mean_token_accuracy": 0.5641347765922546, + "num_tokens": 3513272878.0, + "step": 6872 + }, + { + "epoch": 1.8585722011898325, + "grad_norm": 1.5284518003463745, + "learning_rate": 1.5029980583557466e-05, + "loss": 1.8796, + "mean_token_accuracy": 0.5787279009819031, + "num_tokens": 3513743510.0, + "step": 6873 + }, + { + "epoch": 1.858842617631152, + "grad_norm": 1.6380860805511475, + "learning_rate": 1.502857091028921e-05, + "loss": 2.0394, + "mean_token_accuracy": 0.5426144003868103, + "num_tokens": 3514212942.0, + "step": 6874 + }, + { + "epoch": 1.8591130340724717, + "grad_norm": 1.4528307914733887, + "learning_rate": 1.5027161113412552e-05, + "loss": 1.9155, + "mean_token_accuracy": 0.5802250504493713, + "num_tokens": 3514721724.0, + "step": 6875 + }, + { + "epoch": 1.8593834505137914, + "grad_norm": 1.2122536897659302, + "learning_rate": 1.5025751192970745e-05, + "loss": 2.0358, + "mean_token_accuracy": 0.5440886616706848, + "num_tokens": 3515245942.0, + "step": 6876 + }, + { + "epoch": 1.859653866955111, + "grad_norm": 1.217930555343628, + "learning_rate": 1.5024341149007056e-05, + "loss": 1.592, + "mean_token_accuracy": 0.6242609024047852, + "num_tokens": 3515727477.0, + "step": 6877 + }, + { + "epoch": 1.8599242833964305, + "grad_norm": 1.6962414979934692, + "learning_rate": 1.5022930981564743e-05, + "loss": 1.9467, + "mean_token_accuracy": 0.5809966325759888, + "num_tokens": 3516251737.0, + "step": 6878 + }, + { + "epoch": 1.86019469983775, + "grad_norm": 1.241392970085144, + "learning_rate": 1.5021520690687077e-05, + "loss": 1.9349, + "mean_token_accuracy": 0.5632106065750122, + "num_tokens": 3516775832.0, + "step": 6879 + }, + { + "epoch": 1.8604651162790697, + "grad_norm": 1.4124948978424072, + "learning_rate": 1.5020110276417333e-05, + "loss": 2.0009, + "mean_token_accuracy": 0.5387932062149048, + "num_tokens": 3517300095.0, + "step": 6880 + }, + { + "epoch": 1.8607355327203894, + "grad_norm": 0.6558712124824524, + "learning_rate": 1.5018699738798779e-05, + "loss": 1.0638, + "mean_token_accuracy": 0.7085915803909302, + "num_tokens": 3517824356.0, + "step": 6881 + }, + { + "epoch": 1.861005949161709, + "grad_norm": 2.1881909370422363, + "learning_rate": 1.5017289077874699e-05, + "loss": 2.0125, + "mean_token_accuracy": 0.5544205904006958, + "num_tokens": 3518300984.0, + "step": 6882 + }, + { + "epoch": 1.8612763656030287, + "grad_norm": 1.9538806676864624, + "learning_rate": 1.5015878293688379e-05, + "loss": 1.955, + "mean_token_accuracy": 0.546165943145752, + "num_tokens": 3518825077.0, + "step": 6883 + }, + { + "epoch": 1.8615467820443483, + "grad_norm": 1.2087862491607666, + "learning_rate": 1.5014467386283102e-05, + "loss": 1.9611, + "mean_token_accuracy": 0.5555237531661987, + "num_tokens": 3519349277.0, + "step": 6884 + }, + { + "epoch": 1.861817198485668, + "grad_norm": 1.4514803886413574, + "learning_rate": 1.5013056355702159e-05, + "loss": 1.9116, + "mean_token_accuracy": 0.562344491481781, + "num_tokens": 3519873558.0, + "step": 6885 + }, + { + "epoch": 1.8620876149269876, + "grad_norm": 1.9357638359069824, + "learning_rate": 1.5011645201988847e-05, + "loss": 1.9549, + "mean_token_accuracy": 0.5510535836219788, + "num_tokens": 3520397754.0, + "step": 6886 + }, + { + "epoch": 1.8623580313683072, + "grad_norm": 1.4006235599517822, + "learning_rate": 1.5010233925186459e-05, + "loss": 1.9213, + "mean_token_accuracy": 0.5669511556625366, + "num_tokens": 3520911284.0, + "step": 6887 + }, + { + "epoch": 1.8626284478096267, + "grad_norm": 1.3948779106140137, + "learning_rate": 1.5008822525338305e-05, + "loss": 1.8596, + "mean_token_accuracy": 0.5737003087997437, + "num_tokens": 3521435491.0, + "step": 6888 + }, + { + "epoch": 1.8628988642509463, + "grad_norm": 1.3110527992248535, + "learning_rate": 1.5007411002487685e-05, + "loss": 1.8865, + "mean_token_accuracy": 0.5603101253509521, + "num_tokens": 3521959624.0, + "step": 6889 + }, + { + "epoch": 1.863169280692266, + "grad_norm": 1.4973336458206177, + "learning_rate": 1.5005999356677912e-05, + "loss": 1.9925, + "mean_token_accuracy": 0.5725200176239014, + "num_tokens": 3522421939.0, + "step": 6890 + }, + { + "epoch": 1.8634396971335856, + "grad_norm": 1.075249195098877, + "learning_rate": 1.5004587587952296e-05, + "loss": 1.892, + "mean_token_accuracy": 0.5640060901641846, + "num_tokens": 3522895154.0, + "step": 6891 + }, + { + "epoch": 1.8637101135749052, + "grad_norm": 1.1039115190505981, + "learning_rate": 1.5003175696354155e-05, + "loss": 2.0321, + "mean_token_accuracy": 0.5428895354270935, + "num_tokens": 3523419437.0, + "step": 6892 + }, + { + "epoch": 1.8639805300162249, + "grad_norm": 1.1302040815353394, + "learning_rate": 1.5001763681926813e-05, + "loss": 1.9192, + "mean_token_accuracy": 0.5680050849914551, + "num_tokens": 3523943685.0, + "step": 6893 + }, + { + "epoch": 1.8642509464575445, + "grad_norm": 1.1920565366744995, + "learning_rate": 1.5000351544713592e-05, + "loss": 1.9746, + "mean_token_accuracy": 0.5575491189956665, + "num_tokens": 3524467904.0, + "step": 6894 + }, + { + "epoch": 1.8645213628988642, + "grad_norm": 1.3032292127609253, + "learning_rate": 1.4998939284757823e-05, + "loss": 1.8445, + "mean_token_accuracy": 0.579774022102356, + "num_tokens": 3524981667.0, + "step": 6895 + }, + { + "epoch": 1.8647917793401838, + "grad_norm": 1.0306284427642822, + "learning_rate": 1.4997526902102834e-05, + "loss": 1.9519, + "mean_token_accuracy": 0.5320574045181274, + "num_tokens": 3525505686.0, + "step": 6896 + }, + { + "epoch": 1.8650621957815035, + "grad_norm": 1.4107203483581543, + "learning_rate": 1.4996114396791966e-05, + "loss": 2.0556, + "mean_token_accuracy": 0.538204550743103, + "num_tokens": 3526029743.0, + "step": 6897 + }, + { + "epoch": 1.865332612222823, + "grad_norm": 1.2754391431808472, + "learning_rate": 1.4994701768868555e-05, + "loss": 1.9046, + "mean_token_accuracy": 0.5630857348442078, + "num_tokens": 3526554013.0, + "step": 6898 + }, + { + "epoch": 1.8656030286641427, + "grad_norm": 1.169353723526001, + "learning_rate": 1.4993289018375947e-05, + "loss": 1.9479, + "mean_token_accuracy": 0.5446123480796814, + "num_tokens": 3527041069.0, + "step": 6899 + }, + { + "epoch": 1.8658734451054624, + "grad_norm": 1.0440934896469116, + "learning_rate": 1.4991876145357488e-05, + "loss": 1.8381, + "mean_token_accuracy": 0.5707756280899048, + "num_tokens": 3527515871.0, + "step": 6900 + }, + { + "epoch": 1.866143861546782, + "grad_norm": 0.4840971529483795, + "learning_rate": 1.4990463149856529e-05, + "loss": 1.2054, + "mean_token_accuracy": 0.6807119846343994, + "num_tokens": 3528039943.0, + "step": 6901 + }, + { + "epoch": 1.8664142779881017, + "grad_norm": 1.437970757484436, + "learning_rate": 1.4989050031916425e-05, + "loss": 1.9095, + "mean_token_accuracy": 0.5593042373657227, + "num_tokens": 3528564159.0, + "step": 6902 + }, + { + "epoch": 1.8666846944294213, + "grad_norm": 1.3747786283493042, + "learning_rate": 1.4987636791580538e-05, + "loss": 1.9657, + "mean_token_accuracy": 0.570753812789917, + "num_tokens": 3529022949.0, + "step": 6903 + }, + { + "epoch": 1.866955110870741, + "grad_norm": 1.0439800024032593, + "learning_rate": 1.4986223428892224e-05, + "loss": 1.7613, + "mean_token_accuracy": 0.5656239986419678, + "num_tokens": 3529547218.0, + "step": 6904 + }, + { + "epoch": 1.8672255273120606, + "grad_norm": 1.4172800779342651, + "learning_rate": 1.4984809943894855e-05, + "loss": 2.0044, + "mean_token_accuracy": 0.5484964847564697, + "num_tokens": 3530016955.0, + "step": 6905 + }, + { + "epoch": 1.8674959437533802, + "grad_norm": 1.3454920053482056, + "learning_rate": 1.4983396336631797e-05, + "loss": 1.9299, + "mean_token_accuracy": 0.5687583684921265, + "num_tokens": 3530504649.0, + "step": 6906 + }, + { + "epoch": 1.8677663601946999, + "grad_norm": 1.05916166305542, + "learning_rate": 1.4981982607146424e-05, + "loss": 1.9306, + "mean_token_accuracy": 0.5774946808815002, + "num_tokens": 3530930707.0, + "step": 6907 + }, + { + "epoch": 1.8680367766360195, + "grad_norm": 1.2879832983016968, + "learning_rate": 1.4980568755482114e-05, + "loss": 2.0464, + "mean_token_accuracy": 0.5569128394126892, + "num_tokens": 3531378191.0, + "step": 6908 + }, + { + "epoch": 1.8683071930773392, + "grad_norm": 1.3506355285644531, + "learning_rate": 1.4979154781682251e-05, + "loss": 2.0711, + "mean_token_accuracy": 0.547819972038269, + "num_tokens": 3531889855.0, + "step": 6909 + }, + { + "epoch": 1.8685776095186588, + "grad_norm": 1.2992560863494873, + "learning_rate": 1.4977740685790219e-05, + "loss": 1.9236, + "mean_token_accuracy": 0.579330563545227, + "num_tokens": 3532387614.0, + "step": 6910 + }, + { + "epoch": 1.8688480259599785, + "grad_norm": 1.314515471458435, + "learning_rate": 1.4976326467849401e-05, + "loss": 1.9109, + "mean_token_accuracy": 0.5771851539611816, + "num_tokens": 3532890898.0, + "step": 6911 + }, + { + "epoch": 1.869118442401298, + "grad_norm": 1.3002123832702637, + "learning_rate": 1.4974912127903194e-05, + "loss": 1.8882, + "mean_token_accuracy": 0.5662607550621033, + "num_tokens": 3533415123.0, + "step": 6912 + }, + { + "epoch": 1.8693888588426177, + "grad_norm": 1.1719797849655151, + "learning_rate": 1.4973497665994996e-05, + "loss": 1.9692, + "mean_token_accuracy": 0.5639269948005676, + "num_tokens": 3533893587.0, + "step": 6913 + }, + { + "epoch": 1.8696592752839374, + "grad_norm": 1.0418446063995361, + "learning_rate": 1.49720830821682e-05, + "loss": 1.9403, + "mean_token_accuracy": 0.5522305965423584, + "num_tokens": 3534417728.0, + "step": 6914 + }, + { + "epoch": 1.869929691725257, + "grad_norm": 1.1139520406723022, + "learning_rate": 1.4970668376466216e-05, + "loss": 2.0398, + "mean_token_accuracy": 0.5299734473228455, + "num_tokens": 3534941998.0, + "step": 6915 + }, + { + "epoch": 1.8702001081665767, + "grad_norm": 0.9731109142303467, + "learning_rate": 1.4969253548932452e-05, + "loss": 2.0228, + "mean_token_accuracy": 0.5482224225997925, + "num_tokens": 3535466278.0, + "step": 6916 + }, + { + "epoch": 1.8704705246078963, + "grad_norm": 1.3001630306243896, + "learning_rate": 1.4967838599610312e-05, + "loss": 2.0071, + "mean_token_accuracy": 0.5638506412506104, + "num_tokens": 3535955478.0, + "step": 6917 + }, + { + "epoch": 1.870740941049216, + "grad_norm": 1.12013578414917, + "learning_rate": 1.4966423528543217e-05, + "loss": 1.7984, + "mean_token_accuracy": 0.5933352708816528, + "num_tokens": 3536459021.0, + "step": 6918 + }, + { + "epoch": 1.8710113574905354, + "grad_norm": 1.2024552822113037, + "learning_rate": 1.4965008335774584e-05, + "loss": 1.9839, + "mean_token_accuracy": 0.546285092830658, + "num_tokens": 3536966391.0, + "step": 6919 + }, + { + "epoch": 1.871281773931855, + "grad_norm": 1.0798412561416626, + "learning_rate": 1.4963593021347835e-05, + "loss": 1.9158, + "mean_token_accuracy": 0.5480345487594604, + "num_tokens": 3537490635.0, + "step": 6920 + }, + { + "epoch": 1.8715521903731747, + "grad_norm": 0.4776667654514313, + "learning_rate": 1.4962177585306396e-05, + "loss": 1.0734, + "mean_token_accuracy": 0.7036693096160889, + "num_tokens": 3538014671.0, + "step": 6921 + }, + { + "epoch": 1.8718226068144943, + "grad_norm": 1.2729952335357666, + "learning_rate": 1.4960762027693693e-05, + "loss": 1.8252, + "mean_token_accuracy": 0.5815525054931641, + "num_tokens": 3538538825.0, + "step": 6922 + }, + { + "epoch": 1.872093023255814, + "grad_norm": 1.1537652015686035, + "learning_rate": 1.4959346348553167e-05, + "loss": 1.9946, + "mean_token_accuracy": 0.5472543835639954, + "num_tokens": 3539062954.0, + "step": 6923 + }, + { + "epoch": 1.8723634396971336, + "grad_norm": 1.0995749235153198, + "learning_rate": 1.4957930547928253e-05, + "loss": 1.8916, + "mean_token_accuracy": 0.5655632615089417, + "num_tokens": 3539535900.0, + "step": 6924 + }, + { + "epoch": 1.8726338561384532, + "grad_norm": 1.0431257486343384, + "learning_rate": 1.4956514625862385e-05, + "loss": 1.9515, + "mean_token_accuracy": 0.5872058272361755, + "num_tokens": 3539995339.0, + "step": 6925 + }, + { + "epoch": 1.8729042725797729, + "grad_norm": 1.5085355043411255, + "learning_rate": 1.4955098582399019e-05, + "loss": 1.8866, + "mean_token_accuracy": 0.5439046025276184, + "num_tokens": 3540519441.0, + "step": 6926 + }, + { + "epoch": 1.8731746890210925, + "grad_norm": 1.1259338855743408, + "learning_rate": 1.4953682417581595e-05, + "loss": 1.8578, + "mean_token_accuracy": 0.5779199600219727, + "num_tokens": 3541043671.0, + "step": 6927 + }, + { + "epoch": 1.8734451054624122, + "grad_norm": 1.1728489398956299, + "learning_rate": 1.4952266131453565e-05, + "loss": 1.9033, + "mean_token_accuracy": 0.5587866306304932, + "num_tokens": 3541567950.0, + "step": 6928 + }, + { + "epoch": 1.8737155219037316, + "grad_norm": 1.194164752960205, + "learning_rate": 1.495084972405839e-05, + "loss": 1.9897, + "mean_token_accuracy": 0.5582860708236694, + "num_tokens": 3542046700.0, + "step": 6929 + }, + { + "epoch": 1.8739859383450512, + "grad_norm": 1.2400692701339722, + "learning_rate": 1.4949433195439525e-05, + "loss": 1.9414, + "mean_token_accuracy": 0.5650099515914917, + "num_tokens": 3542570986.0, + "step": 6930 + }, + { + "epoch": 1.8742563547863709, + "grad_norm": 1.2060405015945435, + "learning_rate": 1.4948016545640434e-05, + "loss": 1.8054, + "mean_token_accuracy": 0.6064872145652771, + "num_tokens": 3543064898.0, + "step": 6931 + }, + { + "epoch": 1.8745267712276905, + "grad_norm": 1.102135419845581, + "learning_rate": 1.4946599774704586e-05, + "loss": 1.9122, + "mean_token_accuracy": 0.5656601190567017, + "num_tokens": 3543589113.0, + "step": 6932 + }, + { + "epoch": 1.8747971876690102, + "grad_norm": 1.1224552392959595, + "learning_rate": 1.4945182882675453e-05, + "loss": 1.9437, + "mean_token_accuracy": 0.5740702152252197, + "num_tokens": 3544051552.0, + "step": 6933 + }, + { + "epoch": 1.8750676041103298, + "grad_norm": 1.1426007747650146, + "learning_rate": 1.4943765869596507e-05, + "loss": 1.951, + "mean_token_accuracy": 0.5602657198905945, + "num_tokens": 3544575670.0, + "step": 6934 + }, + { + "epoch": 1.8753380205516494, + "grad_norm": 1.2342244386672974, + "learning_rate": 1.4942348735511223e-05, + "loss": 2.0435, + "mean_token_accuracy": 0.5376133918762207, + "num_tokens": 3545099771.0, + "step": 6935 + }, + { + "epoch": 1.875608436992969, + "grad_norm": 1.066672921180725, + "learning_rate": 1.4940931480463088e-05, + "loss": 1.8502, + "mean_token_accuracy": 0.5661934018135071, + "num_tokens": 3545623994.0, + "step": 6936 + }, + { + "epoch": 1.8758788534342887, + "grad_norm": 1.4724924564361572, + "learning_rate": 1.4939514104495585e-05, + "loss": 1.9858, + "mean_token_accuracy": 0.5317206382751465, + "num_tokens": 3546148207.0, + "step": 6937 + }, + { + "epoch": 1.8761492698756084, + "grad_norm": 1.3502087593078613, + "learning_rate": 1.4938096607652205e-05, + "loss": 1.9036, + "mean_token_accuracy": 0.5497530698776245, + "num_tokens": 3546672473.0, + "step": 6938 + }, + { + "epoch": 1.876419686316928, + "grad_norm": 1.1756261587142944, + "learning_rate": 1.4936678989976439e-05, + "loss": 1.854, + "mean_token_accuracy": 0.5716489553451538, + "num_tokens": 3547196746.0, + "step": 6939 + }, + { + "epoch": 1.8766901027582477, + "grad_norm": 1.2219505310058594, + "learning_rate": 1.4935261251511786e-05, + "loss": 1.9987, + "mean_token_accuracy": 0.547250509262085, + "num_tokens": 3547720976.0, + "step": 6940 + }, + { + "epoch": 1.8769605191995673, + "grad_norm": 0.5995472073554993, + "learning_rate": 1.4933843392301743e-05, + "loss": 1.1711, + "mean_token_accuracy": 0.6822885274887085, + "num_tokens": 3548245220.0, + "step": 6941 + }, + { + "epoch": 1.877230935640887, + "grad_norm": 1.3934876918792725, + "learning_rate": 1.4932425412389818e-05, + "loss": 2.0494, + "mean_token_accuracy": 0.5539984703063965, + "num_tokens": 3548717112.0, + "step": 6942 + }, + { + "epoch": 1.8775013520822066, + "grad_norm": 1.0677189826965332, + "learning_rate": 1.4931007311819513e-05, + "loss": 1.9522, + "mean_token_accuracy": 0.5664629936218262, + "num_tokens": 3549241306.0, + "step": 6943 + }, + { + "epoch": 1.8777717685235262, + "grad_norm": 1.3418834209442139, + "learning_rate": 1.4929589090634347e-05, + "loss": 1.9607, + "mean_token_accuracy": 0.5408545732498169, + "num_tokens": 3549765475.0, + "step": 6944 + }, + { + "epoch": 1.8780421849648459, + "grad_norm": 1.2698240280151367, + "learning_rate": 1.4928170748877831e-05, + "loss": 1.8999, + "mean_token_accuracy": 0.5674864053726196, + "num_tokens": 3550289667.0, + "step": 6945 + }, + { + "epoch": 1.8783126014061655, + "grad_norm": 0.979396402835846, + "learning_rate": 1.4926752286593484e-05, + "loss": 2.0035, + "mean_token_accuracy": 0.5432720184326172, + "num_tokens": 3550813919.0, + "step": 6946 + }, + { + "epoch": 1.8785830178474852, + "grad_norm": 1.1218870878219604, + "learning_rate": 1.4925333703824829e-05, + "loss": 2.0168, + "mean_token_accuracy": 0.542110025882721, + "num_tokens": 3551338119.0, + "step": 6947 + }, + { + "epoch": 1.8788534342888048, + "grad_norm": 1.3245766162872314, + "learning_rate": 1.4923915000615394e-05, + "loss": 1.918, + "mean_token_accuracy": 0.5394552946090698, + "num_tokens": 3551862260.0, + "step": 6948 + }, + { + "epoch": 1.8791238507301244, + "grad_norm": 1.1346378326416016, + "learning_rate": 1.4922496177008706e-05, + "loss": 1.9181, + "mean_token_accuracy": 0.5679952502250671, + "num_tokens": 3552339287.0, + "step": 6949 + }, + { + "epoch": 1.879394267171444, + "grad_norm": 1.3082600831985474, + "learning_rate": 1.4921077233048297e-05, + "loss": 2.0089, + "mean_token_accuracy": 0.5533233880996704, + "num_tokens": 3552830322.0, + "step": 6950 + }, + { + "epoch": 1.8796646836127637, + "grad_norm": 1.3830056190490723, + "learning_rate": 1.491965816877771e-05, + "loss": 1.9915, + "mean_token_accuracy": 0.545070230960846, + "num_tokens": 3553354443.0, + "step": 6951 + }, + { + "epoch": 1.8799351000540834, + "grad_norm": 1.2855446338653564, + "learning_rate": 1.4918238984240485e-05, + "loss": 2.0998, + "mean_token_accuracy": 0.5259255766868591, + "num_tokens": 3553878584.0, + "step": 6952 + }, + { + "epoch": 1.880205516495403, + "grad_norm": 1.3206714391708374, + "learning_rate": 1.4916819679480163e-05, + "loss": 1.9228, + "mean_token_accuracy": 0.5662524104118347, + "num_tokens": 3554402609.0, + "step": 6953 + }, + { + "epoch": 1.8804759329367227, + "grad_norm": 1.3788808584213257, + "learning_rate": 1.4915400254540295e-05, + "loss": 1.9092, + "mean_token_accuracy": 0.5605965852737427, + "num_tokens": 3554926770.0, + "step": 6954 + }, + { + "epoch": 1.8807463493780423, + "grad_norm": 1.120987057685852, + "learning_rate": 1.4913980709464436e-05, + "loss": 1.949, + "mean_token_accuracy": 0.5466048121452332, + "num_tokens": 3555450905.0, + "step": 6955 + }, + { + "epoch": 1.881016765819362, + "grad_norm": 1.152071237564087, + "learning_rate": 1.4912561044296134e-05, + "loss": 2.0037, + "mean_token_accuracy": 0.5297853946685791, + "num_tokens": 3555975111.0, + "step": 6956 + }, + { + "epoch": 1.8812871822606816, + "grad_norm": 1.2502999305725098, + "learning_rate": 1.4911141259078955e-05, + "loss": 1.8176, + "mean_token_accuracy": 0.561392068862915, + "num_tokens": 3556499293.0, + "step": 6957 + }, + { + "epoch": 1.8815575987020012, + "grad_norm": 1.2014527320861816, + "learning_rate": 1.4909721353856458e-05, + "loss": 1.9566, + "mean_token_accuracy": 0.598286509513855, + "num_tokens": 3557023343.0, + "step": 6958 + }, + { + "epoch": 1.8818280151433209, + "grad_norm": 1.2286678552627563, + "learning_rate": 1.4908301328672214e-05, + "loss": 2.0943, + "mean_token_accuracy": 0.5415099859237671, + "num_tokens": 3557547440.0, + "step": 6959 + }, + { + "epoch": 1.8820984315846403, + "grad_norm": 1.1705976724624634, + "learning_rate": 1.4906881183569791e-05, + "loss": 1.9928, + "mean_token_accuracy": 0.5568569302558899, + "num_tokens": 3558071663.0, + "step": 6960 + }, + { + "epoch": 1.88236884802596, + "grad_norm": 0.5038662552833557, + "learning_rate": 1.4905460918592764e-05, + "loss": 1.1132, + "mean_token_accuracy": 0.7025657892227173, + "num_tokens": 3558595948.0, + "step": 6961 + }, + { + "epoch": 1.8826392644672796, + "grad_norm": 1.7617573738098145, + "learning_rate": 1.4904040533784715e-05, + "loss": 1.9001, + "mean_token_accuracy": 0.5709927678108215, + "num_tokens": 3559120175.0, + "step": 6962 + }, + { + "epoch": 1.8829096809085992, + "grad_norm": 1.5616410970687866, + "learning_rate": 1.4902620029189216e-05, + "loss": 1.9881, + "mean_token_accuracy": 0.5349371433258057, + "num_tokens": 3559644294.0, + "step": 6963 + }, + { + "epoch": 1.8831800973499189, + "grad_norm": 1.2251360416412354, + "learning_rate": 1.490119940484986e-05, + "loss": 1.9549, + "mean_token_accuracy": 0.5472315549850464, + "num_tokens": 3560168456.0, + "step": 6964 + }, + { + "epoch": 1.8834505137912385, + "grad_norm": 1.585547924041748, + "learning_rate": 1.4899778660810232e-05, + "loss": 1.9386, + "mean_token_accuracy": 0.5423104763031006, + "num_tokens": 3560692658.0, + "step": 6965 + }, + { + "epoch": 1.8837209302325582, + "grad_norm": 1.5083740949630737, + "learning_rate": 1.4898357797113926e-05, + "loss": 2.0162, + "mean_token_accuracy": 0.5520771741867065, + "num_tokens": 3561216849.0, + "step": 6966 + }, + { + "epoch": 1.8839913466738778, + "grad_norm": 1.262433648109436, + "learning_rate": 1.4896936813804539e-05, + "loss": 1.9159, + "mean_token_accuracy": 0.5334387421607971, + "num_tokens": 3561741072.0, + "step": 6967 + }, + { + "epoch": 1.8842617631151974, + "grad_norm": 1.283704400062561, + "learning_rate": 1.4895515710925672e-05, + "loss": 1.8416, + "mean_token_accuracy": 0.5930773019790649, + "num_tokens": 3562265342.0, + "step": 6968 + }, + { + "epoch": 1.884532179556517, + "grad_norm": 1.5149405002593994, + "learning_rate": 1.4894094488520927e-05, + "loss": 1.9167, + "mean_token_accuracy": 0.5704437494277954, + "num_tokens": 3562789501.0, + "step": 6969 + }, + { + "epoch": 1.8848025959978365, + "grad_norm": 1.2173511981964111, + "learning_rate": 1.4892673146633912e-05, + "loss": 1.9062, + "mean_token_accuracy": 0.5699623823165894, + "num_tokens": 3563245326.0, + "step": 6970 + }, + { + "epoch": 1.8850730124391561, + "grad_norm": 1.258673071861267, + "learning_rate": 1.4891251685308235e-05, + "loss": 1.9836, + "mean_token_accuracy": 0.5453889966011047, + "num_tokens": 3563769520.0, + "step": 6971 + }, + { + "epoch": 1.8853434288804758, + "grad_norm": 1.3127108812332153, + "learning_rate": 1.488983010458751e-05, + "loss": 1.9766, + "mean_token_accuracy": 0.5705468654632568, + "num_tokens": 3564229021.0, + "step": 6972 + }, + { + "epoch": 1.8856138453217954, + "grad_norm": 1.3007515668869019, + "learning_rate": 1.4888408404515363e-05, + "loss": 1.9125, + "mean_token_accuracy": 0.5725018382072449, + "num_tokens": 3564753062.0, + "step": 6973 + }, + { + "epoch": 1.885884261763115, + "grad_norm": 1.2122999429702759, + "learning_rate": 1.4886986585135408e-05, + "loss": 2.0268, + "mean_token_accuracy": 0.551175594329834, + "num_tokens": 3565277267.0, + "step": 6974 + }, + { + "epoch": 1.8861546782044347, + "grad_norm": 1.5739110708236694, + "learning_rate": 1.4885564646491277e-05, + "loss": 2.0572, + "mean_token_accuracy": 0.549575924873352, + "num_tokens": 3565801519.0, + "step": 6975 + }, + { + "epoch": 1.8864250946457544, + "grad_norm": 1.147782325744629, + "learning_rate": 1.4884142588626595e-05, + "loss": 1.937, + "mean_token_accuracy": 0.5543294548988342, + "num_tokens": 3566325788.0, + "step": 6976 + }, + { + "epoch": 1.886695511087074, + "grad_norm": 1.2516452074050903, + "learning_rate": 1.4882720411584997e-05, + "loss": 2.1113, + "mean_token_accuracy": 0.5331779718399048, + "num_tokens": 3566849970.0, + "step": 6977 + }, + { + "epoch": 1.8869659275283936, + "grad_norm": 1.5499272346496582, + "learning_rate": 1.4881298115410115e-05, + "loss": 1.8322, + "mean_token_accuracy": 0.5786873698234558, + "num_tokens": 3567338283.0, + "step": 6978 + }, + { + "epoch": 1.8872363439697133, + "grad_norm": 1.3063137531280518, + "learning_rate": 1.4879875700145594e-05, + "loss": 2.0524, + "mean_token_accuracy": 0.5393447875976562, + "num_tokens": 3567862549.0, + "step": 6979 + }, + { + "epoch": 1.887506760411033, + "grad_norm": 1.0398386716842651, + "learning_rate": 1.4878453165835076e-05, + "loss": 1.9101, + "mean_token_accuracy": 0.5736459493637085, + "num_tokens": 3568374678.0, + "step": 6980 + }, + { + "epoch": 1.8877771768523526, + "grad_norm": 0.6080968379974365, + "learning_rate": 1.487703051252221e-05, + "loss": 1.1782, + "mean_token_accuracy": 0.682915210723877, + "num_tokens": 3568898869.0, + "step": 6981 + }, + { + "epoch": 1.8880475932936722, + "grad_norm": 1.660187005996704, + "learning_rate": 1.4875607740250649e-05, + "loss": 1.8675, + "mean_token_accuracy": 0.572534441947937, + "num_tokens": 3569422872.0, + "step": 6982 + }, + { + "epoch": 1.8883180097349919, + "grad_norm": 1.7765631675720215, + "learning_rate": 1.487418484906404e-05, + "loss": 1.9704, + "mean_token_accuracy": 0.5576149225234985, + "num_tokens": 3569902868.0, + "step": 6983 + }, + { + "epoch": 1.8885884261763115, + "grad_norm": 1.009160041809082, + "learning_rate": 1.4872761839006049e-05, + "loss": 1.9369, + "mean_token_accuracy": 0.541038990020752, + "num_tokens": 3570427009.0, + "step": 6984 + }, + { + "epoch": 1.8888588426176312, + "grad_norm": 1.3874942064285278, + "learning_rate": 1.4871338710120336e-05, + "loss": 1.8298, + "mean_token_accuracy": 0.6052871346473694, + "num_tokens": 3570825650.0, + "step": 6985 + }, + { + "epoch": 1.8891292590589508, + "grad_norm": 1.9935786724090576, + "learning_rate": 1.4869915462450566e-05, + "loss": 1.7399, + "mean_token_accuracy": 0.58217453956604, + "num_tokens": 3571320850.0, + "step": 6986 + }, + { + "epoch": 1.8893996755002704, + "grad_norm": 1.2880679368972778, + "learning_rate": 1.4868492096040407e-05, + "loss": 1.8969, + "mean_token_accuracy": 0.5664197206497192, + "num_tokens": 3571840274.0, + "step": 6987 + }, + { + "epoch": 1.88967009194159, + "grad_norm": 1.1707329750061035, + "learning_rate": 1.4867068610933538e-05, + "loss": 1.8374, + "mean_token_accuracy": 0.5562637448310852, + "num_tokens": 3572364515.0, + "step": 6988 + }, + { + "epoch": 1.8899405083829097, + "grad_norm": 1.6558430194854736, + "learning_rate": 1.486564500717363e-05, + "loss": 1.9801, + "mean_token_accuracy": 0.5327891111373901, + "num_tokens": 3572869408.0, + "step": 6989 + }, + { + "epoch": 1.8902109248242294, + "grad_norm": 1.2865071296691895, + "learning_rate": 1.4864221284804362e-05, + "loss": 1.9906, + "mean_token_accuracy": 0.5505802631378174, + "num_tokens": 3573393587.0, + "step": 6990 + }, + { + "epoch": 1.890481341265549, + "grad_norm": 1.1180682182312012, + "learning_rate": 1.4862797443869426e-05, + "loss": 2.0435, + "mean_token_accuracy": 0.52884441614151, + "num_tokens": 3573917847.0, + "step": 6991 + }, + { + "epoch": 1.8907517577068687, + "grad_norm": 1.3166885375976562, + "learning_rate": 1.4861373484412502e-05, + "loss": 1.8636, + "mean_token_accuracy": 0.5725597143173218, + "num_tokens": 3574442089.0, + "step": 6992 + }, + { + "epoch": 1.8910221741481883, + "grad_norm": 1.4118754863739014, + "learning_rate": 1.4859949406477285e-05, + "loss": 1.9412, + "mean_token_accuracy": 0.555137038230896, + "num_tokens": 3574966310.0, + "step": 6993 + }, + { + "epoch": 1.891292590589508, + "grad_norm": 1.3426374197006226, + "learning_rate": 1.4858525210107468e-05, + "loss": 1.9753, + "mean_token_accuracy": 0.5585718154907227, + "num_tokens": 3575452528.0, + "step": 6994 + }, + { + "epoch": 1.8915630070308276, + "grad_norm": 1.1944057941436768, + "learning_rate": 1.485710089534675e-05, + "loss": 1.951, + "mean_token_accuracy": 0.5577850341796875, + "num_tokens": 3575976733.0, + "step": 6995 + }, + { + "epoch": 1.8918334234721472, + "grad_norm": 1.4703396558761597, + "learning_rate": 1.4855676462238837e-05, + "loss": 2.0548, + "mean_token_accuracy": 0.5444866418838501, + "num_tokens": 3576454639.0, + "step": 6996 + }, + { + "epoch": 1.8921038399134669, + "grad_norm": 1.1258351802825928, + "learning_rate": 1.4854251910827429e-05, + "loss": 1.8383, + "mean_token_accuracy": 0.5662688612937927, + "num_tokens": 3576978912.0, + "step": 6997 + }, + { + "epoch": 1.8923742563547865, + "grad_norm": 1.2549593448638916, + "learning_rate": 1.4852827241156236e-05, + "loss": 1.9977, + "mean_token_accuracy": 0.577905535697937, + "num_tokens": 3577443908.0, + "step": 6998 + }, + { + "epoch": 1.8926446727961062, + "grad_norm": 1.2063566446304321, + "learning_rate": 1.4851402453268975e-05, + "loss": 1.8872, + "mean_token_accuracy": 0.5551375150680542, + "num_tokens": 3577951982.0, + "step": 6999 + }, + { + "epoch": 1.8929150892374258, + "grad_norm": 1.143882393836975, + "learning_rate": 1.4849977547209358e-05, + "loss": 1.9232, + "mean_token_accuracy": 0.5636670589447021, + "num_tokens": 3578446734.0, + "step": 7000 + }, + { + "epoch": 1.8931855056787452, + "grad_norm": 0.6162956357002258, + "learning_rate": 1.4848552523021107e-05, + "loss": 1.0716, + "mean_token_accuracy": 0.7146944403648376, + "num_tokens": 3578970924.0, + "step": 7001 + }, + { + "epoch": 1.8934559221200649, + "grad_norm": 1.5939836502075195, + "learning_rate": 1.4847127380747955e-05, + "loss": 1.8883, + "mean_token_accuracy": 0.5683495998382568, + "num_tokens": 3579456148.0, + "step": 7002 + }, + { + "epoch": 1.8937263385613845, + "grad_norm": 1.4712297916412354, + "learning_rate": 1.4845702120433612e-05, + "loss": 2.0144, + "mean_token_accuracy": 0.5576980113983154, + "num_tokens": 3579980415.0, + "step": 7003 + }, + { + "epoch": 1.8939967550027041, + "grad_norm": 1.3807587623596191, + "learning_rate": 1.4844276742121824e-05, + "loss": 2.1261, + "mean_token_accuracy": 0.522323489189148, + "num_tokens": 3580504673.0, + "step": 7004 + }, + { + "epoch": 1.8942671714440238, + "grad_norm": 1.4439723491668701, + "learning_rate": 1.4842851245856318e-05, + "loss": 1.9412, + "mean_token_accuracy": 0.5644518136978149, + "num_tokens": 3581028930.0, + "step": 7005 + }, + { + "epoch": 1.8945375878853434, + "grad_norm": 1.2862396240234375, + "learning_rate": 1.4841425631680834e-05, + "loss": 2.0123, + "mean_token_accuracy": 0.5373320579528809, + "num_tokens": 3581552991.0, + "step": 7006 + }, + { + "epoch": 1.894808004326663, + "grad_norm": 1.357761025428772, + "learning_rate": 1.4839999899639115e-05, + "loss": 1.9233, + "mean_token_accuracy": 0.559749960899353, + "num_tokens": 3582077119.0, + "step": 7007 + }, + { + "epoch": 1.8950784207679827, + "grad_norm": 1.249820590019226, + "learning_rate": 1.4838574049774908e-05, + "loss": 2.0061, + "mean_token_accuracy": 0.5408719778060913, + "num_tokens": 3582567451.0, + "step": 7008 + }, + { + "epoch": 1.8953488372093024, + "grad_norm": 1.2738415002822876, + "learning_rate": 1.4837148082131959e-05, + "loss": 1.9475, + "mean_token_accuracy": 0.553886353969574, + "num_tokens": 3583091643.0, + "step": 7009 + }, + { + "epoch": 1.895619253650622, + "grad_norm": 1.4124517440795898, + "learning_rate": 1.4835721996754022e-05, + "loss": 1.9475, + "mean_token_accuracy": 0.557215690612793, + "num_tokens": 3583615819.0, + "step": 7010 + }, + { + "epoch": 1.8958896700919414, + "grad_norm": 1.412048578262329, + "learning_rate": 1.4834295793684857e-05, + "loss": 2.0045, + "mean_token_accuracy": 0.5491848587989807, + "num_tokens": 3584140031.0, + "step": 7011 + }, + { + "epoch": 1.896160086533261, + "grad_norm": 1.2387244701385498, + "learning_rate": 1.4832869472968219e-05, + "loss": 1.8349, + "mean_token_accuracy": 0.5887621641159058, + "num_tokens": 3584658908.0, + "step": 7012 + }, + { + "epoch": 1.8964305029745807, + "grad_norm": 1.536315679550171, + "learning_rate": 1.4831443034647875e-05, + "loss": 1.9633, + "mean_token_accuracy": 0.5347551107406616, + "num_tokens": 3585183188.0, + "step": 7013 + }, + { + "epoch": 1.8967009194159004, + "grad_norm": 1.3517582416534424, + "learning_rate": 1.4830016478767588e-05, + "loss": 1.9818, + "mean_token_accuracy": 0.5327671766281128, + "num_tokens": 3585707468.0, + "step": 7014 + }, + { + "epoch": 1.89697133585722, + "grad_norm": 1.1214680671691895, + "learning_rate": 1.4828589805371134e-05, + "loss": 2.0391, + "mean_token_accuracy": 0.5349200963973999, + "num_tokens": 3586231661.0, + "step": 7015 + }, + { + "epoch": 1.8972417522985396, + "grad_norm": 1.2870676517486572, + "learning_rate": 1.4827163014502288e-05, + "loss": 1.8078, + "mean_token_accuracy": 0.5960487723350525, + "num_tokens": 3586723496.0, + "step": 7016 + }, + { + "epoch": 1.8975121687398593, + "grad_norm": 1.5150402784347534, + "learning_rate": 1.4825736106204822e-05, + "loss": 1.9608, + "mean_token_accuracy": 0.5568862557411194, + "num_tokens": 3587247776.0, + "step": 7017 + }, + { + "epoch": 1.897782585181179, + "grad_norm": 1.5184496641159058, + "learning_rate": 1.4824309080522526e-05, + "loss": 1.8308, + "mean_token_accuracy": 0.5762412548065186, + "num_tokens": 3587750868.0, + "step": 7018 + }, + { + "epoch": 1.8980530016224986, + "grad_norm": 1.143175482749939, + "learning_rate": 1.4822881937499176e-05, + "loss": 1.895, + "mean_token_accuracy": 0.548640787601471, + "num_tokens": 3588274990.0, + "step": 7019 + }, + { + "epoch": 1.8983234180638182, + "grad_norm": 1.6915209293365479, + "learning_rate": 1.4821454677178568e-05, + "loss": 1.9911, + "mean_token_accuracy": 0.5655705332756042, + "num_tokens": 3588743521.0, + "step": 7020 + }, + { + "epoch": 1.8985938345051379, + "grad_norm": 0.7748240828514099, + "learning_rate": 1.482002729960449e-05, + "loss": 1.1424, + "mean_token_accuracy": 0.7045960426330566, + "num_tokens": 3589267789.0, + "step": 7021 + }, + { + "epoch": 1.8988642509464575, + "grad_norm": 2.0798532962799072, + "learning_rate": 1.4818599804820747e-05, + "loss": 1.9324, + "mean_token_accuracy": 0.577282190322876, + "num_tokens": 3589752994.0, + "step": 7022 + }, + { + "epoch": 1.8991346673877771, + "grad_norm": 1.3426340818405151, + "learning_rate": 1.4817172192871126e-05, + "loss": 1.9309, + "mean_token_accuracy": 0.5753297805786133, + "num_tokens": 3590244411.0, + "step": 7023 + }, + { + "epoch": 1.8994050838290968, + "grad_norm": 1.494682788848877, + "learning_rate": 1.4815744463799438e-05, + "loss": 1.8593, + "mean_token_accuracy": 0.5699605941772461, + "num_tokens": 3590768601.0, + "step": 7024 + }, + { + "epoch": 1.8996755002704164, + "grad_norm": 1.7622543573379517, + "learning_rate": 1.481431661764949e-05, + "loss": 1.8816, + "mean_token_accuracy": 0.6181133985519409, + "num_tokens": 3591181403.0, + "step": 7025 + }, + { + "epoch": 1.899945916711736, + "grad_norm": 1.8029409646987915, + "learning_rate": 1.481288865446509e-05, + "loss": 1.9618, + "mean_token_accuracy": 0.5551197528839111, + "num_tokens": 3591705550.0, + "step": 7026 + }, + { + "epoch": 1.9002163331530557, + "grad_norm": 1.4490498304367065, + "learning_rate": 1.4811460574290054e-05, + "loss": 1.887, + "mean_token_accuracy": 0.5596603751182556, + "num_tokens": 3592229810.0, + "step": 7027 + }, + { + "epoch": 1.9004867495943754, + "grad_norm": 1.58322012424469, + "learning_rate": 1.4810032377168196e-05, + "loss": 1.889, + "mean_token_accuracy": 0.5533775687217712, + "num_tokens": 3592753954.0, + "step": 7028 + }, + { + "epoch": 1.900757166035695, + "grad_norm": 1.2908596992492676, + "learning_rate": 1.4808604063143342e-05, + "loss": 2.017, + "mean_token_accuracy": 0.5483946204185486, + "num_tokens": 3593278124.0, + "step": 7029 + }, + { + "epoch": 1.9010275824770146, + "grad_norm": 1.2665883302688599, + "learning_rate": 1.4807175632259315e-05, + "loss": 2.036, + "mean_token_accuracy": 0.5406749248504639, + "num_tokens": 3593802309.0, + "step": 7030 + }, + { + "epoch": 1.9012979989183343, + "grad_norm": 1.442166805267334, + "learning_rate": 1.4805747084559944e-05, + "loss": 1.9204, + "mean_token_accuracy": 0.5455926060676575, + "num_tokens": 3594326355.0, + "step": 7031 + }, + { + "epoch": 1.901568415359654, + "grad_norm": 1.265177607536316, + "learning_rate": 1.4804318420089059e-05, + "loss": 1.9563, + "mean_token_accuracy": 0.5571160912513733, + "num_tokens": 3594850605.0, + "step": 7032 + }, + { + "epoch": 1.9018388318009736, + "grad_norm": 1.4024479389190674, + "learning_rate": 1.48028896388905e-05, + "loss": 1.852, + "mean_token_accuracy": 0.5631914138793945, + "num_tokens": 3595322165.0, + "step": 7033 + }, + { + "epoch": 1.9021092482422932, + "grad_norm": 1.1304898262023926, + "learning_rate": 1.48014607410081e-05, + "loss": 1.8829, + "mean_token_accuracy": 0.5802924633026123, + "num_tokens": 3595846369.0, + "step": 7034 + }, + { + "epoch": 1.9023796646836129, + "grad_norm": 1.3121365308761597, + "learning_rate": 1.4800031726485705e-05, + "loss": 2.0286, + "mean_token_accuracy": 0.5745435357093811, + "num_tokens": 3596263414.0, + "step": 7035 + }, + { + "epoch": 1.9026500811249325, + "grad_norm": 1.2245032787322998, + "learning_rate": 1.4798602595367164e-05, + "loss": 1.9537, + "mean_token_accuracy": 0.5659663677215576, + "num_tokens": 3596787521.0, + "step": 7036 + }, + { + "epoch": 1.9029204975662521, + "grad_norm": 1.050161361694336, + "learning_rate": 1.479717334769632e-05, + "loss": 1.8499, + "mean_token_accuracy": 0.5637162923812866, + "num_tokens": 3597311693.0, + "step": 7037 + }, + { + "epoch": 1.9031909140075718, + "grad_norm": 1.145525336265564, + "learning_rate": 1.4795743983517037e-05, + "loss": 1.9167, + "mean_token_accuracy": 0.571311354637146, + "num_tokens": 3597829247.0, + "step": 7038 + }, + { + "epoch": 1.9034613304488914, + "grad_norm": 1.2172973155975342, + "learning_rate": 1.4794314502873164e-05, + "loss": 1.8863, + "mean_token_accuracy": 0.5764014720916748, + "num_tokens": 3598353505.0, + "step": 7039 + }, + { + "epoch": 1.903731746890211, + "grad_norm": 1.0131690502166748, + "learning_rate": 1.4792884905808562e-05, + "loss": 1.9043, + "mean_token_accuracy": 0.5679951906204224, + "num_tokens": 3598877682.0, + "step": 7040 + }, + { + "epoch": 1.9040021633315307, + "grad_norm": 0.7150542736053467, + "learning_rate": 1.4791455192367099e-05, + "loss": 1.0895, + "mean_token_accuracy": 0.708158016204834, + "num_tokens": 3599390461.0, + "step": 7041 + }, + { + "epoch": 1.9042725797728501, + "grad_norm": 1.8899736404418945, + "learning_rate": 1.4790025362592637e-05, + "loss": 1.9963, + "mean_token_accuracy": 0.554972767829895, + "num_tokens": 3599914731.0, + "step": 7042 + }, + { + "epoch": 1.9045429962141698, + "grad_norm": 1.6013505458831787, + "learning_rate": 1.4788595416529054e-05, + "loss": 1.9466, + "mean_token_accuracy": 0.563835620880127, + "num_tokens": 3600439002.0, + "step": 7043 + }, + { + "epoch": 1.9048134126554894, + "grad_norm": 1.010899543762207, + "learning_rate": 1.4787165354220226e-05, + "loss": 1.903, + "mean_token_accuracy": 0.5291576981544495, + "num_tokens": 3600963208.0, + "step": 7044 + }, + { + "epoch": 1.905083829096809, + "grad_norm": 1.450888991355896, + "learning_rate": 1.4785735175710021e-05, + "loss": 2.0045, + "mean_token_accuracy": 0.5457665324211121, + "num_tokens": 3601487426.0, + "step": 7045 + }, + { + "epoch": 1.9053542455381287, + "grad_norm": 1.4903277158737183, + "learning_rate": 1.478430488104233e-05, + "loss": 1.8063, + "mean_token_accuracy": 0.5896453261375427, + "num_tokens": 3602011579.0, + "step": 7046 + }, + { + "epoch": 1.9056246619794484, + "grad_norm": 1.1693123579025269, + "learning_rate": 1.4782874470261036e-05, + "loss": 1.9776, + "mean_token_accuracy": 0.5524909496307373, + "num_tokens": 3602535846.0, + "step": 7047 + }, + { + "epoch": 1.905895078420768, + "grad_norm": 1.2595843076705933, + "learning_rate": 1.478144394341003e-05, + "loss": 1.9461, + "mean_token_accuracy": 0.5517692565917969, + "num_tokens": 3603060120.0, + "step": 7048 + }, + { + "epoch": 1.9061654948620876, + "grad_norm": 1.4193236827850342, + "learning_rate": 1.4780013300533202e-05, + "loss": 1.8634, + "mean_token_accuracy": 0.5840390920639038, + "num_tokens": 3603530787.0, + "step": 7049 + }, + { + "epoch": 1.9064359113034073, + "grad_norm": 1.4568891525268555, + "learning_rate": 1.477858254167445e-05, + "loss": 2.0704, + "mean_token_accuracy": 0.5543990135192871, + "num_tokens": 3603999149.0, + "step": 7050 + }, + { + "epoch": 1.906706327744727, + "grad_norm": 1.3209753036499023, + "learning_rate": 1.4777151666877673e-05, + "loss": 1.96, + "mean_token_accuracy": 0.5353752374649048, + "num_tokens": 3604523399.0, + "step": 7051 + }, + { + "epoch": 1.9069767441860463, + "grad_norm": 1.5807561874389648, + "learning_rate": 1.4775720676186776e-05, + "loss": 2.0196, + "mean_token_accuracy": 0.5546268224716187, + "num_tokens": 3605001741.0, + "step": 7052 + }, + { + "epoch": 1.907247160627366, + "grad_norm": 1.3810499906539917, + "learning_rate": 1.4774289569645663e-05, + "loss": 2.0446, + "mean_token_accuracy": 0.5365070104598999, + "num_tokens": 3605490782.0, + "step": 7053 + }, + { + "epoch": 1.9075175770686856, + "grad_norm": 1.2548269033432007, + "learning_rate": 1.477285834729825e-05, + "loss": 1.9766, + "mean_token_accuracy": 0.5482255220413208, + "num_tokens": 3605988112.0, + "step": 7054 + }, + { + "epoch": 1.9077879935100053, + "grad_norm": 1.396190881729126, + "learning_rate": 1.4771427009188447e-05, + "loss": 1.8698, + "mean_token_accuracy": 0.550727128982544, + "num_tokens": 3606512274.0, + "step": 7055 + }, + { + "epoch": 1.908058409951325, + "grad_norm": 1.6685435771942139, + "learning_rate": 1.4769995555360168e-05, + "loss": 2.0843, + "mean_token_accuracy": 0.5428807139396667, + "num_tokens": 3607036544.0, + "step": 7056 + }, + { + "epoch": 1.9083288263926446, + "grad_norm": 1.3490674495697021, + "learning_rate": 1.4768563985857343e-05, + "loss": 1.9864, + "mean_token_accuracy": 0.5613220930099487, + "num_tokens": 3607560761.0, + "step": 7057 + }, + { + "epoch": 1.9085992428339642, + "grad_norm": 1.1659537553787231, + "learning_rate": 1.476713230072389e-05, + "loss": 1.94, + "mean_token_accuracy": 0.5545152425765991, + "num_tokens": 3608084932.0, + "step": 7058 + }, + { + "epoch": 1.9088696592752838, + "grad_norm": 1.5319311618804932, + "learning_rate": 1.4765700500003742e-05, + "loss": 2.0875, + "mean_token_accuracy": 0.5442469120025635, + "num_tokens": 3608609201.0, + "step": 7059 + }, + { + "epoch": 1.9091400757166035, + "grad_norm": 1.1960517168045044, + "learning_rate": 1.4764268583740825e-05, + "loss": 2.0002, + "mean_token_accuracy": 0.560777485370636, + "num_tokens": 3609078281.0, + "step": 7060 + }, + { + "epoch": 1.9094104921579231, + "grad_norm": 0.5214628577232361, + "learning_rate": 1.476283655197908e-05, + "loss": 1.103, + "mean_token_accuracy": 0.7079370021820068, + "num_tokens": 3609602450.0, + "step": 7061 + }, + { + "epoch": 1.9096809085992428, + "grad_norm": 1.836585521697998, + "learning_rate": 1.4761404404762444e-05, + "loss": 1.9112, + "mean_token_accuracy": 0.5532639026641846, + "num_tokens": 3610126647.0, + "step": 7062 + }, + { + "epoch": 1.9099513250405624, + "grad_norm": 1.7373183965682983, + "learning_rate": 1.475997214213486e-05, + "loss": 1.9741, + "mean_token_accuracy": 0.540736198425293, + "num_tokens": 3610650893.0, + "step": 7063 + }, + { + "epoch": 1.910221741481882, + "grad_norm": 1.253281593322754, + "learning_rate": 1.475853976414027e-05, + "loss": 1.9101, + "mean_token_accuracy": 0.5482621192932129, + "num_tokens": 3611119272.0, + "step": 7064 + }, + { + "epoch": 1.9104921579232017, + "grad_norm": 1.8650318384170532, + "learning_rate": 1.475710727082263e-05, + "loss": 2.0037, + "mean_token_accuracy": 0.5577125549316406, + "num_tokens": 3611588514.0, + "step": 7065 + }, + { + "epoch": 1.9107625743645213, + "grad_norm": 1.6568318605422974, + "learning_rate": 1.4755674662225887e-05, + "loss": 1.7216, + "mean_token_accuracy": 0.5842830538749695, + "num_tokens": 3612064014.0, + "step": 7066 + }, + { + "epoch": 1.911032990805841, + "grad_norm": 1.7429215908050537, + "learning_rate": 1.4754241938394005e-05, + "loss": 1.7605, + "mean_token_accuracy": 0.5876659750938416, + "num_tokens": 3612543546.0, + "step": 7067 + }, + { + "epoch": 1.9113034072471606, + "grad_norm": 1.459673523902893, + "learning_rate": 1.4752809099370937e-05, + "loss": 1.921, + "mean_token_accuracy": 0.5561388731002808, + "num_tokens": 3613013896.0, + "step": 7068 + }, + { + "epoch": 1.9115738236884803, + "grad_norm": 1.5211149454116821, + "learning_rate": 1.475137614520065e-05, + "loss": 1.9391, + "mean_token_accuracy": 0.5595439672470093, + "num_tokens": 3613538166.0, + "step": 7069 + }, + { + "epoch": 1.9118442401298, + "grad_norm": 1.716259241104126, + "learning_rate": 1.4749943075927112e-05, + "loss": 1.8538, + "mean_token_accuracy": 0.5713684558868408, + "num_tokens": 3614062440.0, + "step": 7070 + }, + { + "epoch": 1.9121146565711196, + "grad_norm": 1.8463999032974243, + "learning_rate": 1.474850989159429e-05, + "loss": 2.0543, + "mean_token_accuracy": 0.5675441026687622, + "num_tokens": 3614529424.0, + "step": 7071 + }, + { + "epoch": 1.9123850730124392, + "grad_norm": 1.3684855699539185, + "learning_rate": 1.4747076592246161e-05, + "loss": 1.9054, + "mean_token_accuracy": 0.5646395683288574, + "num_tokens": 3614998268.0, + "step": 7072 + }, + { + "epoch": 1.9126554894537589, + "grad_norm": 1.6338963508605957, + "learning_rate": 1.4745643177926705e-05, + "loss": 2.0448, + "mean_token_accuracy": 0.5486072301864624, + "num_tokens": 3615522547.0, + "step": 7073 + }, + { + "epoch": 1.9129259058950785, + "grad_norm": 1.4266983270645142, + "learning_rate": 1.4744209648679901e-05, + "loss": 2.0356, + "mean_token_accuracy": 0.5360459089279175, + "num_tokens": 3616046790.0, + "step": 7074 + }, + { + "epoch": 1.9131963223363981, + "grad_norm": 1.1303845643997192, + "learning_rate": 1.4742776004549732e-05, + "loss": 1.9775, + "mean_token_accuracy": 0.5641747713088989, + "num_tokens": 3616549300.0, + "step": 7075 + }, + { + "epoch": 1.9134667387777178, + "grad_norm": 1.5123358964920044, + "learning_rate": 1.4741342245580188e-05, + "loss": 2.0033, + "mean_token_accuracy": 0.5377382040023804, + "num_tokens": 3617073497.0, + "step": 7076 + }, + { + "epoch": 1.9137371552190374, + "grad_norm": 1.6039685010910034, + "learning_rate": 1.4739908371815264e-05, + "loss": 1.9258, + "mean_token_accuracy": 0.5601917505264282, + "num_tokens": 3617597677.0, + "step": 7077 + }, + { + "epoch": 1.914007571660357, + "grad_norm": 1.208045482635498, + "learning_rate": 1.4738474383298953e-05, + "loss": 2.0208, + "mean_token_accuracy": 0.5434298515319824, + "num_tokens": 3618121880.0, + "step": 7078 + }, + { + "epoch": 1.9142779881016767, + "grad_norm": 1.802843451499939, + "learning_rate": 1.473704028007525e-05, + "loss": 1.8926, + "mean_token_accuracy": 0.5401467084884644, + "num_tokens": 3618646135.0, + "step": 7079 + }, + { + "epoch": 1.9145484045429964, + "grad_norm": 1.5094596147537231, + "learning_rate": 1.4735606062188164e-05, + "loss": 1.8565, + "mean_token_accuracy": 0.5697451233863831, + "num_tokens": 3619170282.0, + "step": 7080 + }, + { + "epoch": 1.914818820984316, + "grad_norm": 0.6514673233032227, + "learning_rate": 1.4734171729681699e-05, + "loss": 1.2133, + "mean_token_accuracy": 0.6849414110183716, + "num_tokens": 3619694483.0, + "step": 7081 + }, + { + "epoch": 1.9150892374256356, + "grad_norm": 1.722499966621399, + "learning_rate": 1.4732737282599863e-05, + "loss": 1.9501, + "mean_token_accuracy": 0.5608156323432922, + "num_tokens": 3620218729.0, + "step": 7082 + }, + { + "epoch": 1.915359653866955, + "grad_norm": 1.4645664691925049, + "learning_rate": 1.4731302720986668e-05, + "loss": 1.9266, + "mean_token_accuracy": 0.5735499262809753, + "num_tokens": 3620742902.0, + "step": 7083 + }, + { + "epoch": 1.9156300703082747, + "grad_norm": 1.068843126296997, + "learning_rate": 1.4729868044886138e-05, + "loss": 1.8464, + "mean_token_accuracy": 0.5795606374740601, + "num_tokens": 3621267033.0, + "step": 7084 + }, + { + "epoch": 1.9159004867495943, + "grad_norm": 1.5873935222625732, + "learning_rate": 1.4728433254342281e-05, + "loss": 1.8522, + "mean_token_accuracy": 0.5279600620269775, + "num_tokens": 3621791175.0, + "step": 7085 + }, + { + "epoch": 1.916170903190914, + "grad_norm": 1.4600002765655518, + "learning_rate": 1.4726998349399128e-05, + "loss": 1.8965, + "mean_token_accuracy": 0.5628035068511963, + "num_tokens": 3622315443.0, + "step": 7086 + }, + { + "epoch": 1.9164413196322336, + "grad_norm": 1.3155723810195923, + "learning_rate": 1.4725563330100709e-05, + "loss": 1.9081, + "mean_token_accuracy": 0.5762373208999634, + "num_tokens": 3622796664.0, + "step": 7087 + }, + { + "epoch": 1.9167117360735533, + "grad_norm": 1.5073494911193848, + "learning_rate": 1.4724128196491047e-05, + "loss": 1.9484, + "mean_token_accuracy": 0.5622134804725647, + "num_tokens": 3623320816.0, + "step": 7088 + }, + { + "epoch": 1.916982152514873, + "grad_norm": 1.2123570442199707, + "learning_rate": 1.4722692948614184e-05, + "loss": 1.9756, + "mean_token_accuracy": 0.5662092566490173, + "num_tokens": 3623845083.0, + "step": 7089 + }, + { + "epoch": 1.9172525689561926, + "grad_norm": 1.2687394618988037, + "learning_rate": 1.4721257586514152e-05, + "loss": 1.9122, + "mean_token_accuracy": 0.5660752058029175, + "num_tokens": 3624369286.0, + "step": 7090 + }, + { + "epoch": 1.9175229853975122, + "grad_norm": 1.7259938716888428, + "learning_rate": 1.4719822110234993e-05, + "loss": 1.9841, + "mean_token_accuracy": 0.5469757914543152, + "num_tokens": 3624893473.0, + "step": 7091 + }, + { + "epoch": 1.9177934018388318, + "grad_norm": 1.3354171514511108, + "learning_rate": 1.4718386519820752e-05, + "loss": 1.8865, + "mean_token_accuracy": 0.5551957488059998, + "num_tokens": 3625417705.0, + "step": 7092 + }, + { + "epoch": 1.9180638182801513, + "grad_norm": 1.4590855836868286, + "learning_rate": 1.4716950815315476e-05, + "loss": 1.9756, + "mean_token_accuracy": 0.521858811378479, + "num_tokens": 3625941927.0, + "step": 7093 + }, + { + "epoch": 1.918334234721471, + "grad_norm": 1.3896046876907349, + "learning_rate": 1.4715514996763222e-05, + "loss": 2.1609, + "mean_token_accuracy": 0.5189657807350159, + "num_tokens": 3626466209.0, + "step": 7094 + }, + { + "epoch": 1.9186046511627906, + "grad_norm": 1.455781102180481, + "learning_rate": 1.471407906420804e-05, + "loss": 1.9854, + "mean_token_accuracy": 0.5553652048110962, + "num_tokens": 3626989751.0, + "step": 7095 + }, + { + "epoch": 1.9188750676041102, + "grad_norm": 1.3766083717346191, + "learning_rate": 1.4712643017693991e-05, + "loss": 1.8856, + "mean_token_accuracy": 0.5588424801826477, + "num_tokens": 3627513936.0, + "step": 7096 + }, + { + "epoch": 1.9191454840454298, + "grad_norm": 1.1179958581924438, + "learning_rate": 1.4711206857265133e-05, + "loss": 1.871, + "mean_token_accuracy": 0.5657150745391846, + "num_tokens": 3628038157.0, + "step": 7097 + }, + { + "epoch": 1.9194159004867495, + "grad_norm": 1.480538249015808, + "learning_rate": 1.4709770582965537e-05, + "loss": 1.9388, + "mean_token_accuracy": 0.5718331933021545, + "num_tokens": 3628562183.0, + "step": 7098 + }, + { + "epoch": 1.9196863169280691, + "grad_norm": 1.2672334909439087, + "learning_rate": 1.470833419483927e-05, + "loss": 1.9819, + "mean_token_accuracy": 0.5635609030723572, + "num_tokens": 3629086450.0, + "step": 7099 + }, + { + "epoch": 1.9199567333693888, + "grad_norm": 1.4497764110565186, + "learning_rate": 1.4706897692930402e-05, + "loss": 1.9794, + "mean_token_accuracy": 0.549808919429779, + "num_tokens": 3629610563.0, + "step": 7100 + }, + { + "epoch": 1.9202271498107084, + "grad_norm": 0.5678906440734863, + "learning_rate": 1.4705461077283012e-05, + "loss": 1.119, + "mean_token_accuracy": 0.6855711340904236, + "num_tokens": 3630134836.0, + "step": 7101 + }, + { + "epoch": 1.920497566252028, + "grad_norm": 1.428954839706421, + "learning_rate": 1.4704024347941181e-05, + "loss": 2.0074, + "mean_token_accuracy": 0.560379147529602, + "num_tokens": 3630611083.0, + "step": 7102 + }, + { + "epoch": 1.9207679826933477, + "grad_norm": 1.2038471698760986, + "learning_rate": 1.4702587504948986e-05, + "loss": 1.9576, + "mean_token_accuracy": 0.558089017868042, + "num_tokens": 3631135282.0, + "step": 7103 + }, + { + "epoch": 1.9210383991346673, + "grad_norm": 1.2770318984985352, + "learning_rate": 1.4701150548350525e-05, + "loss": 1.8333, + "mean_token_accuracy": 0.5906212329864502, + "num_tokens": 3631612544.0, + "step": 7104 + }, + { + "epoch": 1.921308815575987, + "grad_norm": 1.2380424737930298, + "learning_rate": 1.4699713478189877e-05, + "loss": 1.951, + "mean_token_accuracy": 0.5610158443450928, + "num_tokens": 3632136679.0, + "step": 7105 + }, + { + "epoch": 1.9215792320173066, + "grad_norm": 1.1842011213302612, + "learning_rate": 1.4698276294511137e-05, + "loss": 1.8538, + "mean_token_accuracy": 0.584951639175415, + "num_tokens": 3632658614.0, + "step": 7106 + }, + { + "epoch": 1.9218496484586263, + "grad_norm": 1.043101191520691, + "learning_rate": 1.4696838997358407e-05, + "loss": 1.9501, + "mean_token_accuracy": 0.5539249181747437, + "num_tokens": 3633182794.0, + "step": 7107 + }, + { + "epoch": 1.922120064899946, + "grad_norm": 1.2945188283920288, + "learning_rate": 1.4695401586775783e-05, + "loss": 2.0293, + "mean_token_accuracy": 0.5450540781021118, + "num_tokens": 3633696513.0, + "step": 7108 + }, + { + "epoch": 1.9223904813412656, + "grad_norm": 1.0766030550003052, + "learning_rate": 1.4693964062807373e-05, + "loss": 1.9677, + "mean_token_accuracy": 0.5456137657165527, + "num_tokens": 3634220773.0, + "step": 7109 + }, + { + "epoch": 1.9226608977825852, + "grad_norm": 1.0416311025619507, + "learning_rate": 1.4692526425497282e-05, + "loss": 1.9236, + "mean_token_accuracy": 0.5529366731643677, + "num_tokens": 3634745026.0, + "step": 7110 + }, + { + "epoch": 1.9229313142239048, + "grad_norm": 1.1405318975448608, + "learning_rate": 1.4691088674889623e-05, + "loss": 1.8671, + "mean_token_accuracy": 0.5725153684616089, + "num_tokens": 3635269138.0, + "step": 7111 + }, + { + "epoch": 1.9232017306652245, + "grad_norm": 1.0847856998443604, + "learning_rate": 1.4689650811028505e-05, + "loss": 1.962, + "mean_token_accuracy": 0.5594568252563477, + "num_tokens": 3635793355.0, + "step": 7112 + }, + { + "epoch": 1.9234721471065441, + "grad_norm": 1.0731537342071533, + "learning_rate": 1.468821283395805e-05, + "loss": 1.9441, + "mean_token_accuracy": 0.5718239545822144, + "num_tokens": 3636317545.0, + "step": 7113 + }, + { + "epoch": 1.9237425635478638, + "grad_norm": 1.1797505617141724, + "learning_rate": 1.468677474372238e-05, + "loss": 1.9701, + "mean_token_accuracy": 0.5276135206222534, + "num_tokens": 3636841769.0, + "step": 7114 + }, + { + "epoch": 1.9240129799891834, + "grad_norm": 1.1416752338409424, + "learning_rate": 1.4685336540365617e-05, + "loss": 1.9722, + "mean_token_accuracy": 0.5585039258003235, + "num_tokens": 3637366052.0, + "step": 7115 + }, + { + "epoch": 1.924283396430503, + "grad_norm": 1.1598559617996216, + "learning_rate": 1.4683898223931894e-05, + "loss": 1.9211, + "mean_token_accuracy": 0.5643961429595947, + "num_tokens": 3637890331.0, + "step": 7116 + }, + { + "epoch": 1.9245538128718227, + "grad_norm": 1.2168684005737305, + "learning_rate": 1.468245979446534e-05, + "loss": 1.928, + "mean_token_accuracy": 0.5663426518440247, + "num_tokens": 3638363157.0, + "step": 7117 + }, + { + "epoch": 1.9248242293131423, + "grad_norm": 1.3390194177627563, + "learning_rate": 1.4681021252010087e-05, + "loss": 1.9793, + "mean_token_accuracy": 0.537298858165741, + "num_tokens": 3638887414.0, + "step": 7118 + }, + { + "epoch": 1.925094645754462, + "grad_norm": 1.4271721839904785, + "learning_rate": 1.4679582596610277e-05, + "loss": 1.8857, + "mean_token_accuracy": 0.5733591318130493, + "num_tokens": 3639411646.0, + "step": 7119 + }, + { + "epoch": 1.9253650621957816, + "grad_norm": 1.2055948972702026, + "learning_rate": 1.4678143828310057e-05, + "loss": 1.9513, + "mean_token_accuracy": 0.5582267045974731, + "num_tokens": 3639888684.0, + "step": 7120 + }, + { + "epoch": 1.9256354786371013, + "grad_norm": 0.6241500377655029, + "learning_rate": 1.4676704947153562e-05, + "loss": 1.166, + "mean_token_accuracy": 0.6729629039764404, + "num_tokens": 3640412889.0, + "step": 7121 + }, + { + "epoch": 1.925905895078421, + "grad_norm": 1.4993144273757935, + "learning_rate": 1.467526595318495e-05, + "loss": 1.8639, + "mean_token_accuracy": 0.5690427422523499, + "num_tokens": 3640937067.0, + "step": 7122 + }, + { + "epoch": 1.9261763115197406, + "grad_norm": 1.3773741722106934, + "learning_rate": 1.4673826846448369e-05, + "loss": 1.9697, + "mean_token_accuracy": 0.5532264709472656, + "num_tokens": 3641461270.0, + "step": 7123 + }, + { + "epoch": 1.92644672796106, + "grad_norm": 1.3819243907928467, + "learning_rate": 1.4672387626987977e-05, + "loss": 2.0237, + "mean_token_accuracy": 0.531392514705658, + "num_tokens": 3641985419.0, + "step": 7124 + }, + { + "epoch": 1.9267171444023796, + "grad_norm": 1.3479788303375244, + "learning_rate": 1.467094829484793e-05, + "loss": 1.7081, + "mean_token_accuracy": 0.6017313003540039, + "num_tokens": 3642456215.0, + "step": 7125 + }, + { + "epoch": 1.9269875608436993, + "grad_norm": 1.198474407196045, + "learning_rate": 1.4669508850072396e-05, + "loss": 1.847, + "mean_token_accuracy": 0.6098129153251648, + "num_tokens": 3642980380.0, + "step": 7126 + }, + { + "epoch": 1.927257977285019, + "grad_norm": 1.392220377922058, + "learning_rate": 1.466806929270554e-05, + "loss": 1.7859, + "mean_token_accuracy": 0.5833647847175598, + "num_tokens": 3643504667.0, + "step": 7127 + }, + { + "epoch": 1.9275283937263386, + "grad_norm": 1.5799602270126343, + "learning_rate": 1.466662962279153e-05, + "loss": 1.8829, + "mean_token_accuracy": 0.5778042078018188, + "num_tokens": 3644002906.0, + "step": 7128 + }, + { + "epoch": 1.9277988101676582, + "grad_norm": 1.155454397201538, + "learning_rate": 1.4665189840374541e-05, + "loss": 1.9615, + "mean_token_accuracy": 0.5602855682373047, + "num_tokens": 3644527124.0, + "step": 7129 + }, + { + "epoch": 1.9280692266089778, + "grad_norm": 1.2809514999389648, + "learning_rate": 1.4663749945498751e-05, + "loss": 1.983, + "mean_token_accuracy": 0.5595649480819702, + "num_tokens": 3645013223.0, + "step": 7130 + }, + { + "epoch": 1.9283396430502975, + "grad_norm": 1.4231891632080078, + "learning_rate": 1.4662309938208334e-05, + "loss": 1.9645, + "mean_token_accuracy": 0.5513638257980347, + "num_tokens": 3645492526.0, + "step": 7131 + }, + { + "epoch": 1.9286100594916171, + "grad_norm": 1.0489052534103394, + "learning_rate": 1.466086981854748e-05, + "loss": 1.9086, + "mean_token_accuracy": 0.569823682308197, + "num_tokens": 3646016671.0, + "step": 7132 + }, + { + "epoch": 1.9288804759329368, + "grad_norm": 1.2674181461334229, + "learning_rate": 1.4659429586560377e-05, + "loss": 2.0123, + "mean_token_accuracy": 0.5496984124183655, + "num_tokens": 3646540821.0, + "step": 7133 + }, + { + "epoch": 1.9291508923742564, + "grad_norm": 1.1706901788711548, + "learning_rate": 1.4657989242291209e-05, + "loss": 1.9848, + "mean_token_accuracy": 0.5543496012687683, + "num_tokens": 3647065106.0, + "step": 7134 + }, + { + "epoch": 1.9294213088155758, + "grad_norm": 1.0397720336914062, + "learning_rate": 1.465654878578417e-05, + "loss": 2.0144, + "mean_token_accuracy": 0.5461286306381226, + "num_tokens": 3647589327.0, + "step": 7135 + }, + { + "epoch": 1.9296917252568955, + "grad_norm": 1.2162957191467285, + "learning_rate": 1.4655108217083467e-05, + "loss": 1.8951, + "mean_token_accuracy": 0.5821256637573242, + "num_tokens": 3648113567.0, + "step": 7136 + }, + { + "epoch": 1.9299621416982151, + "grad_norm": 0.9834150075912476, + "learning_rate": 1.4653667536233294e-05, + "loss": 1.8628, + "mean_token_accuracy": 0.5773030519485474, + "num_tokens": 3648637809.0, + "step": 7137 + }, + { + "epoch": 1.9302325581395348, + "grad_norm": 1.0356587171554565, + "learning_rate": 1.4652226743277855e-05, + "loss": 1.7775, + "mean_token_accuracy": 0.5870832204818726, + "num_tokens": 3649101018.0, + "step": 7138 + }, + { + "epoch": 1.9305029745808544, + "grad_norm": 1.236383080482483, + "learning_rate": 1.4650785838261356e-05, + "loss": 2.0067, + "mean_token_accuracy": 0.5587910413742065, + "num_tokens": 3649604046.0, + "step": 7139 + }, + { + "epoch": 1.930773391022174, + "grad_norm": 1.0152910947799683, + "learning_rate": 1.4649344821228018e-05, + "loss": 1.9966, + "mean_token_accuracy": 0.5510941743850708, + "num_tokens": 3650128316.0, + "step": 7140 + }, + { + "epoch": 1.9310438074634937, + "grad_norm": 0.5150245428085327, + "learning_rate": 1.4647903692222047e-05, + "loss": 1.0879, + "mean_token_accuracy": 0.7085998058319092, + "num_tokens": 3650652443.0, + "step": 7141 + }, + { + "epoch": 1.9313142239048133, + "grad_norm": 1.7592239379882812, + "learning_rate": 1.4646462451287657e-05, + "loss": 1.8837, + "mean_token_accuracy": 0.5788333415985107, + "num_tokens": 3651175151.0, + "step": 7142 + }, + { + "epoch": 1.931584640346133, + "grad_norm": 1.5067206621170044, + "learning_rate": 1.4645021098469083e-05, + "loss": 1.8631, + "mean_token_accuracy": 0.5595827102661133, + "num_tokens": 3651664939.0, + "step": 7143 + }, + { + "epoch": 1.9318550567874526, + "grad_norm": 1.256594181060791, + "learning_rate": 1.4643579633810537e-05, + "loss": 2.0129, + "mean_token_accuracy": 0.5398702621459961, + "num_tokens": 3652188979.0, + "step": 7144 + }, + { + "epoch": 1.9321254732287723, + "grad_norm": 1.324448585510254, + "learning_rate": 1.4642138057356254e-05, + "loss": 1.9567, + "mean_token_accuracy": 0.5555323362350464, + "num_tokens": 3652659956.0, + "step": 7145 + }, + { + "epoch": 1.932395889670092, + "grad_norm": 1.4407156705856323, + "learning_rate": 1.464069636915047e-05, + "loss": 1.9604, + "mean_token_accuracy": 0.5515086650848389, + "num_tokens": 3653184166.0, + "step": 7146 + }, + { + "epoch": 1.9326663061114115, + "grad_norm": 1.4717670679092407, + "learning_rate": 1.463925456923741e-05, + "loss": 1.9075, + "mean_token_accuracy": 0.5671505331993103, + "num_tokens": 3653634863.0, + "step": 7147 + }, + { + "epoch": 1.9329367225527312, + "grad_norm": 1.3199940919876099, + "learning_rate": 1.4637812657661314e-05, + "loss": 1.9399, + "mean_token_accuracy": 0.5578281879425049, + "num_tokens": 3654158970.0, + "step": 7148 + }, + { + "epoch": 1.9332071389940508, + "grad_norm": 1.3063092231750488, + "learning_rate": 1.4636370634466435e-05, + "loss": 1.9099, + "mean_token_accuracy": 0.5666160583496094, + "num_tokens": 3654683198.0, + "step": 7149 + }, + { + "epoch": 1.9334775554353705, + "grad_norm": 1.2650963068008423, + "learning_rate": 1.4634928499697005e-05, + "loss": 1.9787, + "mean_token_accuracy": 0.5520459413528442, + "num_tokens": 3655207449.0, + "step": 7150 + }, + { + "epoch": 1.9337479718766901, + "grad_norm": 1.1857383251190186, + "learning_rate": 1.4633486253397282e-05, + "loss": 1.9571, + "mean_token_accuracy": 0.5527456402778625, + "num_tokens": 3655731619.0, + "step": 7151 + }, + { + "epoch": 1.9340183883180098, + "grad_norm": 1.201309323310852, + "learning_rate": 1.4632043895611514e-05, + "loss": 1.9032, + "mean_token_accuracy": 0.5651552677154541, + "num_tokens": 3656255743.0, + "step": 7152 + }, + { + "epoch": 1.9342888047593294, + "grad_norm": 1.3582769632339478, + "learning_rate": 1.463060142638396e-05, + "loss": 2.0068, + "mean_token_accuracy": 0.5516613721847534, + "num_tokens": 3656767838.0, + "step": 7153 + }, + { + "epoch": 1.934559221200649, + "grad_norm": 1.3408654928207397, + "learning_rate": 1.4629158845758878e-05, + "loss": 1.8995, + "mean_token_accuracy": 0.579287052154541, + "num_tokens": 3657288660.0, + "step": 7154 + }, + { + "epoch": 1.9348296376419687, + "grad_norm": 1.0323716402053833, + "learning_rate": 1.4627716153780527e-05, + "loss": 1.868, + "mean_token_accuracy": 0.5746756792068481, + "num_tokens": 3657812838.0, + "step": 7155 + }, + { + "epoch": 1.9351000540832883, + "grad_norm": 1.2202303409576416, + "learning_rate": 1.4626273350493176e-05, + "loss": 1.8946, + "mean_token_accuracy": 0.5607251524925232, + "num_tokens": 3658337032.0, + "step": 7156 + }, + { + "epoch": 1.935370470524608, + "grad_norm": 1.1839245557785034, + "learning_rate": 1.4624830435941095e-05, + "loss": 1.9842, + "mean_token_accuracy": 0.5474722385406494, + "num_tokens": 3658861238.0, + "step": 7157 + }, + { + "epoch": 1.9356408869659276, + "grad_norm": 1.1202547550201416, + "learning_rate": 1.4623387410168556e-05, + "loss": 2.0461, + "mean_token_accuracy": 0.5334280729293823, + "num_tokens": 3659385425.0, + "step": 7158 + }, + { + "epoch": 1.9359113034072473, + "grad_norm": 1.2874925136566162, + "learning_rate": 1.4621944273219839e-05, + "loss": 1.9179, + "mean_token_accuracy": 0.5595060586929321, + "num_tokens": 3659909552.0, + "step": 7159 + }, + { + "epoch": 1.936181719848567, + "grad_norm": 1.305389404296875, + "learning_rate": 1.4620501025139217e-05, + "loss": 1.9192, + "mean_token_accuracy": 0.5603979825973511, + "num_tokens": 3660433787.0, + "step": 7160 + }, + { + "epoch": 1.9364521362898865, + "grad_norm": 0.5501173734664917, + "learning_rate": 1.4619057665970977e-05, + "loss": 1.1951, + "mean_token_accuracy": 0.6793182492256165, + "num_tokens": 3660958044.0, + "step": 7161 + }, + { + "epoch": 1.9367225527312062, + "grad_norm": 1.6004095077514648, + "learning_rate": 1.4617614195759406e-05, + "loss": 1.8152, + "mean_token_accuracy": 0.6014345288276672, + "num_tokens": 3661482221.0, + "step": 7162 + }, + { + "epoch": 1.9369929691725258, + "grad_norm": 1.2796741724014282, + "learning_rate": 1.461617061454879e-05, + "loss": 1.9236, + "mean_token_accuracy": 0.5638473629951477, + "num_tokens": 3662006435.0, + "step": 7163 + }, + { + "epoch": 1.9372633856138455, + "grad_norm": 1.119617223739624, + "learning_rate": 1.4614726922383426e-05, + "loss": 1.857, + "mean_token_accuracy": 0.572469174861908, + "num_tokens": 3662530665.0, + "step": 7164 + }, + { + "epoch": 1.937533802055165, + "grad_norm": 1.3869361877441406, + "learning_rate": 1.461328311930761e-05, + "loss": 1.9848, + "mean_token_accuracy": 0.5483410954475403, + "num_tokens": 3663054933.0, + "step": 7165 + }, + { + "epoch": 1.9378042184964845, + "grad_norm": 1.265160322189331, + "learning_rate": 1.4611839205365645e-05, + "loss": 1.8695, + "mean_token_accuracy": 0.5661987066268921, + "num_tokens": 3663579065.0, + "step": 7166 + }, + { + "epoch": 1.9380746349378042, + "grad_norm": 1.3312294483184814, + "learning_rate": 1.4610395180601829e-05, + "loss": 2.0448, + "mean_token_accuracy": 0.5350321531295776, + "num_tokens": 3664103305.0, + "step": 7167 + }, + { + "epoch": 1.9383450513791238, + "grad_norm": 1.0176079273223877, + "learning_rate": 1.4608951045060472e-05, + "loss": 1.8257, + "mean_token_accuracy": 0.5677996277809143, + "num_tokens": 3664627431.0, + "step": 7168 + }, + { + "epoch": 1.9386154678204435, + "grad_norm": 1.152022361755371, + "learning_rate": 1.4607506798785885e-05, + "loss": 2.0074, + "mean_token_accuracy": 0.5543999671936035, + "num_tokens": 3665151600.0, + "step": 7169 + }, + { + "epoch": 1.9388858842617631, + "grad_norm": 1.3366814851760864, + "learning_rate": 1.4606062441822375e-05, + "loss": 1.9718, + "mean_token_accuracy": 0.5378668904304504, + "num_tokens": 3665675768.0, + "step": 7170 + }, + { + "epoch": 1.9391563007030828, + "grad_norm": 1.1414823532104492, + "learning_rate": 1.4604617974214267e-05, + "loss": 1.8537, + "mean_token_accuracy": 0.5802170038223267, + "num_tokens": 3666199981.0, + "step": 7171 + }, + { + "epoch": 1.9394267171444024, + "grad_norm": 1.1277180910110474, + "learning_rate": 1.4603173396005879e-05, + "loss": 1.8345, + "mean_token_accuracy": 0.5780266523361206, + "num_tokens": 3666699089.0, + "step": 7172 + }, + { + "epoch": 1.939697133585722, + "grad_norm": 1.170475721359253, + "learning_rate": 1.4601728707241541e-05, + "loss": 1.8511, + "mean_token_accuracy": 0.5725027322769165, + "num_tokens": 3667223212.0, + "step": 7173 + }, + { + "epoch": 1.9399675500270417, + "grad_norm": 1.479221224784851, + "learning_rate": 1.4600283907965567e-05, + "loss": 1.9162, + "mean_token_accuracy": 0.5607497096061707, + "num_tokens": 3667747409.0, + "step": 7174 + }, + { + "epoch": 1.9402379664683613, + "grad_norm": 1.207166075706482, + "learning_rate": 1.4598838998222296e-05, + "loss": 1.9663, + "mean_token_accuracy": 0.5422732830047607, + "num_tokens": 3668271370.0, + "step": 7175 + }, + { + "epoch": 1.9405083829096808, + "grad_norm": 1.2781898975372314, + "learning_rate": 1.4597393978056065e-05, + "loss": 2.0389, + "mean_token_accuracy": 0.5632160902023315, + "num_tokens": 3668795577.0, + "step": 7176 + }, + { + "epoch": 1.9407787993510004, + "grad_norm": 1.2111027240753174, + "learning_rate": 1.45959488475112e-05, + "loss": 1.9702, + "mean_token_accuracy": 0.5519312620162964, + "num_tokens": 3669319688.0, + "step": 7177 + }, + { + "epoch": 1.94104921579232, + "grad_norm": 1.112905740737915, + "learning_rate": 1.4594503606632058e-05, + "loss": 1.9039, + "mean_token_accuracy": 0.5687512159347534, + "num_tokens": 3669843846.0, + "step": 7178 + }, + { + "epoch": 1.9413196322336397, + "grad_norm": 1.1089801788330078, + "learning_rate": 1.4593058255462971e-05, + "loss": 1.867, + "mean_token_accuracy": 0.5609734058380127, + "num_tokens": 3670318128.0, + "step": 7179 + }, + { + "epoch": 1.9415900486749593, + "grad_norm": 1.6409085988998413, + "learning_rate": 1.459161279404829e-05, + "loss": 1.7422, + "mean_token_accuracy": 0.6029974222183228, + "num_tokens": 3670835857.0, + "step": 7180 + }, + { + "epoch": 1.941860465116279, + "grad_norm": 0.615539014339447, + "learning_rate": 1.4590167222432366e-05, + "loss": 1.0737, + "mean_token_accuracy": 0.7127341032028198, + "num_tokens": 3671360131.0, + "step": 7181 + }, + { + "epoch": 1.9421308815575986, + "grad_norm": 2.2666659355163574, + "learning_rate": 1.4588721540659554e-05, + "loss": 2.0284, + "mean_token_accuracy": 0.5438752174377441, + "num_tokens": 3671884410.0, + "step": 7182 + }, + { + "epoch": 1.9424012979989183, + "grad_norm": 1.9233125448226929, + "learning_rate": 1.4587275748774212e-05, + "loss": 1.9529, + "mean_token_accuracy": 0.5843080878257751, + "num_tokens": 3672324041.0, + "step": 7183 + }, + { + "epoch": 1.942671714440238, + "grad_norm": 1.3616552352905273, + "learning_rate": 1.4585829846820699e-05, + "loss": 1.9076, + "mean_token_accuracy": 0.5622612237930298, + "num_tokens": 3672848199.0, + "step": 7184 + }, + { + "epoch": 1.9429421308815575, + "grad_norm": 1.2970521450042725, + "learning_rate": 1.4584383834843381e-05, + "loss": 1.7945, + "mean_token_accuracy": 0.5755776166915894, + "num_tokens": 3673321624.0, + "step": 7185 + }, + { + "epoch": 1.9432125473228772, + "grad_norm": 2.4889330863952637, + "learning_rate": 1.4582937712886627e-05, + "loss": 1.9801, + "mean_token_accuracy": 0.5642322301864624, + "num_tokens": 3673808632.0, + "step": 7186 + }, + { + "epoch": 1.9434829637641968, + "grad_norm": 1.8192226886749268, + "learning_rate": 1.4581491480994804e-05, + "loss": 1.8815, + "mean_token_accuracy": 0.5703577995300293, + "num_tokens": 3674332900.0, + "step": 7187 + }, + { + "epoch": 1.9437533802055165, + "grad_norm": 1.6264262199401855, + "learning_rate": 1.4580045139212295e-05, + "loss": 1.8757, + "mean_token_accuracy": 0.5627188086509705, + "num_tokens": 3674857138.0, + "step": 7188 + }, + { + "epoch": 1.944023796646836, + "grad_norm": 1.8327642679214478, + "learning_rate": 1.4578598687583472e-05, + "loss": 1.9287, + "mean_token_accuracy": 0.5579538345336914, + "num_tokens": 3675381406.0, + "step": 7189 + }, + { + "epoch": 1.9442942130881558, + "grad_norm": 1.9646515846252441, + "learning_rate": 1.4577152126152717e-05, + "loss": 2.0255, + "mean_token_accuracy": 0.5487996339797974, + "num_tokens": 3675905655.0, + "step": 7190 + }, + { + "epoch": 1.9445646295294754, + "grad_norm": 1.7940243482589722, + "learning_rate": 1.4575705454964414e-05, + "loss": 1.892, + "mean_token_accuracy": 0.5633333921432495, + "num_tokens": 3676429931.0, + "step": 7191 + }, + { + "epoch": 1.944835045970795, + "grad_norm": 1.7870125770568848, + "learning_rate": 1.4574258674062952e-05, + "loss": 1.9038, + "mean_token_accuracy": 0.5457886457443237, + "num_tokens": 3676954083.0, + "step": 7192 + }, + { + "epoch": 1.9451054624121147, + "grad_norm": 1.932020902633667, + "learning_rate": 1.4572811783492724e-05, + "loss": 1.8513, + "mean_token_accuracy": 0.5818764567375183, + "num_tokens": 3677478267.0, + "step": 7193 + }, + { + "epoch": 1.9453758788534343, + "grad_norm": 1.4878957271575928, + "learning_rate": 1.4571364783298121e-05, + "loss": 1.8031, + "mean_token_accuracy": 0.5727618932723999, + "num_tokens": 3678002520.0, + "step": 7194 + }, + { + "epoch": 1.945646295294754, + "grad_norm": 1.2129487991333008, + "learning_rate": 1.4569917673523544e-05, + "loss": 1.9569, + "mean_token_accuracy": 0.5387670993804932, + "num_tokens": 3678523656.0, + "step": 7195 + }, + { + "epoch": 1.9459167117360736, + "grad_norm": 1.4002047777175903, + "learning_rate": 1.4568470454213399e-05, + "loss": 1.9304, + "mean_token_accuracy": 0.5416987538337708, + "num_tokens": 3678978282.0, + "step": 7196 + }, + { + "epoch": 1.9461871281773933, + "grad_norm": 1.4315664768218994, + "learning_rate": 1.456702312541208e-05, + "loss": 1.9103, + "mean_token_accuracy": 0.5711801052093506, + "num_tokens": 3679462999.0, + "step": 7197 + }, + { + "epoch": 1.946457544618713, + "grad_norm": 1.1226170063018799, + "learning_rate": 1.4565575687164006e-05, + "loss": 1.7516, + "mean_token_accuracy": 0.6160706281661987, + "num_tokens": 3679936606.0, + "step": 7198 + }, + { + "epoch": 1.9467279610600325, + "grad_norm": 1.4923077821731567, + "learning_rate": 1.4564128139513582e-05, + "loss": 2.0568, + "mean_token_accuracy": 0.5368525981903076, + "num_tokens": 3680459571.0, + "step": 7199 + }, + { + "epoch": 1.9469983775013522, + "grad_norm": 1.4132349491119385, + "learning_rate": 1.4562680482505224e-05, + "loss": 1.8764, + "mean_token_accuracy": 0.5528345108032227, + "num_tokens": 3680983821.0, + "step": 7200 + }, + { + "epoch": 1.9472687939426718, + "grad_norm": 0.6264340877532959, + "learning_rate": 1.4561232716183355e-05, + "loss": 1.1791, + "mean_token_accuracy": 0.6860384941101074, + "num_tokens": 3681508097.0, + "step": 7201 + }, + { + "epoch": 1.9475392103839915, + "grad_norm": 1.6064194440841675, + "learning_rate": 1.4559784840592392e-05, + "loss": 1.971, + "mean_token_accuracy": 0.5422090888023376, + "num_tokens": 3682032366.0, + "step": 7202 + }, + { + "epoch": 1.9478096268253111, + "grad_norm": 1.267456293106079, + "learning_rate": 1.4558336855776763e-05, + "loss": 2.0159, + "mean_token_accuracy": 0.5450783967971802, + "num_tokens": 3682556570.0, + "step": 7203 + }, + { + "epoch": 1.9480800432666308, + "grad_norm": 1.304603099822998, + "learning_rate": 1.4556888761780892e-05, + "loss": 1.9577, + "mean_token_accuracy": 0.5583885908126831, + "num_tokens": 3683055528.0, + "step": 7204 + }, + { + "epoch": 1.9483504597079504, + "grad_norm": 1.1852012872695923, + "learning_rate": 1.4555440558649214e-05, + "loss": 2.0302, + "mean_token_accuracy": 0.5417975783348083, + "num_tokens": 3683579787.0, + "step": 7205 + }, + { + "epoch": 1.9486208761492698, + "grad_norm": 1.3202991485595703, + "learning_rate": 1.4553992246426165e-05, + "loss": 1.979, + "mean_token_accuracy": 0.5508449673652649, + "num_tokens": 3684103861.0, + "step": 7206 + }, + { + "epoch": 1.9488912925905895, + "grad_norm": 1.4967830181121826, + "learning_rate": 1.4552543825156183e-05, + "loss": 2.0516, + "mean_token_accuracy": 0.5297850370407104, + "num_tokens": 3684628127.0, + "step": 7207 + }, + { + "epoch": 1.949161709031909, + "grad_norm": 1.0802384614944458, + "learning_rate": 1.4551095294883707e-05, + "loss": 2.0052, + "mean_token_accuracy": 0.5419826507568359, + "num_tokens": 3685130213.0, + "step": 7208 + }, + { + "epoch": 1.9494321254732287, + "grad_norm": 1.1567976474761963, + "learning_rate": 1.4549646655653186e-05, + "loss": 1.9252, + "mean_token_accuracy": 0.5924507975578308, + "num_tokens": 3685589798.0, + "step": 7209 + }, + { + "epoch": 1.9497025419145484, + "grad_norm": 1.1235278844833374, + "learning_rate": 1.4548197907509068e-05, + "loss": 1.8427, + "mean_token_accuracy": 0.5791993737220764, + "num_tokens": 3686114019.0, + "step": 7210 + }, + { + "epoch": 1.949972958355868, + "grad_norm": 1.128021478652954, + "learning_rate": 1.4546749050495801e-05, + "loss": 2.0031, + "mean_token_accuracy": 0.5566071271896362, + "num_tokens": 3686638257.0, + "step": 7211 + }, + { + "epoch": 1.9502433747971877, + "grad_norm": 1.1567124128341675, + "learning_rate": 1.4545300084657842e-05, + "loss": 1.8995, + "mean_token_accuracy": 0.5753132104873657, + "num_tokens": 3687064563.0, + "step": 7212 + }, + { + "epoch": 1.9505137912385073, + "grad_norm": 1.0910181999206543, + "learning_rate": 1.4543851010039652e-05, + "loss": 1.9316, + "mean_token_accuracy": 0.5594725012779236, + "num_tokens": 3687588834.0, + "step": 7213 + }, + { + "epoch": 1.950784207679827, + "grad_norm": 1.1141531467437744, + "learning_rate": 1.454240182668569e-05, + "loss": 1.7784, + "mean_token_accuracy": 0.5622196197509766, + "num_tokens": 3688113002.0, + "step": 7214 + }, + { + "epoch": 1.9510546241211466, + "grad_norm": 1.2346234321594238, + "learning_rate": 1.4540952534640427e-05, + "loss": 1.9194, + "mean_token_accuracy": 0.577316403388977, + "num_tokens": 3688623920.0, + "step": 7215 + }, + { + "epoch": 1.9513250405624663, + "grad_norm": 1.2583258152008057, + "learning_rate": 1.4539503133948325e-05, + "loss": 1.9312, + "mean_token_accuracy": 0.5531030893325806, + "num_tokens": 3689147996.0, + "step": 7216 + }, + { + "epoch": 1.9515954570037857, + "grad_norm": 1.0364949703216553, + "learning_rate": 1.4538053624653856e-05, + "loss": 2.0179, + "mean_token_accuracy": 0.5606313943862915, + "num_tokens": 3689672217.0, + "step": 7217 + }, + { + "epoch": 1.9518658734451053, + "grad_norm": 1.177626371383667, + "learning_rate": 1.4536604006801498e-05, + "loss": 2.0103, + "mean_token_accuracy": 0.5450558662414551, + "num_tokens": 3690133697.0, + "step": 7218 + }, + { + "epoch": 1.952136289886425, + "grad_norm": 1.4477567672729492, + "learning_rate": 1.4535154280435726e-05, + "loss": 1.9514, + "mean_token_accuracy": 0.5643566846847534, + "num_tokens": 3690606135.0, + "step": 7219 + }, + { + "epoch": 1.9524067063277446, + "grad_norm": 1.4569326639175415, + "learning_rate": 1.4533704445601025e-05, + "loss": 1.7864, + "mean_token_accuracy": 0.5951300859451294, + "num_tokens": 3691086986.0, + "step": 7220 + }, + { + "epoch": 1.9526771227690642, + "grad_norm": 0.5316628813743591, + "learning_rate": 1.453225450234188e-05, + "loss": 1.1215, + "mean_token_accuracy": 0.7059085965156555, + "num_tokens": 3691597495.0, + "step": 7221 + }, + { + "epoch": 1.9529475392103839, + "grad_norm": 2.8628408908843994, + "learning_rate": 1.4530804450702783e-05, + "loss": 1.9656, + "mean_token_accuracy": 0.565735936164856, + "num_tokens": 3692101349.0, + "step": 7222 + }, + { + "epoch": 1.9532179556517035, + "grad_norm": 1.8425755500793457, + "learning_rate": 1.4529354290728219e-05, + "loss": 1.8704, + "mean_token_accuracy": 0.5539523363113403, + "num_tokens": 3692625520.0, + "step": 7223 + }, + { + "epoch": 1.9534883720930232, + "grad_norm": 1.1610218286514282, + "learning_rate": 1.4527904022462686e-05, + "loss": 1.839, + "mean_token_accuracy": 0.5848479270935059, + "num_tokens": 3693124420.0, + "step": 7224 + }, + { + "epoch": 1.9537587885343428, + "grad_norm": 2.207559108734131, + "learning_rate": 1.4526453645950684e-05, + "loss": 1.9127, + "mean_token_accuracy": 0.5658851265907288, + "num_tokens": 3693648698.0, + "step": 7225 + }, + { + "epoch": 1.9540292049756625, + "grad_norm": 2.40437388420105, + "learning_rate": 1.4525003161236715e-05, + "loss": 1.9275, + "mean_token_accuracy": 0.545017659664154, + "num_tokens": 3694172965.0, + "step": 7226 + }, + { + "epoch": 1.954299621416982, + "grad_norm": 1.6980774402618408, + "learning_rate": 1.4523552568365277e-05, + "loss": 1.9125, + "mean_token_accuracy": 0.5498625636100769, + "num_tokens": 3694697142.0, + "step": 7227 + }, + { + "epoch": 1.9545700378583017, + "grad_norm": 1.658019781112671, + "learning_rate": 1.4522101867380888e-05, + "loss": 1.8834, + "mean_token_accuracy": 0.5512463450431824, + "num_tokens": 3695221411.0, + "step": 7228 + }, + { + "epoch": 1.9548404542996214, + "grad_norm": 2.4072964191436768, + "learning_rate": 1.4520651058328057e-05, + "loss": 1.9236, + "mean_token_accuracy": 0.5616545677185059, + "num_tokens": 3695745410.0, + "step": 7229 + }, + { + "epoch": 1.955110870740941, + "grad_norm": 2.2314045429229736, + "learning_rate": 1.4519200141251297e-05, + "loss": 1.9726, + "mean_token_accuracy": 0.5725030899047852, + "num_tokens": 3696261736.0, + "step": 7230 + }, + { + "epoch": 1.9553812871822607, + "grad_norm": 1.5331854820251465, + "learning_rate": 1.4517749116195125e-05, + "loss": 1.8753, + "mean_token_accuracy": 0.5781505703926086, + "num_tokens": 3696747037.0, + "step": 7231 + }, + { + "epoch": 1.9556517036235803, + "grad_norm": 1.747310996055603, + "learning_rate": 1.4516297983204068e-05, + "loss": 1.9207, + "mean_token_accuracy": 0.5691751837730408, + "num_tokens": 3697242524.0, + "step": 7232 + }, + { + "epoch": 1.9559221200649, + "grad_norm": 1.675622582435608, + "learning_rate": 1.4514846742322647e-05, + "loss": 1.9026, + "mean_token_accuracy": 0.5647097229957581, + "num_tokens": 3697766775.0, + "step": 7233 + }, + { + "epoch": 1.9561925365062196, + "grad_norm": 1.451121211051941, + "learning_rate": 1.4513395393595393e-05, + "loss": 1.93, + "mean_token_accuracy": 0.55352783203125, + "num_tokens": 3698291050.0, + "step": 7234 + }, + { + "epoch": 1.9564629529475392, + "grad_norm": 1.524683952331543, + "learning_rate": 1.4511943937066832e-05, + "loss": 1.9157, + "mean_token_accuracy": 0.5593577027320862, + "num_tokens": 3698815168.0, + "step": 7235 + }, + { + "epoch": 1.956733369388859, + "grad_norm": 1.5157665014266968, + "learning_rate": 1.4510492372781507e-05, + "loss": 1.8569, + "mean_token_accuracy": 0.5785093307495117, + "num_tokens": 3699337064.0, + "step": 7236 + }, + { + "epoch": 1.9570037858301785, + "grad_norm": 1.4314029216766357, + "learning_rate": 1.4509040700783951e-05, + "loss": 1.9596, + "mean_token_accuracy": 0.5442414283752441, + "num_tokens": 3699861112.0, + "step": 7237 + }, + { + "epoch": 1.9572742022714982, + "grad_norm": 1.6954586505889893, + "learning_rate": 1.4507588921118706e-05, + "loss": 1.9183, + "mean_token_accuracy": 0.5579076409339905, + "num_tokens": 3700329212.0, + "step": 7238 + }, + { + "epoch": 1.9575446187128178, + "grad_norm": 1.2003138065338135, + "learning_rate": 1.4506137033830321e-05, + "loss": 2.0629, + "mean_token_accuracy": 0.5460188388824463, + "num_tokens": 3700841157.0, + "step": 7239 + }, + { + "epoch": 1.9578150351541375, + "grad_norm": 1.4433051347732544, + "learning_rate": 1.4504685038963337e-05, + "loss": 1.9099, + "mean_token_accuracy": 0.6056908369064331, + "num_tokens": 3701232543.0, + "step": 7240 + }, + { + "epoch": 1.958085451595457, + "grad_norm": 0.44104689359664917, + "learning_rate": 1.450323293656231e-05, + "loss": 0.9336, + "mean_token_accuracy": 0.7481114864349365, + "num_tokens": 3701733432.0, + "step": 7241 + }, + { + "epoch": 1.9583558680367767, + "grad_norm": 2.1863746643066406, + "learning_rate": 1.4501780726671797e-05, + "loss": 1.9131, + "mean_token_accuracy": 0.5681707262992859, + "num_tokens": 3702257636.0, + "step": 7242 + }, + { + "epoch": 1.9586262844780964, + "grad_norm": 1.6086055040359497, + "learning_rate": 1.450032840933635e-05, + "loss": 1.8933, + "mean_token_accuracy": 0.581702709197998, + "num_tokens": 3702781817.0, + "step": 7243 + }, + { + "epoch": 1.958896700919416, + "grad_norm": 1.4042435884475708, + "learning_rate": 1.4498875984600535e-05, + "loss": 1.9004, + "mean_token_accuracy": 0.5661256909370422, + "num_tokens": 3703305990.0, + "step": 7244 + }, + { + "epoch": 1.9591671173607357, + "grad_norm": 1.4680131673812866, + "learning_rate": 1.4497423452508916e-05, + "loss": 1.9198, + "mean_token_accuracy": 0.5610033273696899, + "num_tokens": 3703830182.0, + "step": 7245 + }, + { + "epoch": 1.9594375338020553, + "grad_norm": 1.6425015926361084, + "learning_rate": 1.4495970813106062e-05, + "loss": 1.9491, + "mean_token_accuracy": 0.5609904527664185, + "num_tokens": 3704354402.0, + "step": 7246 + }, + { + "epoch": 1.959707950243375, + "grad_norm": 1.348545789718628, + "learning_rate": 1.4494518066436539e-05, + "loss": 1.928, + "mean_token_accuracy": 0.5663983821868896, + "num_tokens": 3704828565.0, + "step": 7247 + }, + { + "epoch": 1.9599783666846944, + "grad_norm": 1.481458067893982, + "learning_rate": 1.4493065212544925e-05, + "loss": 1.8644, + "mean_token_accuracy": 0.5500506162643433, + "num_tokens": 3705295395.0, + "step": 7248 + }, + { + "epoch": 1.960248783126014, + "grad_norm": 1.8916996717453003, + "learning_rate": 1.4491612251475798e-05, + "loss": 1.9338, + "mean_token_accuracy": 0.5700520277023315, + "num_tokens": 3705819614.0, + "step": 7249 + }, + { + "epoch": 1.9605191995673337, + "grad_norm": 1.4035930633544922, + "learning_rate": 1.449015918327374e-05, + "loss": 1.9552, + "mean_token_accuracy": 0.5687218308448792, + "num_tokens": 3706343763.0, + "step": 7250 + }, + { + "epoch": 1.9607896160086533, + "grad_norm": 1.6262112855911255, + "learning_rate": 1.4488706007983335e-05, + "loss": 1.8665, + "mean_token_accuracy": 0.5758459568023682, + "num_tokens": 3706868041.0, + "step": 7251 + }, + { + "epoch": 1.961060032449973, + "grad_norm": 1.6415451765060425, + "learning_rate": 1.448725272564917e-05, + "loss": 1.9517, + "mean_token_accuracy": 0.5557676553726196, + "num_tokens": 3707392308.0, + "step": 7252 + }, + { + "epoch": 1.9613304488912926, + "grad_norm": 1.4489147663116455, + "learning_rate": 1.4485799336315833e-05, + "loss": 1.9785, + "mean_token_accuracy": 0.5638347268104553, + "num_tokens": 3707916584.0, + "step": 7253 + }, + { + "epoch": 1.9616008653326122, + "grad_norm": 1.5808093547821045, + "learning_rate": 1.4484345840027923e-05, + "loss": 1.867, + "mean_token_accuracy": 0.5846140384674072, + "num_tokens": 3708410658.0, + "step": 7254 + }, + { + "epoch": 1.9618712817739319, + "grad_norm": 1.1944814920425415, + "learning_rate": 1.4482892236830037e-05, + "loss": 1.8672, + "mean_token_accuracy": 0.5569279193878174, + "num_tokens": 3708934894.0, + "step": 7255 + }, + { + "epoch": 1.9621416982152515, + "grad_norm": 1.3052210807800293, + "learning_rate": 1.4481438526766773e-05, + "loss": 1.958, + "mean_token_accuracy": 0.5605963468551636, + "num_tokens": 3709459048.0, + "step": 7256 + }, + { + "epoch": 1.9624121146565712, + "grad_norm": 1.2163676023483276, + "learning_rate": 1.447998470988274e-05, + "loss": 1.6962, + "mean_token_accuracy": 0.6039486527442932, + "num_tokens": 3709983167.0, + "step": 7257 + }, + { + "epoch": 1.9626825310978906, + "grad_norm": 1.3341563940048218, + "learning_rate": 1.447853078622254e-05, + "loss": 2.0351, + "mean_token_accuracy": 0.5374962091445923, + "num_tokens": 3710507294.0, + "step": 7258 + }, + { + "epoch": 1.9629529475392102, + "grad_norm": 1.22474205493927, + "learning_rate": 1.4477076755830785e-05, + "loss": 2.0012, + "mean_token_accuracy": 0.5499098300933838, + "num_tokens": 3711031433.0, + "step": 7259 + }, + { + "epoch": 1.9632233639805299, + "grad_norm": 1.1745645999908447, + "learning_rate": 1.447562261875209e-05, + "loss": 2.0104, + "mean_token_accuracy": 0.5813146233558655, + "num_tokens": 3711474188.0, + "step": 7260 + }, + { + "epoch": 1.9634937804218495, + "grad_norm": 0.5496718287467957, + "learning_rate": 1.4474168375031075e-05, + "loss": 1.0976, + "mean_token_accuracy": 0.7073835134506226, + "num_tokens": 3711948449.0, + "step": 7261 + }, + { + "epoch": 1.9637641968631692, + "grad_norm": 1.874534249305725, + "learning_rate": 1.4472714024712353e-05, + "loss": 1.8166, + "mean_token_accuracy": 0.5819416046142578, + "num_tokens": 3712472629.0, + "step": 7262 + }, + { + "epoch": 1.9640346133044888, + "grad_norm": 1.6367740631103516, + "learning_rate": 1.4471259567840552e-05, + "loss": 1.9944, + "mean_token_accuracy": 0.5472825169563293, + "num_tokens": 3712979380.0, + "step": 7263 + }, + { + "epoch": 1.9643050297458085, + "grad_norm": 1.1423976421356201, + "learning_rate": 1.4469805004460303e-05, + "loss": 1.8567, + "mean_token_accuracy": 0.5816172361373901, + "num_tokens": 3713428472.0, + "step": 7264 + }, + { + "epoch": 1.964575446187128, + "grad_norm": 1.6253856420516968, + "learning_rate": 1.446835033461623e-05, + "loss": 2.0776, + "mean_token_accuracy": 0.5648484826087952, + "num_tokens": 3713952750.0, + "step": 7265 + }, + { + "epoch": 1.9648458626284477, + "grad_norm": 1.738725185394287, + "learning_rate": 1.4466895558352968e-05, + "loss": 1.9435, + "mean_token_accuracy": 0.5432838201522827, + "num_tokens": 3714476965.0, + "step": 7266 + }, + { + "epoch": 1.9651162790697674, + "grad_norm": 1.3762532472610474, + "learning_rate": 1.4465440675715158e-05, + "loss": 1.9493, + "mean_token_accuracy": 0.5523836612701416, + "num_tokens": 3715001246.0, + "step": 7267 + }, + { + "epoch": 1.965386695511087, + "grad_norm": 1.4199414253234863, + "learning_rate": 1.4463985686747435e-05, + "loss": 2.0893, + "mean_token_accuracy": 0.5326688289642334, + "num_tokens": 3715525470.0, + "step": 7268 + }, + { + "epoch": 1.9656571119524067, + "grad_norm": 1.2912331819534302, + "learning_rate": 1.4462530591494443e-05, + "loss": 1.9684, + "mean_token_accuracy": 0.5491343140602112, + "num_tokens": 3716049722.0, + "step": 7269 + }, + { + "epoch": 1.9659275283937263, + "grad_norm": 1.487537145614624, + "learning_rate": 1.4461075390000831e-05, + "loss": 1.9919, + "mean_token_accuracy": 0.5483629703521729, + "num_tokens": 3716573970.0, + "step": 7270 + }, + { + "epoch": 1.966197944835046, + "grad_norm": 1.2053253650665283, + "learning_rate": 1.445962008231125e-05, + "loss": 1.9001, + "mean_token_accuracy": 0.5694950819015503, + "num_tokens": 3717088873.0, + "step": 7271 + }, + { + "epoch": 1.9664683612763656, + "grad_norm": 1.2427889108657837, + "learning_rate": 1.4458164668470346e-05, + "loss": 1.9874, + "mean_token_accuracy": 0.554154098033905, + "num_tokens": 3717593329.0, + "step": 7272 + }, + { + "epoch": 1.9667387777176852, + "grad_norm": 1.1828001737594604, + "learning_rate": 1.4456709148522785e-05, + "loss": 1.8247, + "mean_token_accuracy": 0.5763049721717834, + "num_tokens": 3718093675.0, + "step": 7273 + }, + { + "epoch": 1.9670091941590049, + "grad_norm": 1.117572546005249, + "learning_rate": 1.4455253522513219e-05, + "loss": 1.7748, + "mean_token_accuracy": 0.5964899659156799, + "num_tokens": 3718617955.0, + "step": 7274 + }, + { + "epoch": 1.9672796106003245, + "grad_norm": 1.0797010660171509, + "learning_rate": 1.4453797790486314e-05, + "loss": 1.8411, + "mean_token_accuracy": 0.5821281671524048, + "num_tokens": 3719134697.0, + "step": 7275 + }, + { + "epoch": 1.9675500270416442, + "grad_norm": 1.0009982585906982, + "learning_rate": 1.4452341952486738e-05, + "loss": 1.9927, + "mean_token_accuracy": 0.5589238405227661, + "num_tokens": 3719606023.0, + "step": 7276 + }, + { + "epoch": 1.9678204434829638, + "grad_norm": 1.160723090171814, + "learning_rate": 1.4450886008559155e-05, + "loss": 2.0177, + "mean_token_accuracy": 0.5340909957885742, + "num_tokens": 3720130309.0, + "step": 7277 + }, + { + "epoch": 1.9680908599242835, + "grad_norm": 1.1326520442962646, + "learning_rate": 1.444942995874824e-05, + "loss": 1.9083, + "mean_token_accuracy": 0.5697405934333801, + "num_tokens": 3720654415.0, + "step": 7278 + }, + { + "epoch": 1.968361276365603, + "grad_norm": 1.1362513303756714, + "learning_rate": 1.4447973803098674e-05, + "loss": 1.8344, + "mean_token_accuracy": 0.5760136842727661, + "num_tokens": 3721178557.0, + "step": 7279 + }, + { + "epoch": 1.9686316928069227, + "grad_norm": 1.5567779541015625, + "learning_rate": 1.4446517541655128e-05, + "loss": 2.0033, + "mean_token_accuracy": 0.5660480856895447, + "num_tokens": 3721639146.0, + "step": 7280 + }, + { + "epoch": 1.9689021092482424, + "grad_norm": 0.5108099579811096, + "learning_rate": 1.4445061174462292e-05, + "loss": 1.1567, + "mean_token_accuracy": 0.6840643882751465, + "num_tokens": 3722163415.0, + "step": 7281 + }, + { + "epoch": 1.969172525689562, + "grad_norm": 1.2354058027267456, + "learning_rate": 1.4443604701564847e-05, + "loss": 1.8407, + "mean_token_accuracy": 0.5910016894340515, + "num_tokens": 3722676133.0, + "step": 7282 + }, + { + "epoch": 1.9694429421308817, + "grad_norm": 1.38231360912323, + "learning_rate": 1.444214812300748e-05, + "loss": 1.893, + "mean_token_accuracy": 0.5746456384658813, + "num_tokens": 3723200229.0, + "step": 7283 + }, + { + "epoch": 1.9697133585722013, + "grad_norm": 1.2472907304763794, + "learning_rate": 1.4440691438834886e-05, + "loss": 1.9428, + "mean_token_accuracy": 0.5755026936531067, + "num_tokens": 3723688616.0, + "step": 7284 + }, + { + "epoch": 1.969983775013521, + "grad_norm": 1.1959381103515625, + "learning_rate": 1.4439234649091763e-05, + "loss": 1.8996, + "mean_token_accuracy": 0.5753424167633057, + "num_tokens": 3724212799.0, + "step": 7285 + }, + { + "epoch": 1.9702541914548406, + "grad_norm": 1.0850321054458618, + "learning_rate": 1.4437777753822804e-05, + "loss": 1.7595, + "mean_token_accuracy": 0.5993348956108093, + "num_tokens": 3724710468.0, + "step": 7286 + }, + { + "epoch": 1.9705246078961602, + "grad_norm": 1.201456904411316, + "learning_rate": 1.4436320753072718e-05, + "loss": 1.8998, + "mean_token_accuracy": 0.5604528784751892, + "num_tokens": 3725234679.0, + "step": 7287 + }, + { + "epoch": 1.9707950243374799, + "grad_norm": 1.1544088125228882, + "learning_rate": 1.4434863646886204e-05, + "loss": 1.9472, + "mean_token_accuracy": 0.5542842149734497, + "num_tokens": 3725758910.0, + "step": 7288 + }, + { + "epoch": 1.9710654407787993, + "grad_norm": 1.3709568977355957, + "learning_rate": 1.4433406435307971e-05, + "loss": 1.9181, + "mean_token_accuracy": 0.5408703684806824, + "num_tokens": 3726283168.0, + "step": 7289 + }, + { + "epoch": 1.971335857220119, + "grad_norm": 1.312018871307373, + "learning_rate": 1.4431949118382731e-05, + "loss": 1.9628, + "mean_token_accuracy": 0.5420838594436646, + "num_tokens": 3726807445.0, + "step": 7290 + }, + { + "epoch": 1.9716062736614386, + "grad_norm": 1.2578763961791992, + "learning_rate": 1.44304916961552e-05, + "loss": 2.0157, + "mean_token_accuracy": 0.5465792417526245, + "num_tokens": 3727331697.0, + "step": 7291 + }, + { + "epoch": 1.9718766901027582, + "grad_norm": 1.250697135925293, + "learning_rate": 1.4429034168670096e-05, + "loss": 1.8972, + "mean_token_accuracy": 0.5724869966506958, + "num_tokens": 3727824451.0, + "step": 7292 + }, + { + "epoch": 1.9721471065440779, + "grad_norm": 1.1491897106170654, + "learning_rate": 1.4427576535972137e-05, + "loss": 2.0046, + "mean_token_accuracy": 0.5584590435028076, + "num_tokens": 3728348661.0, + "step": 7293 + }, + { + "epoch": 1.9724175229853975, + "grad_norm": 1.3457331657409668, + "learning_rate": 1.4426118798106054e-05, + "loss": 1.8352, + "mean_token_accuracy": 0.5831274390220642, + "num_tokens": 3728868165.0, + "step": 7294 + }, + { + "epoch": 1.9726879394267172, + "grad_norm": 1.4611599445343018, + "learning_rate": 1.4424660955116569e-05, + "loss": 1.9873, + "mean_token_accuracy": 0.5569179058074951, + "num_tokens": 3729392415.0, + "step": 7295 + }, + { + "epoch": 1.9729583558680368, + "grad_norm": 1.0614449977874756, + "learning_rate": 1.4423203007048415e-05, + "loss": 1.8584, + "mean_token_accuracy": 0.5802679061889648, + "num_tokens": 3729916685.0, + "step": 7296 + }, + { + "epoch": 1.9732287723093564, + "grad_norm": 1.3869723081588745, + "learning_rate": 1.4421744953946322e-05, + "loss": 1.7378, + "mean_token_accuracy": 0.5965163707733154, + "num_tokens": 3730440770.0, + "step": 7297 + }, + { + "epoch": 1.973499188750676, + "grad_norm": 1.2993526458740234, + "learning_rate": 1.4420286795855033e-05, + "loss": 1.9473, + "mean_token_accuracy": 0.5752806663513184, + "num_tokens": 3730964939.0, + "step": 7298 + }, + { + "epoch": 1.9737696051919955, + "grad_norm": 1.168695092201233, + "learning_rate": 1.4418828532819289e-05, + "loss": 1.8364, + "mean_token_accuracy": 0.5916271209716797, + "num_tokens": 3731428005.0, + "step": 7299 + }, + { + "epoch": 1.9740400216333152, + "grad_norm": 1.1657161712646484, + "learning_rate": 1.441737016488383e-05, + "loss": 1.9441, + "mean_token_accuracy": 0.5437393188476562, + "num_tokens": 3731952169.0, + "step": 7300 + }, + { + "epoch": 1.9743104380746348, + "grad_norm": 0.5402176976203918, + "learning_rate": 1.4415911692093407e-05, + "loss": 1.124, + "mean_token_accuracy": 0.6979447603225708, + "num_tokens": 3732428546.0, + "step": 7301 + }, + { + "epoch": 1.9745808545159544, + "grad_norm": 1.3284331560134888, + "learning_rate": 1.4414453114492766e-05, + "loss": 1.8889, + "mean_token_accuracy": 0.569427490234375, + "num_tokens": 3732910551.0, + "step": 7302 + }, + { + "epoch": 1.974851270957274, + "grad_norm": 1.421549677848816, + "learning_rate": 1.441299443212666e-05, + "loss": 2.0291, + "mean_token_accuracy": 0.5405470728874207, + "num_tokens": 3733434790.0, + "step": 7303 + }, + { + "epoch": 1.9751216873985937, + "grad_norm": 1.1495716571807861, + "learning_rate": 1.4411535645039852e-05, + "loss": 1.9142, + "mean_token_accuracy": 0.5791112780570984, + "num_tokens": 3733935952.0, + "step": 7304 + }, + { + "epoch": 1.9753921038399134, + "grad_norm": 1.2823426723480225, + "learning_rate": 1.4410076753277093e-05, + "loss": 2.0688, + "mean_token_accuracy": 0.5197961926460266, + "num_tokens": 3734460200.0, + "step": 7305 + }, + { + "epoch": 1.975662520281233, + "grad_norm": 1.2979916334152222, + "learning_rate": 1.4408617756883154e-05, + "loss": 1.9265, + "mean_token_accuracy": 0.5535776615142822, + "num_tokens": 3734984328.0, + "step": 7306 + }, + { + "epoch": 1.9759329367225527, + "grad_norm": 1.1265203952789307, + "learning_rate": 1.4407158655902797e-05, + "loss": 1.9277, + "mean_token_accuracy": 0.5673449039459229, + "num_tokens": 3735508524.0, + "step": 7307 + }, + { + "epoch": 1.9762033531638723, + "grad_norm": 1.2642550468444824, + "learning_rate": 1.4405699450380795e-05, + "loss": 1.8473, + "mean_token_accuracy": 0.5635430216789246, + "num_tokens": 3735978552.0, + "step": 7308 + }, + { + "epoch": 1.976473769605192, + "grad_norm": 1.3683491945266724, + "learning_rate": 1.4404240140361916e-05, + "loss": 1.8814, + "mean_token_accuracy": 0.5801171064376831, + "num_tokens": 3736502827.0, + "step": 7309 + }, + { + "epoch": 1.9767441860465116, + "grad_norm": 1.2949018478393555, + "learning_rate": 1.440278072589094e-05, + "loss": 1.9565, + "mean_token_accuracy": 0.5515941381454468, + "num_tokens": 3737026945.0, + "step": 7310 + }, + { + "epoch": 1.9770146024878312, + "grad_norm": 1.3982526063919067, + "learning_rate": 1.4401321207012641e-05, + "loss": 2.0247, + "mean_token_accuracy": 0.5392458438873291, + "num_tokens": 3737551209.0, + "step": 7311 + }, + { + "epoch": 1.9772850189291509, + "grad_norm": 1.1594849824905396, + "learning_rate": 1.4399861583771806e-05, + "loss": 2.031, + "mean_token_accuracy": 0.5383992195129395, + "num_tokens": 3738027998.0, + "step": 7312 + }, + { + "epoch": 1.9775554353704705, + "grad_norm": 1.3525797128677368, + "learning_rate": 1.439840185621322e-05, + "loss": 1.7354, + "mean_token_accuracy": 0.6224122047424316, + "num_tokens": 3738549884.0, + "step": 7313 + }, + { + "epoch": 1.9778258518117902, + "grad_norm": 0.935967743396759, + "learning_rate": 1.439694202438167e-05, + "loss": 1.9418, + "mean_token_accuracy": 0.5612446069717407, + "num_tokens": 3739074063.0, + "step": 7314 + }, + { + "epoch": 1.9780962682531098, + "grad_norm": 1.4578731060028076, + "learning_rate": 1.4395482088321952e-05, + "loss": 2.0372, + "mean_token_accuracy": 0.5775456428527832, + "num_tokens": 3739532882.0, + "step": 7315 + }, + { + "epoch": 1.9783666846944294, + "grad_norm": 1.0250862836837769, + "learning_rate": 1.4394022048078857e-05, + "loss": 1.9433, + "mean_token_accuracy": 0.5561901330947876, + "num_tokens": 3740057154.0, + "step": 7316 + }, + { + "epoch": 1.978637101135749, + "grad_norm": 1.2717626094818115, + "learning_rate": 1.4392561903697185e-05, + "loss": 1.9998, + "mean_token_accuracy": 0.553646981716156, + "num_tokens": 3740581312.0, + "step": 7317 + }, + { + "epoch": 1.9789075175770687, + "grad_norm": 1.306705117225647, + "learning_rate": 1.4391101655221735e-05, + "loss": 2.1096, + "mean_token_accuracy": 0.5486220121383667, + "num_tokens": 3741105575.0, + "step": 7318 + }, + { + "epoch": 1.9791779340183884, + "grad_norm": 1.1406071186065674, + "learning_rate": 1.4389641302697313e-05, + "loss": 1.8189, + "mean_token_accuracy": 0.5917171239852905, + "num_tokens": 3741629703.0, + "step": 7319 + }, + { + "epoch": 1.979448350459708, + "grad_norm": 1.2837563753128052, + "learning_rate": 1.4388180846168729e-05, + "loss": 1.9771, + "mean_token_accuracy": 0.538264274597168, + "num_tokens": 3742153956.0, + "step": 7320 + }, + { + "epoch": 1.9797187669010277, + "grad_norm": 0.6347360014915466, + "learning_rate": 1.4386720285680791e-05, + "loss": 1.2374, + "mean_token_accuracy": 0.6796072721481323, + "num_tokens": 3742638014.0, + "step": 7321 + }, + { + "epoch": 1.9799891833423473, + "grad_norm": 2.0790419578552246, + "learning_rate": 1.4385259621278316e-05, + "loss": 2.0201, + "mean_token_accuracy": 0.555410623550415, + "num_tokens": 3743102592.0, + "step": 7322 + }, + { + "epoch": 1.980259599783667, + "grad_norm": 1.9242111444473267, + "learning_rate": 1.438379885300612e-05, + "loss": 2.1365, + "mean_token_accuracy": 0.5161540508270264, + "num_tokens": 3743626852.0, + "step": 7323 + }, + { + "epoch": 1.9805300162249866, + "grad_norm": 1.162039041519165, + "learning_rate": 1.4382337980909026e-05, + "loss": 1.9553, + "mean_token_accuracy": 0.545758843421936, + "num_tokens": 3744151091.0, + "step": 7324 + }, + { + "epoch": 1.9808004326663062, + "grad_norm": 1.3487666845321655, + "learning_rate": 1.4380877005031855e-05, + "loss": 1.9629, + "mean_token_accuracy": 0.5506488084793091, + "num_tokens": 3744675274.0, + "step": 7325 + }, + { + "epoch": 1.9810708491076259, + "grad_norm": 1.732426643371582, + "learning_rate": 1.4379415925419434e-05, + "loss": 1.9305, + "mean_token_accuracy": 0.5699107646942139, + "num_tokens": 3745147650.0, + "step": 7326 + }, + { + "epoch": 1.9813412655489455, + "grad_norm": 1.459196925163269, + "learning_rate": 1.4377954742116593e-05, + "loss": 1.8945, + "mean_token_accuracy": 0.5695545673370361, + "num_tokens": 3745671924.0, + "step": 7327 + }, + { + "epoch": 1.9816116819902652, + "grad_norm": 1.139342188835144, + "learning_rate": 1.437649345516817e-05, + "loss": 1.9552, + "mean_token_accuracy": 0.5510506629943848, + "num_tokens": 3746196090.0, + "step": 7328 + }, + { + "epoch": 1.9818820984315848, + "grad_norm": 1.4321991205215454, + "learning_rate": 1.4375032064618996e-05, + "loss": 1.8783, + "mean_token_accuracy": 0.5588277578353882, + "num_tokens": 3746720362.0, + "step": 7329 + }, + { + "epoch": 1.9821525148729042, + "grad_norm": 1.236282467842102, + "learning_rate": 1.4373570570513916e-05, + "loss": 1.9234, + "mean_token_accuracy": 0.544047474861145, + "num_tokens": 3747244322.0, + "step": 7330 + }, + { + "epoch": 1.9824229313142239, + "grad_norm": 1.195185899734497, + "learning_rate": 1.4372108972897767e-05, + "loss": 2.007, + "mean_token_accuracy": 0.5411796569824219, + "num_tokens": 3747768525.0, + "step": 7331 + }, + { + "epoch": 1.9826933477555435, + "grad_norm": 1.2930275201797485, + "learning_rate": 1.4370647271815396e-05, + "loss": 1.9759, + "mean_token_accuracy": 0.5627241134643555, + "num_tokens": 3748284271.0, + "step": 7332 + }, + { + "epoch": 1.9829637641968632, + "grad_norm": 1.135430097579956, + "learning_rate": 1.4369185467311658e-05, + "loss": 1.7952, + "mean_token_accuracy": 0.5817643404006958, + "num_tokens": 3748808467.0, + "step": 7333 + }, + { + "epoch": 1.9832341806381828, + "grad_norm": 1.2683378458023071, + "learning_rate": 1.4367723559431398e-05, + "loss": 1.9669, + "mean_token_accuracy": 0.5529536008834839, + "num_tokens": 3749332667.0, + "step": 7334 + }, + { + "epoch": 1.9835045970795024, + "grad_norm": 1.1605255603790283, + "learning_rate": 1.4366261548219478e-05, + "loss": 1.9128, + "mean_token_accuracy": 0.5676555037498474, + "num_tokens": 3749830661.0, + "step": 7335 + }, + { + "epoch": 1.983775013520822, + "grad_norm": 1.1853288412094116, + "learning_rate": 1.4364799433720755e-05, + "loss": 1.856, + "mean_token_accuracy": 0.5558661222457886, + "num_tokens": 3750354799.0, + "step": 7336 + }, + { + "epoch": 1.9840454299621417, + "grad_norm": 1.5032868385314941, + "learning_rate": 1.4363337215980091e-05, + "loss": 2.0069, + "mean_token_accuracy": 0.5605933666229248, + "num_tokens": 3750878968.0, + "step": 7337 + }, + { + "epoch": 1.9843158464034614, + "grad_norm": 1.3001132011413574, + "learning_rate": 1.4361874895042347e-05, + "loss": 1.9695, + "mean_token_accuracy": 0.5323563814163208, + "num_tokens": 3751403167.0, + "step": 7338 + }, + { + "epoch": 1.984586262844781, + "grad_norm": 1.1709321737289429, + "learning_rate": 1.4360412470952396e-05, + "loss": 2.0033, + "mean_token_accuracy": 0.532992422580719, + "num_tokens": 3751927337.0, + "step": 7339 + }, + { + "epoch": 1.9848566792861004, + "grad_norm": 1.2935959100723267, + "learning_rate": 1.4358949943755107e-05, + "loss": 1.8686, + "mean_token_accuracy": 0.580140233039856, + "num_tokens": 3752394733.0, + "step": 7340 + }, + { + "epoch": 1.98512709572742, + "grad_norm": 0.5754085183143616, + "learning_rate": 1.4357487313495354e-05, + "loss": 1.1464, + "mean_token_accuracy": 0.6912896633148193, + "num_tokens": 3752894682.0, + "step": 7341 + }, + { + "epoch": 1.9853975121687397, + "grad_norm": 1.6833107471466064, + "learning_rate": 1.4356024580218021e-05, + "loss": 1.9055, + "mean_token_accuracy": 0.5641638040542603, + "num_tokens": 3753418906.0, + "step": 7342 + }, + { + "epoch": 1.9856679286100594, + "grad_norm": 1.4213248491287231, + "learning_rate": 1.4354561743967985e-05, + "loss": 1.9358, + "mean_token_accuracy": 0.5501192808151245, + "num_tokens": 3753943157.0, + "step": 7343 + }, + { + "epoch": 1.985938345051379, + "grad_norm": 1.197314739227295, + "learning_rate": 1.4353098804790126e-05, + "loss": 1.8494, + "mean_token_accuracy": 0.568561315536499, + "num_tokens": 3754467299.0, + "step": 7344 + }, + { + "epoch": 1.9862087614926986, + "grad_norm": 1.318208932876587, + "learning_rate": 1.4351635762729338e-05, + "loss": 1.9585, + "mean_token_accuracy": 0.5645739436149597, + "num_tokens": 3754991449.0, + "step": 7345 + }, + { + "epoch": 1.9864791779340183, + "grad_norm": 1.0875364542007446, + "learning_rate": 1.4350172617830504e-05, + "loss": 1.9758, + "mean_token_accuracy": 0.5458572506904602, + "num_tokens": 3755515712.0, + "step": 7346 + }, + { + "epoch": 1.986749594375338, + "grad_norm": 1.357859492301941, + "learning_rate": 1.4348709370138522e-05, + "loss": 1.9037, + "mean_token_accuracy": 0.5454522967338562, + "num_tokens": 3756039799.0, + "step": 7347 + }, + { + "epoch": 1.9870200108166576, + "grad_norm": 1.220677137374878, + "learning_rate": 1.4347246019698293e-05, + "loss": 1.8434, + "mean_token_accuracy": 0.5737240314483643, + "num_tokens": 3756488707.0, + "step": 7348 + }, + { + "epoch": 1.9872904272579772, + "grad_norm": 1.161739706993103, + "learning_rate": 1.434578256655471e-05, + "loss": 1.988, + "mean_token_accuracy": 0.5597768425941467, + "num_tokens": 3757012965.0, + "step": 7349 + }, + { + "epoch": 1.9875608436992969, + "grad_norm": 1.2807867527008057, + "learning_rate": 1.4344319010752681e-05, + "loss": 1.855, + "mean_token_accuracy": 0.563194215297699, + "num_tokens": 3757537083.0, + "step": 7350 + }, + { + "epoch": 1.9878312601406165, + "grad_norm": 1.3303472995758057, + "learning_rate": 1.4342855352337108e-05, + "loss": 1.939, + "mean_token_accuracy": 0.5522701740264893, + "num_tokens": 3758061333.0, + "step": 7351 + }, + { + "epoch": 1.9881016765819362, + "grad_norm": 1.3422317504882812, + "learning_rate": 1.43413915913529e-05, + "loss": 1.9279, + "mean_token_accuracy": 0.5694730281829834, + "num_tokens": 3758575175.0, + "step": 7352 + }, + { + "epoch": 1.9883720930232558, + "grad_norm": 1.1383978128433228, + "learning_rate": 1.4339927727844976e-05, + "loss": 2.0642, + "mean_token_accuracy": 0.5130006074905396, + "num_tokens": 3759099372.0, + "step": 7353 + }, + { + "epoch": 1.9886425094645754, + "grad_norm": 1.3697482347488403, + "learning_rate": 1.4338463761858244e-05, + "loss": 2.1277, + "mean_token_accuracy": 0.5260300636291504, + "num_tokens": 3759623545.0, + "step": 7354 + }, + { + "epoch": 1.988912925905895, + "grad_norm": 1.3208229541778564, + "learning_rate": 1.4336999693437626e-05, + "loss": 1.9019, + "mean_token_accuracy": 0.5406022071838379, + "num_tokens": 3760147749.0, + "step": 7355 + }, + { + "epoch": 1.9891833423472147, + "grad_norm": 1.0335856676101685, + "learning_rate": 1.4335535522628044e-05, + "loss": 1.7863, + "mean_token_accuracy": 0.5734848976135254, + "num_tokens": 3760671982.0, + "step": 7356 + }, + { + "epoch": 1.9894537587885344, + "grad_norm": 1.1443941593170166, + "learning_rate": 1.4334071249474421e-05, + "loss": 1.8123, + "mean_token_accuracy": 0.5743592977523804, + "num_tokens": 3761196240.0, + "step": 7357 + }, + { + "epoch": 1.989724175229854, + "grad_norm": 1.622292399406433, + "learning_rate": 1.433260687402169e-05, + "loss": 1.9019, + "mean_token_accuracy": 0.5571303367614746, + "num_tokens": 3761720482.0, + "step": 7358 + }, + { + "epoch": 1.9899945916711737, + "grad_norm": 1.3166583776474, + "learning_rate": 1.433114239631478e-05, + "loss": 1.8882, + "mean_token_accuracy": 0.5709288120269775, + "num_tokens": 3762216978.0, + "step": 7359 + }, + { + "epoch": 1.9902650081124933, + "grad_norm": 1.2812036275863647, + "learning_rate": 1.4329677816398627e-05, + "loss": 1.9035, + "mean_token_accuracy": 0.5724633932113647, + "num_tokens": 3762741042.0, + "step": 7360 + }, + { + "epoch": 1.990535424553813, + "grad_norm": 0.5672760009765625, + "learning_rate": 1.4328213134318163e-05, + "loss": 1.1216, + "mean_token_accuracy": 0.7042142152786255, + "num_tokens": 3763265233.0, + "step": 7361 + }, + { + "epoch": 1.9908058409951326, + "grad_norm": 1.3128310441970825, + "learning_rate": 1.4326748350118332e-05, + "loss": 1.8436, + "mean_token_accuracy": 0.5756180286407471, + "num_tokens": 3763789474.0, + "step": 7362 + }, + { + "epoch": 1.9910762574364522, + "grad_norm": 1.2689201831817627, + "learning_rate": 1.432528346384408e-05, + "loss": 1.8677, + "mean_token_accuracy": 0.5678232908248901, + "num_tokens": 3764313648.0, + "step": 7363 + }, + { + "epoch": 1.9913466738777719, + "grad_norm": 1.3713903427124023, + "learning_rate": 1.4323818475540352e-05, + "loss": 1.9209, + "mean_token_accuracy": 0.5732094049453735, + "num_tokens": 3764745305.0, + "step": 7364 + }, + { + "epoch": 1.9916170903190915, + "grad_norm": 1.1401399374008179, + "learning_rate": 1.4322353385252097e-05, + "loss": 1.835, + "mean_token_accuracy": 0.5792168974876404, + "num_tokens": 3765269489.0, + "step": 7365 + }, + { + "epoch": 1.9918875067604112, + "grad_norm": 1.3169176578521729, + "learning_rate": 1.4320888193024275e-05, + "loss": 1.8945, + "mean_token_accuracy": 0.573333740234375, + "num_tokens": 3765793583.0, + "step": 7366 + }, + { + "epoch": 1.9921579232017308, + "grad_norm": 1.355367660522461, + "learning_rate": 1.4319422898901832e-05, + "loss": 1.9119, + "mean_token_accuracy": 0.562018632888794, + "num_tokens": 3766317830.0, + "step": 7367 + }, + { + "epoch": 1.9924283396430504, + "grad_norm": 1.2920923233032227, + "learning_rate": 1.4317957502929734e-05, + "loss": 1.7949, + "mean_token_accuracy": 0.5767903327941895, + "num_tokens": 3766842068.0, + "step": 7368 + }, + { + "epoch": 1.99269875608437, + "grad_norm": 1.2983052730560303, + "learning_rate": 1.4316492005152943e-05, + "loss": 1.9653, + "mean_token_accuracy": 0.5756133198738098, + "num_tokens": 3767302552.0, + "step": 7369 + }, + { + "epoch": 1.9929691725256897, + "grad_norm": 1.1602460145950317, + "learning_rate": 1.4315026405616422e-05, + "loss": 1.8445, + "mean_token_accuracy": 0.5740681290626526, + "num_tokens": 3767826673.0, + "step": 7370 + }, + { + "epoch": 1.9932395889670091, + "grad_norm": 1.2701512575149536, + "learning_rate": 1.4313560704365142e-05, + "loss": 2.0889, + "mean_token_accuracy": 0.5468426942825317, + "num_tokens": 3768307125.0, + "step": 7371 + }, + { + "epoch": 1.9935100054083288, + "grad_norm": 1.5398752689361572, + "learning_rate": 1.4312094901444075e-05, + "loss": 1.8496, + "mean_token_accuracy": 0.5980381965637207, + "num_tokens": 3768831281.0, + "step": 7372 + }, + { + "epoch": 1.9937804218496484, + "grad_norm": 1.1961621046066284, + "learning_rate": 1.43106289968982e-05, + "loss": 1.9691, + "mean_token_accuracy": 0.548080563545227, + "num_tokens": 3769355354.0, + "step": 7373 + }, + { + "epoch": 1.994050838290968, + "grad_norm": 1.1230310201644897, + "learning_rate": 1.4309162990772486e-05, + "loss": 1.8565, + "mean_token_accuracy": 0.5747839212417603, + "num_tokens": 3769879633.0, + "step": 7374 + }, + { + "epoch": 1.9943212547322877, + "grad_norm": 1.2217957973480225, + "learning_rate": 1.4307696883111924e-05, + "loss": 2.0268, + "mean_token_accuracy": 0.5635707378387451, + "num_tokens": 3770403880.0, + "step": 7375 + }, + { + "epoch": 1.9945916711736074, + "grad_norm": 1.3138700723648071, + "learning_rate": 1.4306230673961493e-05, + "loss": 2.0416, + "mean_token_accuracy": 0.5417121648788452, + "num_tokens": 3770928045.0, + "step": 7376 + }, + { + "epoch": 1.994862087614927, + "grad_norm": 1.0719300508499146, + "learning_rate": 1.4304764363366181e-05, + "loss": 1.888, + "mean_token_accuracy": 0.5758230686187744, + "num_tokens": 3771429239.0, + "step": 7377 + }, + { + "epoch": 1.9951325040562466, + "grad_norm": 1.0680222511291504, + "learning_rate": 1.430329795137098e-05, + "loss": 1.9146, + "mean_token_accuracy": 0.5574357509613037, + "num_tokens": 3771953346.0, + "step": 7378 + }, + { + "epoch": 1.9954029204975663, + "grad_norm": 1.1342346668243408, + "learning_rate": 1.4301831438020889e-05, + "loss": 1.9687, + "mean_token_accuracy": 0.5478946566581726, + "num_tokens": 3772477518.0, + "step": 7379 + }, + { + "epoch": 1.995673336938886, + "grad_norm": 1.0507490634918213, + "learning_rate": 1.4300364823360893e-05, + "loss": 1.8672, + "mean_token_accuracy": 0.5660808086395264, + "num_tokens": 3773001790.0, + "step": 7380 + }, + { + "epoch": 1.9959437533802054, + "grad_norm": 0.62251216173172, + "learning_rate": 1.4298898107436001e-05, + "loss": 1.0666, + "mean_token_accuracy": 0.7171485424041748, + "num_tokens": 3773471386.0, + "step": 7381 + }, + { + "epoch": 1.996214169821525, + "grad_norm": 1.531812310218811, + "learning_rate": 1.4297431290291211e-05, + "loss": 1.8914, + "mean_token_accuracy": 0.5590308904647827, + "num_tokens": 3773920200.0, + "step": 7382 + }, + { + "epoch": 1.9964845862628446, + "grad_norm": 1.2327500581741333, + "learning_rate": 1.4295964371971534e-05, + "loss": 1.9472, + "mean_token_accuracy": 0.5539960265159607, + "num_tokens": 3774444442.0, + "step": 7383 + }, + { + "epoch": 1.9967550027041643, + "grad_norm": 1.5353025197982788, + "learning_rate": 1.4294497352521977e-05, + "loss": 1.9448, + "mean_token_accuracy": 0.5589476823806763, + "num_tokens": 3774964189.0, + "step": 7384 + }, + { + "epoch": 1.997025419145484, + "grad_norm": 1.337377905845642, + "learning_rate": 1.4293030231987551e-05, + "loss": 1.5699, + "mean_token_accuracy": 0.6452810764312744, + "num_tokens": 3775424309.0, + "step": 7385 + }, + { + "epoch": 1.9972958355868036, + "grad_norm": 1.009498953819275, + "learning_rate": 1.4291563010413277e-05, + "loss": 1.7879, + "mean_token_accuracy": 0.5992577075958252, + "num_tokens": 3775948494.0, + "step": 7386 + }, + { + "epoch": 1.9975662520281232, + "grad_norm": 1.4958772659301758, + "learning_rate": 1.4290095687844167e-05, + "loss": 2.0578, + "mean_token_accuracy": 0.5233769416809082, + "num_tokens": 3776472678.0, + "step": 7387 + }, + { + "epoch": 1.9978366684694429, + "grad_norm": 1.3835413455963135, + "learning_rate": 1.4288628264325246e-05, + "loss": 1.6859, + "mean_token_accuracy": 0.6148779392242432, + "num_tokens": 3776962525.0, + "step": 7388 + }, + { + "epoch": 1.9981070849107625, + "grad_norm": 1.4525758028030396, + "learning_rate": 1.4287160739901541e-05, + "loss": 1.9171, + "mean_token_accuracy": 0.5688335299491882, + "num_tokens": 3777473048.0, + "step": 7389 + }, + { + "epoch": 1.9983775013520821, + "grad_norm": 1.5038665533065796, + "learning_rate": 1.4285693114618076e-05, + "loss": 1.9542, + "mean_token_accuracy": 0.5483046174049377, + "num_tokens": 3777997265.0, + "step": 7390 + }, + { + "epoch": 1.9986479177934018, + "grad_norm": 1.6517951488494873, + "learning_rate": 1.428422538851988e-05, + "loss": 1.7996, + "mean_token_accuracy": 0.6006652116775513, + "num_tokens": 3778406438.0, + "step": 7391 + }, + { + "epoch": 1.9989183342347214, + "grad_norm": 1.2893017530441284, + "learning_rate": 1.4282757561651996e-05, + "loss": 1.9065, + "mean_token_accuracy": 0.5575827360153198, + "num_tokens": 3778930661.0, + "step": 7392 + }, + { + "epoch": 1.999188750676041, + "grad_norm": 1.5326485633850098, + "learning_rate": 1.4281289634059457e-05, + "loss": 1.924, + "mean_token_accuracy": 0.587672770023346, + "num_tokens": 3779389501.0, + "step": 7393 + }, + { + "epoch": 1.9994591671173607, + "grad_norm": 1.9157248735427856, + "learning_rate": 1.42798216057873e-05, + "loss": 1.9639, + "mean_token_accuracy": 0.5627826452255249, + "num_tokens": 3779862663.0, + "step": 7394 + }, + { + "epoch": 1.9997295835586804, + "grad_norm": 1.3200418949127197, + "learning_rate": 1.427835347688057e-05, + "loss": 1.8972, + "mean_token_accuracy": 0.5685454607009888, + "num_tokens": 3780386814.0, + "step": 7395 + }, + { + "epoch": 2.0, + "grad_norm": 1.3761022090911865, + "learning_rate": 1.4276885247384318e-05, + "loss": 1.9326, + "mean_token_accuracy": 0.5748199224472046, + "num_tokens": 3780648958.0, + "step": 7396 + }, + { + "epoch": 2.0002704164413196, + "grad_norm": 1.7350249290466309, + "learning_rate": 1.4275416917343583e-05, + "loss": 1.9793, + "mean_token_accuracy": 0.5598627924919128, + "num_tokens": 3781130527.0, + "step": 7397 + }, + { + "epoch": 2.0005408328826393, + "grad_norm": 1.5189443826675415, + "learning_rate": 1.4273948486803428e-05, + "loss": 1.8984, + "mean_token_accuracy": 0.5575931072235107, + "num_tokens": 3781654794.0, + "step": 7398 + }, + { + "epoch": 2.000811249323959, + "grad_norm": 1.3146530389785767, + "learning_rate": 1.4272479955808907e-05, + "loss": 1.9867, + "mean_token_accuracy": 0.5561646223068237, + "num_tokens": 3782170966.0, + "step": 7399 + }, + { + "epoch": 2.0010816657652786, + "grad_norm": 1.454944133758545, + "learning_rate": 1.4271011324405077e-05, + "loss": 1.9817, + "mean_token_accuracy": 0.5812638998031616, + "num_tokens": 3782650837.0, + "step": 7400 + }, + { + "epoch": 2.001352082206598, + "grad_norm": 0.5772461295127869, + "learning_rate": 1.4269542592636997e-05, + "loss": 1.1438, + "mean_token_accuracy": 0.6923242807388306, + "num_tokens": 3783175120.0, + "step": 7401 + }, + { + "epoch": 2.001622498647918, + "grad_norm": 1.992647409439087, + "learning_rate": 1.4268073760549739e-05, + "loss": 1.9631, + "mean_token_accuracy": 0.5578937530517578, + "num_tokens": 3783699396.0, + "step": 7402 + }, + { + "epoch": 2.0018929150892375, + "grad_norm": 1.7946345806121826, + "learning_rate": 1.4266604828188366e-05, + "loss": 1.9266, + "mean_token_accuracy": 0.5572751760482788, + "num_tokens": 3784223561.0, + "step": 7403 + }, + { + "epoch": 2.002163331530557, + "grad_norm": 1.288295030593872, + "learning_rate": 1.4265135795597951e-05, + "loss": 1.8847, + "mean_token_accuracy": 0.5760946273803711, + "num_tokens": 3784688430.0, + "step": 7404 + }, + { + "epoch": 2.002433747971877, + "grad_norm": 1.9393991231918335, + "learning_rate": 1.4263666662823563e-05, + "loss": 1.8783, + "mean_token_accuracy": 0.5895543098449707, + "num_tokens": 3785152738.0, + "step": 7405 + }, + { + "epoch": 2.0027041644131964, + "grad_norm": 1.4395105838775635, + "learning_rate": 1.4262197429910289e-05, + "loss": 1.8132, + "mean_token_accuracy": 0.5881507396697998, + "num_tokens": 3785629831.0, + "step": 7406 + }, + { + "epoch": 2.002974580854516, + "grad_norm": 1.2839001417160034, + "learning_rate": 1.4260728096903204e-05, + "loss": 1.8781, + "mean_token_accuracy": 0.5714292526245117, + "num_tokens": 3786154045.0, + "step": 7407 + }, + { + "epoch": 2.0032449972958357, + "grad_norm": 1.56488835811615, + "learning_rate": 1.4259258663847391e-05, + "loss": 1.9336, + "mean_token_accuracy": 0.541405200958252, + "num_tokens": 3786678260.0, + "step": 7408 + }, + { + "epoch": 2.0035154137371554, + "grad_norm": 1.3025020360946655, + "learning_rate": 1.4257789130787937e-05, + "loss": 2.0547, + "mean_token_accuracy": 0.5299773216247559, + "num_tokens": 3787202415.0, + "step": 7409 + }, + { + "epoch": 2.003785830178475, + "grad_norm": 1.4157979488372803, + "learning_rate": 1.4256319497769933e-05, + "loss": 1.7392, + "mean_token_accuracy": 0.585136353969574, + "num_tokens": 3787726663.0, + "step": 7410 + }, + { + "epoch": 2.0040562466197946, + "grad_norm": 1.5354104042053223, + "learning_rate": 1.425484976483847e-05, + "loss": 1.897, + "mean_token_accuracy": 0.5670636296272278, + "num_tokens": 3788250826.0, + "step": 7411 + }, + { + "epoch": 2.0043266630611143, + "grad_norm": 1.284698724746704, + "learning_rate": 1.4253379932038642e-05, + "loss": 1.9232, + "mean_token_accuracy": 0.5529575943946838, + "num_tokens": 3788775044.0, + "step": 7412 + }, + { + "epoch": 2.004597079502434, + "grad_norm": 1.3578742742538452, + "learning_rate": 1.4251909999415555e-05, + "loss": 1.9632, + "mean_token_accuracy": 0.5404897928237915, + "num_tokens": 3789274558.0, + "step": 7413 + }, + { + "epoch": 2.0048674959437536, + "grad_norm": 1.1726102828979492, + "learning_rate": 1.42504399670143e-05, + "loss": 1.8928, + "mean_token_accuracy": 0.5595687627792358, + "num_tokens": 3789798824.0, + "step": 7414 + }, + { + "epoch": 2.005137912385073, + "grad_norm": 1.2825404405593872, + "learning_rate": 1.4248969834879992e-05, + "loss": 1.9534, + "mean_token_accuracy": 0.5729799270629883, + "num_tokens": 3790323018.0, + "step": 7415 + }, + { + "epoch": 2.005408328826393, + "grad_norm": 1.3512415885925293, + "learning_rate": 1.4247499603057735e-05, + "loss": 1.9592, + "mean_token_accuracy": 0.5699207782745361, + "num_tokens": 3790809576.0, + "step": 7416 + }, + { + "epoch": 2.005678745267712, + "grad_norm": 1.2807985544204712, + "learning_rate": 1.4246029271592636e-05, + "loss": 1.9503, + "mean_token_accuracy": 0.5465084314346313, + "num_tokens": 3791333853.0, + "step": 7417 + }, + { + "epoch": 2.0059491617090317, + "grad_norm": 1.3540407419204712, + "learning_rate": 1.4244558840529819e-05, + "loss": 1.9503, + "mean_token_accuracy": 0.5689564943313599, + "num_tokens": 3791858139.0, + "step": 7418 + }, + { + "epoch": 2.0062195781503513, + "grad_norm": 1.5723605155944824, + "learning_rate": 1.4243088309914388e-05, + "loss": 1.9149, + "mean_token_accuracy": 0.5398094058036804, + "num_tokens": 3792382364.0, + "step": 7419 + }, + { + "epoch": 2.006489994591671, + "grad_norm": 1.4445315599441528, + "learning_rate": 1.4241617679791475e-05, + "loss": 1.9798, + "mean_token_accuracy": 0.5526494979858398, + "num_tokens": 3792889648.0, + "step": 7420 + }, + { + "epoch": 2.0067604110329906, + "grad_norm": 0.5298720598220825, + "learning_rate": 1.4240146950206199e-05, + "loss": 1.1142, + "mean_token_accuracy": 0.6942621469497681, + "num_tokens": 3793413785.0, + "step": 7421 + }, + { + "epoch": 2.0070308274743103, + "grad_norm": 1.7303355932235718, + "learning_rate": 1.4238676121203685e-05, + "loss": 1.9166, + "mean_token_accuracy": 0.5777587294578552, + "num_tokens": 3793930390.0, + "step": 7422 + }, + { + "epoch": 2.00730124391563, + "grad_norm": 1.994193434715271, + "learning_rate": 1.4237205192829065e-05, + "loss": 1.5299, + "mean_token_accuracy": 0.6055625677108765, + "num_tokens": 3794358753.0, + "step": 7423 + }, + { + "epoch": 2.0075716603569496, + "grad_norm": 1.5997525453567505, + "learning_rate": 1.4235734165127467e-05, + "loss": 1.9715, + "mean_token_accuracy": 0.5608588457107544, + "num_tokens": 3794829488.0, + "step": 7424 + }, + { + "epoch": 2.007842076798269, + "grad_norm": 1.6778136491775513, + "learning_rate": 1.4234263038144029e-05, + "loss": 1.8432, + "mean_token_accuracy": 0.5876070261001587, + "num_tokens": 3795353594.0, + "step": 7425 + }, + { + "epoch": 2.008112493239589, + "grad_norm": 1.5175130367279053, + "learning_rate": 1.423279181192389e-05, + "loss": 1.983, + "mean_token_accuracy": 0.5582467317581177, + "num_tokens": 3795877829.0, + "step": 7426 + }, + { + "epoch": 2.0083829096809085, + "grad_norm": 1.6597354412078857, + "learning_rate": 1.4231320486512193e-05, + "loss": 1.9448, + "mean_token_accuracy": 0.5680317878723145, + "num_tokens": 3796401985.0, + "step": 7427 + }, + { + "epoch": 2.008653326122228, + "grad_norm": 1.8865493535995483, + "learning_rate": 1.4229849061954078e-05, + "loss": 1.9176, + "mean_token_accuracy": 0.5731262564659119, + "num_tokens": 3796865501.0, + "step": 7428 + }, + { + "epoch": 2.0089237425635478, + "grad_norm": 2.0866594314575195, + "learning_rate": 1.4228377538294698e-05, + "loss": 1.976, + "mean_token_accuracy": 0.5808843374252319, + "num_tokens": 3797389735.0, + "step": 7429 + }, + { + "epoch": 2.0091941590048674, + "grad_norm": 1.3727117776870728, + "learning_rate": 1.4226905915579202e-05, + "loss": 1.8652, + "mean_token_accuracy": 0.5770659446716309, + "num_tokens": 3797913870.0, + "step": 7430 + }, + { + "epoch": 2.009464575446187, + "grad_norm": 1.4958140850067139, + "learning_rate": 1.4225434193852739e-05, + "loss": 1.9018, + "mean_token_accuracy": 0.5780888795852661, + "num_tokens": 3798406137.0, + "step": 7431 + }, + { + "epoch": 2.0097349918875067, + "grad_norm": 1.7021090984344482, + "learning_rate": 1.422396237316047e-05, + "loss": 1.7681, + "mean_token_accuracy": 0.5938836336135864, + "num_tokens": 3798930385.0, + "step": 7432 + }, + { + "epoch": 2.0100054083288263, + "grad_norm": 1.2251008749008179, + "learning_rate": 1.4222490453547557e-05, + "loss": 1.7532, + "mean_token_accuracy": 0.5956125259399414, + "num_tokens": 3799414708.0, + "step": 7433 + }, + { + "epoch": 2.010275824770146, + "grad_norm": 1.065908432006836, + "learning_rate": 1.4221018435059157e-05, + "loss": 1.721, + "mean_token_accuracy": 0.6098916530609131, + "num_tokens": 3799938839.0, + "step": 7434 + }, + { + "epoch": 2.0105462412114656, + "grad_norm": 1.151475191116333, + "learning_rate": 1.4219546317740442e-05, + "loss": 1.7815, + "mean_token_accuracy": 0.5776839256286621, + "num_tokens": 3800462923.0, + "step": 7435 + }, + { + "epoch": 2.0108166576527853, + "grad_norm": 1.2345260381698608, + "learning_rate": 1.4218074101636577e-05, + "loss": 1.8956, + "mean_token_accuracy": 0.5696513652801514, + "num_tokens": 3800987047.0, + "step": 7436 + }, + { + "epoch": 2.011087074094105, + "grad_norm": 1.1201229095458984, + "learning_rate": 1.4216601786792735e-05, + "loss": 1.934, + "mean_token_accuracy": 0.5606049299240112, + "num_tokens": 3801511260.0, + "step": 7437 + }, + { + "epoch": 2.0113574905354246, + "grad_norm": 1.2655805349349976, + "learning_rate": 1.421512937325409e-05, + "loss": 1.8852, + "mean_token_accuracy": 0.5580093860626221, + "num_tokens": 3802035375.0, + "step": 7438 + }, + { + "epoch": 2.011627906976744, + "grad_norm": 1.4192951917648315, + "learning_rate": 1.4213656861065824e-05, + "loss": 2.0094, + "mean_token_accuracy": 0.542449951171875, + "num_tokens": 3802538846.0, + "step": 7439 + }, + { + "epoch": 2.011898323418064, + "grad_norm": 1.4582983255386353, + "learning_rate": 1.4212184250273107e-05, + "loss": 1.8584, + "mean_token_accuracy": 0.5742724537849426, + "num_tokens": 3803024615.0, + "step": 7440 + }, + { + "epoch": 2.0121687398593835, + "grad_norm": 0.8562176823616028, + "learning_rate": 1.4210711540921137e-05, + "loss": 1.0589, + "mean_token_accuracy": 0.7106289863586426, + "num_tokens": 3803548803.0, + "step": 7441 + }, + { + "epoch": 2.012439156300703, + "grad_norm": 1.5773581266403198, + "learning_rate": 1.4209238733055093e-05, + "loss": 1.9162, + "mean_token_accuracy": 0.5727149844169617, + "num_tokens": 3804072936.0, + "step": 7442 + }, + { + "epoch": 2.012709572742023, + "grad_norm": 1.295235276222229, + "learning_rate": 1.4207765826720168e-05, + "loss": 1.787, + "mean_token_accuracy": 0.594444990158081, + "num_tokens": 3804597190.0, + "step": 7443 + }, + { + "epoch": 2.0129799891833424, + "grad_norm": 1.1172486543655396, + "learning_rate": 1.4206292821961554e-05, + "loss": 1.8749, + "mean_token_accuracy": 0.5660141706466675, + "num_tokens": 3805075668.0, + "step": 7444 + }, + { + "epoch": 2.013250405624662, + "grad_norm": 1.4330275058746338, + "learning_rate": 1.4204819718824448e-05, + "loss": 1.9588, + "mean_token_accuracy": 0.562293529510498, + "num_tokens": 3805599874.0, + "step": 7445 + }, + { + "epoch": 2.0135208220659817, + "grad_norm": 1.3499220609664917, + "learning_rate": 1.4203346517354047e-05, + "loss": 1.9212, + "mean_token_accuracy": 0.5331546068191528, + "num_tokens": 3806076643.0, + "step": 7446 + }, + { + "epoch": 2.0137912385073014, + "grad_norm": 1.416854977607727, + "learning_rate": 1.4201873217595552e-05, + "loss": 1.9788, + "mean_token_accuracy": 0.5473216772079468, + "num_tokens": 3806587930.0, + "step": 7447 + }, + { + "epoch": 2.014061654948621, + "grad_norm": 1.544638991355896, + "learning_rate": 1.4200399819594175e-05, + "loss": 2.0275, + "mean_token_accuracy": 0.5455243587493896, + "num_tokens": 3807112139.0, + "step": 7448 + }, + { + "epoch": 2.0143320713899406, + "grad_norm": 1.5278477668762207, + "learning_rate": 1.4198926323395118e-05, + "loss": 1.915, + "mean_token_accuracy": 0.5839744806289673, + "num_tokens": 3807636368.0, + "step": 7449 + }, + { + "epoch": 2.0146024878312603, + "grad_norm": 1.2505334615707397, + "learning_rate": 1.4197452729043593e-05, + "loss": 1.8617, + "mean_token_accuracy": 0.5754786729812622, + "num_tokens": 3808150757.0, + "step": 7450 + }, + { + "epoch": 2.01487290427258, + "grad_norm": 1.4272552728652954, + "learning_rate": 1.4195979036584813e-05, + "loss": 1.8491, + "mean_token_accuracy": 0.571980357170105, + "num_tokens": 3808674679.0, + "step": 7451 + }, + { + "epoch": 2.0151433207138996, + "grad_norm": 1.3634954690933228, + "learning_rate": 1.4194505246064001e-05, + "loss": 1.8122, + "mean_token_accuracy": 0.5737918019294739, + "num_tokens": 3809198907.0, + "step": 7452 + }, + { + "epoch": 2.015413737155219, + "grad_norm": 1.4889202117919922, + "learning_rate": 1.4193031357526372e-05, + "loss": 1.9575, + "mean_token_accuracy": 0.5771932601928711, + "num_tokens": 3809579739.0, + "step": 7453 + }, + { + "epoch": 2.015684153596539, + "grad_norm": 1.5839526653289795, + "learning_rate": 1.4191557371017154e-05, + "loss": 1.8691, + "mean_token_accuracy": 0.5748423933982849, + "num_tokens": 3810044563.0, + "step": 7454 + }, + { + "epoch": 2.0159545700378585, + "grad_norm": 1.2077081203460693, + "learning_rate": 1.4190083286581566e-05, + "loss": 1.8524, + "mean_token_accuracy": 0.5729376077651978, + "num_tokens": 3810568804.0, + "step": 7455 + }, + { + "epoch": 2.016224986479178, + "grad_norm": 1.5560548305511475, + "learning_rate": 1.4188609104264843e-05, + "loss": 1.8616, + "mean_token_accuracy": 0.5674866437911987, + "num_tokens": 3811093024.0, + "step": 7456 + }, + { + "epoch": 2.016495402920498, + "grad_norm": 1.2903735637664795, + "learning_rate": 1.4187134824112213e-05, + "loss": 1.8857, + "mean_token_accuracy": 0.572885274887085, + "num_tokens": 3811617254.0, + "step": 7457 + }, + { + "epoch": 2.016765819361817, + "grad_norm": 1.1091463565826416, + "learning_rate": 1.418566044616892e-05, + "loss": 1.9191, + "mean_token_accuracy": 0.5683391094207764, + "num_tokens": 3812141395.0, + "step": 7458 + }, + { + "epoch": 2.0170362358031366, + "grad_norm": 1.2741140127182007, + "learning_rate": 1.4184185970480193e-05, + "loss": 1.7834, + "mean_token_accuracy": 0.5893917083740234, + "num_tokens": 3812665600.0, + "step": 7459 + }, + { + "epoch": 2.0173066522444563, + "grad_norm": 1.2320642471313477, + "learning_rate": 1.4182711397091276e-05, + "loss": 1.9569, + "mean_token_accuracy": 0.5430808663368225, + "num_tokens": 3813189646.0, + "step": 7460 + }, + { + "epoch": 2.017577068685776, + "grad_norm": 0.46618354320526123, + "learning_rate": 1.4181236726047414e-05, + "loss": 1.0966, + "mean_token_accuracy": 0.7098146080970764, + "num_tokens": 3813713798.0, + "step": 7461 + }, + { + "epoch": 2.0178474851270956, + "grad_norm": 2.4775896072387695, + "learning_rate": 1.4179761957393852e-05, + "loss": 1.9391, + "mean_token_accuracy": 0.5695170164108276, + "num_tokens": 3814238078.0, + "step": 7462 + }, + { + "epoch": 2.018117901568415, + "grad_norm": 2.3160488605499268, + "learning_rate": 1.4178287091175842e-05, + "loss": 1.9517, + "mean_token_accuracy": 0.5638507604598999, + "num_tokens": 3814762275.0, + "step": 7463 + }, + { + "epoch": 2.018388318009735, + "grad_norm": 1.2439439296722412, + "learning_rate": 1.417681212743864e-05, + "loss": 1.9608, + "mean_token_accuracy": 0.5507079362869263, + "num_tokens": 3815286508.0, + "step": 7464 + }, + { + "epoch": 2.0186587344510545, + "grad_norm": 1.4041202068328857, + "learning_rate": 1.41753370662275e-05, + "loss": 1.8815, + "mean_token_accuracy": 0.5624248385429382, + "num_tokens": 3815797705.0, + "step": 7465 + }, + { + "epoch": 2.018929150892374, + "grad_norm": 1.8152142763137817, + "learning_rate": 1.4173861907587678e-05, + "loss": 1.9022, + "mean_token_accuracy": 0.5794625878334045, + "num_tokens": 3816214135.0, + "step": 7466 + }, + { + "epoch": 2.0191995673336938, + "grad_norm": 1.5116422176361084, + "learning_rate": 1.417238665156444e-05, + "loss": 1.8841, + "mean_token_accuracy": 0.5687375664710999, + "num_tokens": 3816738221.0, + "step": 7467 + }, + { + "epoch": 2.0194699837750134, + "grad_norm": 1.395468831062317, + "learning_rate": 1.4170911298203047e-05, + "loss": 1.9573, + "mean_token_accuracy": 0.5503410696983337, + "num_tokens": 3817262337.0, + "step": 7468 + }, + { + "epoch": 2.019740400216333, + "grad_norm": 1.612835168838501, + "learning_rate": 1.4169435847548774e-05, + "loss": 1.9041, + "mean_token_accuracy": 0.569232702255249, + "num_tokens": 3817786548.0, + "step": 7469 + }, + { + "epoch": 2.0200108166576527, + "grad_norm": 1.3999959230422974, + "learning_rate": 1.4167960299646888e-05, + "loss": 1.7375, + "mean_token_accuracy": 0.5845859050750732, + "num_tokens": 3818310777.0, + "step": 7470 + }, + { + "epoch": 2.0202812330989723, + "grad_norm": 1.6378979682922363, + "learning_rate": 1.4166484654542665e-05, + "loss": 1.9731, + "mean_token_accuracy": 0.5638898015022278, + "num_tokens": 3818771655.0, + "step": 7471 + }, + { + "epoch": 2.020551649540292, + "grad_norm": 1.3923650979995728, + "learning_rate": 1.4165008912281382e-05, + "loss": 1.8796, + "mean_token_accuracy": 0.562730073928833, + "num_tokens": 3819295857.0, + "step": 7472 + }, + { + "epoch": 2.0208220659816116, + "grad_norm": 1.4051283597946167, + "learning_rate": 1.4163533072908313e-05, + "loss": 1.9724, + "mean_token_accuracy": 0.5564314126968384, + "num_tokens": 3819789928.0, + "step": 7473 + }, + { + "epoch": 2.0210924824229313, + "grad_norm": 1.3114721775054932, + "learning_rate": 1.416205713646875e-05, + "loss": 1.8621, + "mean_token_accuracy": 0.5593738555908203, + "num_tokens": 3820314096.0, + "step": 7474 + }, + { + "epoch": 2.021362898864251, + "grad_norm": 1.4546419382095337, + "learning_rate": 1.4160581103007972e-05, + "loss": 1.8709, + "mean_token_accuracy": 0.5827131867408752, + "num_tokens": 3820735758.0, + "step": 7475 + }, + { + "epoch": 2.0216333153055706, + "grad_norm": 1.702872633934021, + "learning_rate": 1.415910497257127e-05, + "loss": 2.0309, + "mean_token_accuracy": 0.5560009479522705, + "num_tokens": 3821208612.0, + "step": 7476 + }, + { + "epoch": 2.02190373174689, + "grad_norm": 1.0860257148742676, + "learning_rate": 1.4157628745203942e-05, + "loss": 1.8516, + "mean_token_accuracy": 0.5787153244018555, + "num_tokens": 3821732726.0, + "step": 7477 + }, + { + "epoch": 2.02217414818821, + "grad_norm": 1.0770965814590454, + "learning_rate": 1.4156152420951275e-05, + "loss": 1.8457, + "mean_token_accuracy": 0.5607866048812866, + "num_tokens": 3822256955.0, + "step": 7478 + }, + { + "epoch": 2.0224445646295295, + "grad_norm": 1.3512248992919922, + "learning_rate": 1.4154675999858572e-05, + "loss": 1.9003, + "mean_token_accuracy": 0.565703272819519, + "num_tokens": 3822767504.0, + "step": 7479 + }, + { + "epoch": 2.022714981070849, + "grad_norm": 1.277984380722046, + "learning_rate": 1.415319948197113e-05, + "loss": 1.7249, + "mean_token_accuracy": 0.5940461754798889, + "num_tokens": 3823291674.0, + "step": 7480 + }, + { + "epoch": 2.0229853975121688, + "grad_norm": 0.7092245817184448, + "learning_rate": 1.4151722867334257e-05, + "loss": 1.0352, + "mean_token_accuracy": 0.7290549278259277, + "num_tokens": 3823815877.0, + "step": 7481 + }, + { + "epoch": 2.0232558139534884, + "grad_norm": 2.0823585987091064, + "learning_rate": 1.4150246155993255e-05, + "loss": 1.8826, + "mean_token_accuracy": 0.57593834400177, + "num_tokens": 3824291714.0, + "step": 7482 + }, + { + "epoch": 2.023526230394808, + "grad_norm": 1.506250023841858, + "learning_rate": 1.4148769347993436e-05, + "loss": 1.9438, + "mean_token_accuracy": 0.5294475555419922, + "num_tokens": 3824815991.0, + "step": 7483 + }, + { + "epoch": 2.0237966468361277, + "grad_norm": 1.0984749794006348, + "learning_rate": 1.4147292443380117e-05, + "loss": 1.9687, + "mean_token_accuracy": 0.5645935535430908, + "num_tokens": 3825340221.0, + "step": 7484 + }, + { + "epoch": 2.0240670632774473, + "grad_norm": 1.6240087747573853, + "learning_rate": 1.4145815442198609e-05, + "loss": 1.9878, + "mean_token_accuracy": 0.560409426689148, + "num_tokens": 3825864318.0, + "step": 7485 + }, + { + "epoch": 2.024337479718767, + "grad_norm": 1.2866605520248413, + "learning_rate": 1.4144338344494232e-05, + "loss": 1.8998, + "mean_token_accuracy": 0.566193699836731, + "num_tokens": 3826388556.0, + "step": 7486 + }, + { + "epoch": 2.0246078961600866, + "grad_norm": 1.035529613494873, + "learning_rate": 1.4142861150312307e-05, + "loss": 1.8734, + "mean_token_accuracy": 0.5702908635139465, + "num_tokens": 3826895969.0, + "step": 7487 + }, + { + "epoch": 2.0248783126014063, + "grad_norm": 1.37888765335083, + "learning_rate": 1.4141383859698163e-05, + "loss": 1.9136, + "mean_token_accuracy": 0.5517994165420532, + "num_tokens": 3827420245.0, + "step": 7488 + }, + { + "epoch": 2.025148729042726, + "grad_norm": 1.3168624639511108, + "learning_rate": 1.413990647269712e-05, + "loss": 1.9656, + "mean_token_accuracy": 0.5562248229980469, + "num_tokens": 3827900099.0, + "step": 7489 + }, + { + "epoch": 2.0254191454840456, + "grad_norm": 1.1151195764541626, + "learning_rate": 1.4138428989354511e-05, + "loss": 1.9905, + "mean_token_accuracy": 0.5625032186508179, + "num_tokens": 3828424203.0, + "step": 7490 + }, + { + "epoch": 2.025689561925365, + "grad_norm": 1.4115464687347412, + "learning_rate": 1.4136951409715674e-05, + "loss": 2.0507, + "mean_token_accuracy": 0.5421538352966309, + "num_tokens": 3828948461.0, + "step": 7491 + }, + { + "epoch": 2.025959978366685, + "grad_norm": 1.2213853597640991, + "learning_rate": 1.4135473733825943e-05, + "loss": 1.933, + "mean_token_accuracy": 0.5515434741973877, + "num_tokens": 3829438459.0, + "step": 7492 + }, + { + "epoch": 2.0262303948080045, + "grad_norm": 1.3067914247512817, + "learning_rate": 1.4133995961730653e-05, + "loss": 1.9494, + "mean_token_accuracy": 0.5695956349372864, + "num_tokens": 3829962623.0, + "step": 7493 + }, + { + "epoch": 2.026500811249324, + "grad_norm": 1.4142532348632812, + "learning_rate": 1.4132518093475158e-05, + "loss": 2.0461, + "mean_token_accuracy": 0.5570990443229675, + "num_tokens": 3830426338.0, + "step": 7494 + }, + { + "epoch": 2.0267712276906438, + "grad_norm": 1.1091679334640503, + "learning_rate": 1.4131040129104789e-05, + "loss": 1.9485, + "mean_token_accuracy": 0.5638092160224915, + "num_tokens": 3830950618.0, + "step": 7495 + }, + { + "epoch": 2.0270416441319634, + "grad_norm": 1.4169121980667114, + "learning_rate": 1.4129562068664904e-05, + "loss": 1.9688, + "mean_token_accuracy": 0.5653587579727173, + "num_tokens": 3831474846.0, + "step": 7496 + }, + { + "epoch": 2.027312060573283, + "grad_norm": 1.4463675022125244, + "learning_rate": 1.4128083912200851e-05, + "loss": 1.9657, + "mean_token_accuracy": 0.5491451621055603, + "num_tokens": 3831999077.0, + "step": 7497 + }, + { + "epoch": 2.0275824770146027, + "grad_norm": 1.4631565809249878, + "learning_rate": 1.4126605659757984e-05, + "loss": 1.9812, + "mean_token_accuracy": 0.5396848917007446, + "num_tokens": 3832523163.0, + "step": 7498 + }, + { + "epoch": 2.027852893455922, + "grad_norm": 1.2011513710021973, + "learning_rate": 1.412512731138166e-05, + "loss": 1.8544, + "mean_token_accuracy": 0.5680246353149414, + "num_tokens": 3833047148.0, + "step": 7499 + }, + { + "epoch": 2.0281233098972415, + "grad_norm": 1.3930696249008179, + "learning_rate": 1.412364886711724e-05, + "loss": 1.9426, + "mean_token_accuracy": 0.5434629917144775, + "num_tokens": 3833571403.0, + "step": 7500 + }, + { + "epoch": 2.028393726338561, + "grad_norm": 0.5562939643859863, + "learning_rate": 1.4122170327010088e-05, + "loss": 1.1538, + "mean_token_accuracy": 0.6936299204826355, + "num_tokens": 3834038183.0, + "step": 7501 + }, + { + "epoch": 2.028664142779881, + "grad_norm": 2.4127492904663086, + "learning_rate": 1.412069169110557e-05, + "loss": 1.9723, + "mean_token_accuracy": 0.5542908310890198, + "num_tokens": 3834562329.0, + "step": 7502 + }, + { + "epoch": 2.0289345592212005, + "grad_norm": 2.055433988571167, + "learning_rate": 1.4119212959449049e-05, + "loss": 1.9005, + "mean_token_accuracy": 0.5491228103637695, + "num_tokens": 3835078304.0, + "step": 7503 + }, + { + "epoch": 2.02920497566252, + "grad_norm": 1.131567120552063, + "learning_rate": 1.4117734132085903e-05, + "loss": 1.7603, + "mean_token_accuracy": 0.5878040790557861, + "num_tokens": 3835602544.0, + "step": 7504 + }, + { + "epoch": 2.0294753921038398, + "grad_norm": 1.5317373275756836, + "learning_rate": 1.4116255209061507e-05, + "loss": 1.8411, + "mean_token_accuracy": 0.5764197111129761, + "num_tokens": 3836126709.0, + "step": 7505 + }, + { + "epoch": 2.0297458085451594, + "grad_norm": 1.5736995935440063, + "learning_rate": 1.4114776190421235e-05, + "loss": 1.9889, + "mean_token_accuracy": 0.5375405550003052, + "num_tokens": 3836650971.0, + "step": 7506 + }, + { + "epoch": 2.030016224986479, + "grad_norm": 1.070518970489502, + "learning_rate": 1.411329707621047e-05, + "loss": 1.8701, + "mean_token_accuracy": 0.5700361728668213, + "num_tokens": 3837159029.0, + "step": 7507 + }, + { + "epoch": 2.0302866414277987, + "grad_norm": 1.1912747621536255, + "learning_rate": 1.4111817866474597e-05, + "loss": 1.9067, + "mean_token_accuracy": 0.5457574129104614, + "num_tokens": 3837683277.0, + "step": 7508 + }, + { + "epoch": 2.0305570578691183, + "grad_norm": 1.1637670993804932, + "learning_rate": 1.4110338561259e-05, + "loss": 1.8369, + "mean_token_accuracy": 0.5725631713867188, + "num_tokens": 3838207517.0, + "step": 7509 + }, + { + "epoch": 2.030827474310438, + "grad_norm": 1.2337356805801392, + "learning_rate": 1.4108859160609066e-05, + "loss": 1.8648, + "mean_token_accuracy": 0.5740379691123962, + "num_tokens": 3838673890.0, + "step": 7510 + }, + { + "epoch": 2.0310978907517576, + "grad_norm": 1.3017197847366333, + "learning_rate": 1.4107379664570193e-05, + "loss": 1.8493, + "mean_token_accuracy": 0.574384868144989, + "num_tokens": 3839198095.0, + "step": 7511 + }, + { + "epoch": 2.0313683071930773, + "grad_norm": 1.2114289999008179, + "learning_rate": 1.4105900073187773e-05, + "loss": 1.6972, + "mean_token_accuracy": 0.6315544247627258, + "num_tokens": 3839722327.0, + "step": 7512 + }, + { + "epoch": 2.031638723634397, + "grad_norm": 1.330946445465088, + "learning_rate": 1.4104420386507203e-05, + "loss": 1.9384, + "mean_token_accuracy": 0.5845885276794434, + "num_tokens": 3840203364.0, + "step": 7513 + }, + { + "epoch": 2.0319091400757165, + "grad_norm": 1.2773627042770386, + "learning_rate": 1.4102940604573889e-05, + "loss": 2.0389, + "mean_token_accuracy": 0.5408912897109985, + "num_tokens": 3840727625.0, + "step": 7514 + }, + { + "epoch": 2.032179556517036, + "grad_norm": 1.498918056488037, + "learning_rate": 1.4101460727433234e-05, + "loss": 1.9693, + "mean_token_accuracy": 0.5837959051132202, + "num_tokens": 3841149770.0, + "step": 7515 + }, + { + "epoch": 2.032449972958356, + "grad_norm": 1.2522624731063843, + "learning_rate": 1.4099980755130637e-05, + "loss": 1.9674, + "mean_token_accuracy": 0.5660936832427979, + "num_tokens": 3841670442.0, + "step": 7516 + }, + { + "epoch": 2.0327203893996755, + "grad_norm": 1.4296166896820068, + "learning_rate": 1.409850068771152e-05, + "loss": 1.9981, + "mean_token_accuracy": 0.5478821396827698, + "num_tokens": 3842194585.0, + "step": 7517 + }, + { + "epoch": 2.032990805840995, + "grad_norm": 1.1460484266281128, + "learning_rate": 1.4097020525221285e-05, + "loss": 1.8509, + "mean_token_accuracy": 0.5743227005004883, + "num_tokens": 3842718727.0, + "step": 7518 + }, + { + "epoch": 2.0332612222823148, + "grad_norm": 1.1493728160858154, + "learning_rate": 1.4095540267705354e-05, + "loss": 1.843, + "mean_token_accuracy": 0.5636284351348877, + "num_tokens": 3843188886.0, + "step": 7519 + }, + { + "epoch": 2.0335316387236344, + "grad_norm": 1.1634374856948853, + "learning_rate": 1.4094059915209144e-05, + "loss": 1.9457, + "mean_token_accuracy": 0.5605021715164185, + "num_tokens": 3843713128.0, + "step": 7520 + }, + { + "epoch": 2.033802055164954, + "grad_norm": 0.6002532243728638, + "learning_rate": 1.4092579467778077e-05, + "loss": 1.1598, + "mean_token_accuracy": 0.6955435276031494, + "num_tokens": 3844203036.0, + "step": 7521 + }, + { + "epoch": 2.0340724716062737, + "grad_norm": 1.3918626308441162, + "learning_rate": 1.409109892545758e-05, + "loss": 1.8626, + "mean_token_accuracy": 0.5520068407058716, + "num_tokens": 3844727222.0, + "step": 7522 + }, + { + "epoch": 2.0343428880475933, + "grad_norm": 1.3252370357513428, + "learning_rate": 1.4089618288293075e-05, + "loss": 1.9874, + "mean_token_accuracy": 0.5455420613288879, + "num_tokens": 3845251434.0, + "step": 7523 + }, + { + "epoch": 2.034613304488913, + "grad_norm": 0.9577711820602417, + "learning_rate": 1.4088137556329997e-05, + "loss": 1.9602, + "mean_token_accuracy": 0.5563749074935913, + "num_tokens": 3845760650.0, + "step": 7524 + }, + { + "epoch": 2.0348837209302326, + "grad_norm": 1.3740209341049194, + "learning_rate": 1.4086656729613772e-05, + "loss": 1.9212, + "mean_token_accuracy": 0.559261679649353, + "num_tokens": 3846284655.0, + "step": 7525 + }, + { + "epoch": 2.0351541373715523, + "grad_norm": 1.2269431352615356, + "learning_rate": 1.4085175808189846e-05, + "loss": 1.8944, + "mean_token_accuracy": 0.5654480457305908, + "num_tokens": 3846808863.0, + "step": 7526 + }, + { + "epoch": 2.035424553812872, + "grad_norm": 1.1079189777374268, + "learning_rate": 1.4083694792103652e-05, + "loss": 1.8266, + "mean_token_accuracy": 0.5876612067222595, + "num_tokens": 3847333139.0, + "step": 7527 + }, + { + "epoch": 2.0356949702541915, + "grad_norm": 1.3132879734039307, + "learning_rate": 1.4082213681400634e-05, + "loss": 1.7037, + "mean_token_accuracy": 0.6009108424186707, + "num_tokens": 3847857384.0, + "step": 7528 + }, + { + "epoch": 2.035965386695511, + "grad_norm": 1.4441766738891602, + "learning_rate": 1.4080732476126235e-05, + "loss": 1.8967, + "mean_token_accuracy": 0.5806445479393005, + "num_tokens": 3848381647.0, + "step": 7529 + }, + { + "epoch": 2.036235803136831, + "grad_norm": 1.2402808666229248, + "learning_rate": 1.4079251176325907e-05, + "loss": 1.9364, + "mean_token_accuracy": 0.5499122142791748, + "num_tokens": 3848905711.0, + "step": 7530 + }, + { + "epoch": 2.0365062195781505, + "grad_norm": 9.741620063781738, + "learning_rate": 1.4077769782045093e-05, + "loss": 1.966, + "mean_token_accuracy": 0.5940333604812622, + "num_tokens": 3849396153.0, + "step": 7531 + }, + { + "epoch": 2.03677663601947, + "grad_norm": 1.7227131128311157, + "learning_rate": 1.4076288293329253e-05, + "loss": 2.0083, + "mean_token_accuracy": 0.5449011325836182, + "num_tokens": 3849920415.0, + "step": 7532 + }, + { + "epoch": 2.0370470524607898, + "grad_norm": 1.6395307779312134, + "learning_rate": 1.4074806710223839e-05, + "loss": 2.0783, + "mean_token_accuracy": 0.5243579149246216, + "num_tokens": 3850444579.0, + "step": 7533 + }, + { + "epoch": 2.0373174689021094, + "grad_norm": 1.2275792360305786, + "learning_rate": 1.4073325032774315e-05, + "loss": 1.9354, + "mean_token_accuracy": 0.5665684938430786, + "num_tokens": 3850943684.0, + "step": 7534 + }, + { + "epoch": 2.037587885343429, + "grad_norm": 1.342410683631897, + "learning_rate": 1.4071843261026141e-05, + "loss": 2.0302, + "mean_token_accuracy": 0.555486798286438, + "num_tokens": 3851363379.0, + "step": 7535 + }, + { + "epoch": 2.0378583017847487, + "grad_norm": 1.1984130144119263, + "learning_rate": 1.4070361395024779e-05, + "loss": 2.0647, + "mean_token_accuracy": 0.5316801071166992, + "num_tokens": 3851887474.0, + "step": 7536 + }, + { + "epoch": 2.0381287182260683, + "grad_norm": 1.2621290683746338, + "learning_rate": 1.4068879434815708e-05, + "loss": 2.0251, + "mean_token_accuracy": 0.5503789186477661, + "num_tokens": 3852381690.0, + "step": 7537 + }, + { + "epoch": 2.038399134667388, + "grad_norm": 1.3331531286239624, + "learning_rate": 1.4067397380444383e-05, + "loss": 2.0236, + "mean_token_accuracy": 0.5508705973625183, + "num_tokens": 3852860596.0, + "step": 7538 + }, + { + "epoch": 2.0386695511087076, + "grad_norm": 0.9924965500831604, + "learning_rate": 1.4065915231956286e-05, + "loss": 1.8875, + "mean_token_accuracy": 0.5647345781326294, + "num_tokens": 3853384854.0, + "step": 7539 + }, + { + "epoch": 2.038939967550027, + "grad_norm": 1.262770652770996, + "learning_rate": 1.4064432989396896e-05, + "loss": 1.9704, + "mean_token_accuracy": 0.5504292249679565, + "num_tokens": 3853909135.0, + "step": 7540 + }, + { + "epoch": 2.0392103839913465, + "grad_norm": 0.5285135507583618, + "learning_rate": 1.4062950652811691e-05, + "loss": 1.0993, + "mean_token_accuracy": 0.7067173719406128, + "num_tokens": 3854433294.0, + "step": 7541 + }, + { + "epoch": 2.039480800432666, + "grad_norm": 1.4994007349014282, + "learning_rate": 1.4061468222246148e-05, + "loss": 1.7314, + "mean_token_accuracy": 0.603693962097168, + "num_tokens": 3854957479.0, + "step": 7542 + }, + { + "epoch": 2.0397512168739858, + "grad_norm": 1.3672770261764526, + "learning_rate": 1.4059985697745764e-05, + "loss": 1.9899, + "mean_token_accuracy": 0.5450354218482971, + "num_tokens": 3855461768.0, + "step": 7543 + }, + { + "epoch": 2.0400216333153054, + "grad_norm": 1.2538070678710938, + "learning_rate": 1.4058503079356016e-05, + "loss": 1.9071, + "mean_token_accuracy": 0.5593597888946533, + "num_tokens": 3855957663.0, + "step": 7544 + }, + { + "epoch": 2.040292049756625, + "grad_norm": 1.4409654140472412, + "learning_rate": 1.4057020367122398e-05, + "loss": 1.9525, + "mean_token_accuracy": 0.5658285021781921, + "num_tokens": 3856481944.0, + "step": 7545 + }, + { + "epoch": 2.0405624661979447, + "grad_norm": 1.2961030006408691, + "learning_rate": 1.4055537561090407e-05, + "loss": 1.9867, + "mean_token_accuracy": 0.560866117477417, + "num_tokens": 3857002596.0, + "step": 7546 + }, + { + "epoch": 2.0408328826392643, + "grad_norm": 1.2456837892532349, + "learning_rate": 1.4054054661305535e-05, + "loss": 1.9994, + "mean_token_accuracy": 0.5417048335075378, + "num_tokens": 3857526827.0, + "step": 7547 + }, + { + "epoch": 2.041103299080584, + "grad_norm": 1.2571260929107666, + "learning_rate": 1.4052571667813287e-05, + "loss": 1.8882, + "mean_token_accuracy": 0.5750809907913208, + "num_tokens": 3858017397.0, + "step": 7548 + }, + { + "epoch": 2.0413737155219036, + "grad_norm": 1.105129599571228, + "learning_rate": 1.405108858065916e-05, + "loss": 1.8124, + "mean_token_accuracy": 0.5757300853729248, + "num_tokens": 3858541635.0, + "step": 7549 + }, + { + "epoch": 2.0416441319632233, + "grad_norm": 0.9875454902648926, + "learning_rate": 1.4049605399888668e-05, + "loss": 1.9048, + "mean_token_accuracy": 0.5712671875953674, + "num_tokens": 3859065852.0, + "step": 7550 + }, + { + "epoch": 2.041914548404543, + "grad_norm": 0.9459758400917053, + "learning_rate": 1.4048122125547312e-05, + "loss": 1.8826, + "mean_token_accuracy": 0.5442022085189819, + "num_tokens": 3859590088.0, + "step": 7551 + }, + { + "epoch": 2.0421849648458625, + "grad_norm": 0.977205753326416, + "learning_rate": 1.4046638757680601e-05, + "loss": 1.9338, + "mean_token_accuracy": 0.5560933947563171, + "num_tokens": 3860114284.0, + "step": 7552 + }, + { + "epoch": 2.042455381287182, + "grad_norm": 1.1206629276275635, + "learning_rate": 1.4045155296334059e-05, + "loss": 2.076, + "mean_token_accuracy": 0.5267125964164734, + "num_tokens": 3860638537.0, + "step": 7553 + }, + { + "epoch": 2.042725797728502, + "grad_norm": 1.0375388860702515, + "learning_rate": 1.4043671741553196e-05, + "loss": 1.7384, + "mean_token_accuracy": 0.5939933061599731, + "num_tokens": 3861160443.0, + "step": 7554 + }, + { + "epoch": 2.0429962141698215, + "grad_norm": 1.3158384561538696, + "learning_rate": 1.404218809338353e-05, + "loss": 1.944, + "mean_token_accuracy": 0.557506263256073, + "num_tokens": 3861684664.0, + "step": 7555 + }, + { + "epoch": 2.043266630611141, + "grad_norm": 1.360026240348816, + "learning_rate": 1.4040704351870588e-05, + "loss": 1.8612, + "mean_token_accuracy": 0.5848879218101501, + "num_tokens": 3862208904.0, + "step": 7556 + }, + { + "epoch": 2.0435370470524608, + "grad_norm": 1.320473313331604, + "learning_rate": 1.40392205170599e-05, + "loss": 1.8593, + "mean_token_accuracy": 0.5596538782119751, + "num_tokens": 3862698746.0, + "step": 7557 + }, + { + "epoch": 2.0438074634937804, + "grad_norm": 1.2590012550354004, + "learning_rate": 1.4037736588996982e-05, + "loss": 1.8834, + "mean_token_accuracy": 0.5654474496841431, + "num_tokens": 3863222945.0, + "step": 7558 + }, + { + "epoch": 2.0440778799351, + "grad_norm": 1.4192088842391968, + "learning_rate": 1.4036252567727375e-05, + "loss": 1.8313, + "mean_token_accuracy": 0.5903058648109436, + "num_tokens": 3863747210.0, + "step": 7559 + }, + { + "epoch": 2.0443482963764197, + "grad_norm": 1.1691635847091675, + "learning_rate": 1.403476845329661e-05, + "loss": 1.9505, + "mean_token_accuracy": 0.5618542432785034, + "num_tokens": 3864271414.0, + "step": 7560 + }, + { + "epoch": 2.0446187128177393, + "grad_norm": 0.6585413813591003, + "learning_rate": 1.4033284245750222e-05, + "loss": 1.1184, + "mean_token_accuracy": 0.7046586275100708, + "num_tokens": 3864700963.0, + "step": 7561 + }, + { + "epoch": 2.044889129259059, + "grad_norm": 1.660720705986023, + "learning_rate": 1.4031799945133754e-05, + "loss": 1.8363, + "mean_token_accuracy": 0.5875274538993835, + "num_tokens": 3865188405.0, + "step": 7562 + }, + { + "epoch": 2.0451595457003786, + "grad_norm": 1.2699577808380127, + "learning_rate": 1.403031555149275e-05, + "loss": 1.9924, + "mean_token_accuracy": 0.5582238435745239, + "num_tokens": 3865712395.0, + "step": 7563 + }, + { + "epoch": 2.0454299621416983, + "grad_norm": 1.011606216430664, + "learning_rate": 1.4028831064872753e-05, + "loss": 1.8539, + "mean_token_accuracy": 0.561138391494751, + "num_tokens": 3866236562.0, + "step": 7564 + }, + { + "epoch": 2.045700378583018, + "grad_norm": 1.6154606342315674, + "learning_rate": 1.4027346485319309e-05, + "loss": 2.0321, + "mean_token_accuracy": 0.5409584641456604, + "num_tokens": 3866716536.0, + "step": 7565 + }, + { + "epoch": 2.0459707950243375, + "grad_norm": 1.3066269159317017, + "learning_rate": 1.4025861812877972e-05, + "loss": 1.9254, + "mean_token_accuracy": 0.5660285949707031, + "num_tokens": 3867215464.0, + "step": 7566 + }, + { + "epoch": 2.046241211465657, + "grad_norm": 1.1609166860580444, + "learning_rate": 1.40243770475943e-05, + "loss": 1.8709, + "mean_token_accuracy": 0.5504292249679565, + "num_tokens": 3867739641.0, + "step": 7567 + }, + { + "epoch": 2.046511627906977, + "grad_norm": 1.6473222970962524, + "learning_rate": 1.4022892189513841e-05, + "loss": 1.9182, + "mean_token_accuracy": 0.560910701751709, + "num_tokens": 3868263881.0, + "step": 7568 + }, + { + "epoch": 2.0467820443482965, + "grad_norm": 1.3518576622009277, + "learning_rate": 1.4021407238682163e-05, + "loss": 1.844, + "mean_token_accuracy": 0.5890023112297058, + "num_tokens": 3868788151.0, + "step": 7569 + }, + { + "epoch": 2.047052460789616, + "grad_norm": 1.2490075826644897, + "learning_rate": 1.4019922195144827e-05, + "loss": 1.988, + "mean_token_accuracy": 0.5374331474304199, + "num_tokens": 3869312428.0, + "step": 7570 + }, + { + "epoch": 2.0473228772309358, + "grad_norm": 1.3217719793319702, + "learning_rate": 1.4018437058947394e-05, + "loss": 1.6921, + "mean_token_accuracy": 0.5780369639396667, + "num_tokens": 3869836581.0, + "step": 7571 + }, + { + "epoch": 2.0475932936722554, + "grad_norm": 1.3804417848587036, + "learning_rate": 1.4016951830135437e-05, + "loss": 1.9733, + "mean_token_accuracy": 0.5492736101150513, + "num_tokens": 3870319407.0, + "step": 7572 + }, + { + "epoch": 2.047863710113575, + "grad_norm": 1.1186587810516357, + "learning_rate": 1.4015466508754524e-05, + "loss": 1.7835, + "mean_token_accuracy": 0.5809926390647888, + "num_tokens": 3870843608.0, + "step": 7573 + }, + { + "epoch": 2.0481341265548947, + "grad_norm": 1.3324404954910278, + "learning_rate": 1.4013981094850232e-05, + "loss": 1.779, + "mean_token_accuracy": 0.5613747835159302, + "num_tokens": 3871367878.0, + "step": 7574 + }, + { + "epoch": 2.0484045429962143, + "grad_norm": 1.5291355848312378, + "learning_rate": 1.4012495588468134e-05, + "loss": 1.9739, + "mean_token_accuracy": 0.5531197786331177, + "num_tokens": 3871892106.0, + "step": 7575 + }, + { + "epoch": 2.048674959437534, + "grad_norm": 1.1553304195404053, + "learning_rate": 1.4011009989653817e-05, + "loss": 1.8762, + "mean_token_accuracy": 0.5659422874450684, + "num_tokens": 3872416236.0, + "step": 7576 + }, + { + "epoch": 2.0489453758788536, + "grad_norm": 1.46017587184906, + "learning_rate": 1.4009524298452854e-05, + "loss": 1.9209, + "mean_token_accuracy": 0.5652600526809692, + "num_tokens": 3872940362.0, + "step": 7577 + }, + { + "epoch": 2.0492157923201733, + "grad_norm": 1.4327574968338013, + "learning_rate": 1.4008038514910838e-05, + "loss": 1.8804, + "mean_token_accuracy": 0.5712611079216003, + "num_tokens": 3873464627.0, + "step": 7578 + }, + { + "epoch": 2.049486208761493, + "grad_norm": 1.2361572980880737, + "learning_rate": 1.4006552639073354e-05, + "loss": 1.7542, + "mean_token_accuracy": 0.6146438121795654, + "num_tokens": 3873988747.0, + "step": 7579 + }, + { + "epoch": 2.0497566252028125, + "grad_norm": 1.2198173999786377, + "learning_rate": 1.4005066670985993e-05, + "loss": 1.9683, + "mean_token_accuracy": 0.5478410124778748, + "num_tokens": 3874512958.0, + "step": 7580 + }, + { + "epoch": 2.0500270416441317, + "grad_norm": 0.6432572603225708, + "learning_rate": 1.4003580610694349e-05, + "loss": 1.0777, + "mean_token_accuracy": 0.7156680822372437, + "num_tokens": 3875037210.0, + "step": 7581 + }, + { + "epoch": 2.0502974580854514, + "grad_norm": 1.438372254371643, + "learning_rate": 1.4002094458244018e-05, + "loss": 1.8158, + "mean_token_accuracy": 0.5730190873146057, + "num_tokens": 3875561298.0, + "step": 7582 + }, + { + "epoch": 2.050567874526771, + "grad_norm": 1.742600679397583, + "learning_rate": 1.4000608213680601e-05, + "loss": 1.8802, + "mean_token_accuracy": 0.5779403448104858, + "num_tokens": 3876067699.0, + "step": 7583 + }, + { + "epoch": 2.0508382909680907, + "grad_norm": 1.2678117752075195, + "learning_rate": 1.39991218770497e-05, + "loss": 2.1087, + "mean_token_accuracy": 0.5147101879119873, + "num_tokens": 3876591962.0, + "step": 7584 + }, + { + "epoch": 2.0511087074094103, + "grad_norm": 1.3800567388534546, + "learning_rate": 1.399763544839692e-05, + "loss": 1.8421, + "mean_token_accuracy": 0.570834219455719, + "num_tokens": 3877116206.0, + "step": 7585 + }, + { + "epoch": 2.05137912385073, + "grad_norm": 1.4545456171035767, + "learning_rate": 1.399614892776787e-05, + "loss": 2.0634, + "mean_token_accuracy": 0.5302112698554993, + "num_tokens": 3877640380.0, + "step": 7586 + }, + { + "epoch": 2.0516495402920496, + "grad_norm": 1.2887645959854126, + "learning_rate": 1.3994662315208159e-05, + "loss": 1.9571, + "mean_token_accuracy": 0.5601317882537842, + "num_tokens": 3878164663.0, + "step": 7587 + }, + { + "epoch": 2.0519199567333692, + "grad_norm": 1.3863974809646606, + "learning_rate": 1.3993175610763398e-05, + "loss": 2.0161, + "mean_token_accuracy": 0.559908390045166, + "num_tokens": 3878647408.0, + "step": 7588 + }, + { + "epoch": 2.052190373174689, + "grad_norm": 1.472543478012085, + "learning_rate": 1.3991688814479212e-05, + "loss": 1.7849, + "mean_token_accuracy": 0.5960240364074707, + "num_tokens": 3879153980.0, + "step": 7589 + }, + { + "epoch": 2.0524607896160085, + "grad_norm": 1.258112907409668, + "learning_rate": 1.3990201926401213e-05, + "loss": 2.0269, + "mean_token_accuracy": 0.5586003065109253, + "num_tokens": 3879678191.0, + "step": 7590 + }, + { + "epoch": 2.052731206057328, + "grad_norm": 1.1938412189483643, + "learning_rate": 1.3988714946575026e-05, + "loss": 1.6751, + "mean_token_accuracy": 0.5839599370956421, + "num_tokens": 3880178577.0, + "step": 7591 + }, + { + "epoch": 2.053001622498648, + "grad_norm": 1.5588467121124268, + "learning_rate": 1.3987227875046274e-05, + "loss": 1.982, + "mean_token_accuracy": 0.5495704412460327, + "num_tokens": 3880702707.0, + "step": 7592 + }, + { + "epoch": 2.0532720389399675, + "grad_norm": 1.0449538230895996, + "learning_rate": 1.3985740711860587e-05, + "loss": 2.019, + "mean_token_accuracy": 0.5598952770233154, + "num_tokens": 3881226978.0, + "step": 7593 + }, + { + "epoch": 2.053542455381287, + "grad_norm": 1.1681804656982422, + "learning_rate": 1.3984253457063595e-05, + "loss": 1.8728, + "mean_token_accuracy": 0.5749784111976624, + "num_tokens": 3881751248.0, + "step": 7594 + }, + { + "epoch": 2.0538128718226067, + "grad_norm": 1.3810639381408691, + "learning_rate": 1.3982766110700931e-05, + "loss": 1.8536, + "mean_token_accuracy": 0.5635836124420166, + "num_tokens": 3882275510.0, + "step": 7595 + }, + { + "epoch": 2.0540832882639264, + "grad_norm": 1.1762422323226929, + "learning_rate": 1.3981278672818229e-05, + "loss": 1.9289, + "mean_token_accuracy": 0.5636227130889893, + "num_tokens": 3882799609.0, + "step": 7596 + }, + { + "epoch": 2.054353704705246, + "grad_norm": 1.0365149974822998, + "learning_rate": 1.397979114346113e-05, + "loss": 1.8396, + "mean_token_accuracy": 0.5782372951507568, + "num_tokens": 3883323815.0, + "step": 7597 + }, + { + "epoch": 2.0546241211465657, + "grad_norm": 1.5516213178634644, + "learning_rate": 1.3978303522675276e-05, + "loss": 1.9217, + "mean_token_accuracy": 0.5686507225036621, + "num_tokens": 3883790018.0, + "step": 7598 + }, + { + "epoch": 2.0548945375878853, + "grad_norm": 1.3300752639770508, + "learning_rate": 1.3976815810506314e-05, + "loss": 1.9316, + "mean_token_accuracy": 0.5597570538520813, + "num_tokens": 3884314234.0, + "step": 7599 + }, + { + "epoch": 2.055164954029205, + "grad_norm": 1.1166404485702515, + "learning_rate": 1.3975328006999887e-05, + "loss": 1.9242, + "mean_token_accuracy": 0.5672358274459839, + "num_tokens": 3884838490.0, + "step": 7600 + }, + { + "epoch": 2.0554353704705246, + "grad_norm": 0.6372988224029541, + "learning_rate": 1.3973840112201644e-05, + "loss": 1.1104, + "mean_token_accuracy": 0.708840548992157, + "num_tokens": 3885280894.0, + "step": 7601 + }, + { + "epoch": 2.0557057869118442, + "grad_norm": 2.064924716949463, + "learning_rate": 1.3972352126157242e-05, + "loss": 1.93, + "mean_token_accuracy": 0.5830745697021484, + "num_tokens": 3885740932.0, + "step": 7602 + }, + { + "epoch": 2.055976203353164, + "grad_norm": 1.7506306171417236, + "learning_rate": 1.3970864048912338e-05, + "loss": 1.8511, + "mean_token_accuracy": 0.5680898427963257, + "num_tokens": 3886265205.0, + "step": 7603 + }, + { + "epoch": 2.0562466197944835, + "grad_norm": 1.334222435951233, + "learning_rate": 1.3969375880512587e-05, + "loss": 2.0009, + "mean_token_accuracy": 0.557672381401062, + "num_tokens": 3886748800.0, + "step": 7604 + }, + { + "epoch": 2.056517036235803, + "grad_norm": 1.633994460105896, + "learning_rate": 1.3967887621003654e-05, + "loss": 1.93, + "mean_token_accuracy": 0.576165497303009, + "num_tokens": 3887150741.0, + "step": 7605 + }, + { + "epoch": 2.056787452677123, + "grad_norm": 1.8487390279769897, + "learning_rate": 1.3966399270431199e-05, + "loss": 1.7944, + "mean_token_accuracy": 0.6029539704322815, + "num_tokens": 3887614932.0, + "step": 7606 + }, + { + "epoch": 2.0570578691184425, + "grad_norm": 1.5522353649139404, + "learning_rate": 1.396491082884089e-05, + "loss": 1.934, + "mean_token_accuracy": 0.5466150045394897, + "num_tokens": 3888139047.0, + "step": 7607 + }, + { + "epoch": 2.057328285559762, + "grad_norm": 1.4352235794067383, + "learning_rate": 1.3963422296278397e-05, + "loss": 1.9907, + "mean_token_accuracy": 0.5506629347801208, + "num_tokens": 3888652642.0, + "step": 7608 + }, + { + "epoch": 2.0575987020010817, + "grad_norm": 1.3910715579986572, + "learning_rate": 1.3961933672789398e-05, + "loss": 2.0352, + "mean_token_accuracy": 0.5254300236701965, + "num_tokens": 3889176813.0, + "step": 7609 + }, + { + "epoch": 2.0578691184424014, + "grad_norm": 1.4488941431045532, + "learning_rate": 1.3960444958419557e-05, + "loss": 1.83, + "mean_token_accuracy": 0.5662255883216858, + "num_tokens": 3889678988.0, + "step": 7610 + }, + { + "epoch": 2.058139534883721, + "grad_norm": 1.2547948360443115, + "learning_rate": 1.395895615321456e-05, + "loss": 1.8946, + "mean_token_accuracy": 0.558860182762146, + "num_tokens": 3890203260.0, + "step": 7611 + }, + { + "epoch": 2.0584099513250407, + "grad_norm": 1.355989694595337, + "learning_rate": 1.3957467257220085e-05, + "loss": 1.9914, + "mean_token_accuracy": 0.5460898876190186, + "num_tokens": 3890727461.0, + "step": 7612 + }, + { + "epoch": 2.0586803677663603, + "grad_norm": 1.2124758958816528, + "learning_rate": 1.3955978270481823e-05, + "loss": 1.8717, + "mean_token_accuracy": 0.5603470802307129, + "num_tokens": 3891251604.0, + "step": 7613 + }, + { + "epoch": 2.05895078420768, + "grad_norm": 1.403152346611023, + "learning_rate": 1.3954489193045451e-05, + "loss": 1.978, + "mean_token_accuracy": 0.563843309879303, + "num_tokens": 3891775714.0, + "step": 7614 + }, + { + "epoch": 2.0592212006489996, + "grad_norm": 1.4095666408538818, + "learning_rate": 1.3953000024956662e-05, + "loss": 1.8358, + "mean_token_accuracy": 0.5742456912994385, + "num_tokens": 3892252486.0, + "step": 7615 + }, + { + "epoch": 2.0594916170903192, + "grad_norm": 1.3300578594207764, + "learning_rate": 1.3951510766261152e-05, + "loss": 1.9698, + "mean_token_accuracy": 0.5634562969207764, + "num_tokens": 3892776699.0, + "step": 7616 + }, + { + "epoch": 2.059762033531639, + "grad_norm": 1.1598422527313232, + "learning_rate": 1.3950021417004607e-05, + "loss": 1.7797, + "mean_token_accuracy": 0.5832779407501221, + "num_tokens": 3893300942.0, + "step": 7617 + }, + { + "epoch": 2.0600324499729585, + "grad_norm": 1.0240789651870728, + "learning_rate": 1.3948531977232728e-05, + "loss": 1.902, + "mean_token_accuracy": 0.5411213636398315, + "num_tokens": 3893825101.0, + "step": 7618 + }, + { + "epoch": 2.060302866414278, + "grad_norm": 1.278335690498352, + "learning_rate": 1.3947042446991221e-05, + "loss": 1.9428, + "mean_token_accuracy": 0.5512838363647461, + "num_tokens": 3894349076.0, + "step": 7619 + }, + { + "epoch": 2.060573282855598, + "grad_norm": 1.1403489112854004, + "learning_rate": 1.3945552826325783e-05, + "loss": 1.9998, + "mean_token_accuracy": 0.5298553109169006, + "num_tokens": 3894843997.0, + "step": 7620 + }, + { + "epoch": 2.0608436992969175, + "grad_norm": 0.5439230799674988, + "learning_rate": 1.3944063115282126e-05, + "loss": 1.3304, + "mean_token_accuracy": 0.6476956605911255, + "num_tokens": 3895368176.0, + "step": 7621 + }, + { + "epoch": 2.0611141157382367, + "grad_norm": 1.4173598289489746, + "learning_rate": 1.3942573313905952e-05, + "loss": 1.8125, + "mean_token_accuracy": 0.5790482759475708, + "num_tokens": 3895882822.0, + "step": 7622 + }, + { + "epoch": 2.0613845321795563, + "grad_norm": 1.370124340057373, + "learning_rate": 1.3941083422242974e-05, + "loss": 1.8563, + "mean_token_accuracy": 0.5832078456878662, + "num_tokens": 3896406994.0, + "step": 7623 + }, + { + "epoch": 2.061654948620876, + "grad_norm": 1.243962287902832, + "learning_rate": 1.3939593440338906e-05, + "loss": 2.0035, + "mean_token_accuracy": 0.5599675178527832, + "num_tokens": 3896931274.0, + "step": 7624 + }, + { + "epoch": 2.0619253650621956, + "grad_norm": 1.3584164381027222, + "learning_rate": 1.3938103368239469e-05, + "loss": 1.8228, + "mean_token_accuracy": 0.6094629764556885, + "num_tokens": 3897364500.0, + "step": 7625 + }, + { + "epoch": 2.0621957815035152, + "grad_norm": 1.290166974067688, + "learning_rate": 1.393661320599038e-05, + "loss": 2.0067, + "mean_token_accuracy": 0.5502200126647949, + "num_tokens": 3897848904.0, + "step": 7626 + }, + { + "epoch": 2.062466197944835, + "grad_norm": 1.0426056385040283, + "learning_rate": 1.393512295363736e-05, + "loss": 1.9661, + "mean_token_accuracy": 0.5479593873023987, + "num_tokens": 3898373099.0, + "step": 7627 + }, + { + "epoch": 2.0627366143861545, + "grad_norm": 1.0602071285247803, + "learning_rate": 1.3933632611226138e-05, + "loss": 1.9145, + "mean_token_accuracy": 0.564805269241333, + "num_tokens": 3898897202.0, + "step": 7628 + }, + { + "epoch": 2.063007030827474, + "grad_norm": 1.109924554824829, + "learning_rate": 1.3932142178802441e-05, + "loss": 1.8572, + "mean_token_accuracy": 0.579521894454956, + "num_tokens": 3899421444.0, + "step": 7629 + }, + { + "epoch": 2.063277447268794, + "grad_norm": 0.9854384660720825, + "learning_rate": 1.3930651656412e-05, + "loss": 1.828, + "mean_token_accuracy": 0.5807462334632874, + "num_tokens": 3899945727.0, + "step": 7630 + }, + { + "epoch": 2.0635478637101135, + "grad_norm": 1.1877249479293823, + "learning_rate": 1.3929161044100544e-05, + "loss": 1.9095, + "mean_token_accuracy": 0.5652124285697937, + "num_tokens": 3900383545.0, + "step": 7631 + }, + { + "epoch": 2.063818280151433, + "grad_norm": 1.1944199800491333, + "learning_rate": 1.3927670341913813e-05, + "loss": 1.9033, + "mean_token_accuracy": 0.5643010139465332, + "num_tokens": 3900907732.0, + "step": 7632 + }, + { + "epoch": 2.0640886965927527, + "grad_norm": 1.3335161209106445, + "learning_rate": 1.392617954989755e-05, + "loss": 1.9278, + "mean_token_accuracy": 0.5516095757484436, + "num_tokens": 3901373034.0, + "step": 7633 + }, + { + "epoch": 2.0643591130340724, + "grad_norm": 1.2085405588150024, + "learning_rate": 1.392468866809749e-05, + "loss": 1.7704, + "mean_token_accuracy": 0.6120995283126831, + "num_tokens": 3901897197.0, + "step": 7634 + }, + { + "epoch": 2.064629529475392, + "grad_norm": 0.9986118674278259, + "learning_rate": 1.3923197696559381e-05, + "loss": 1.7936, + "mean_token_accuracy": 0.5803841352462769, + "num_tokens": 3902418271.0, + "step": 7635 + }, + { + "epoch": 2.0648999459167117, + "grad_norm": 1.126429796218872, + "learning_rate": 1.3921706635328974e-05, + "loss": 1.8852, + "mean_token_accuracy": 0.5739061236381531, + "num_tokens": 3902942412.0, + "step": 7636 + }, + { + "epoch": 2.0651703623580313, + "grad_norm": 1.3222171068191528, + "learning_rate": 1.392021548445201e-05, + "loss": 1.9445, + "mean_token_accuracy": 0.549118161201477, + "num_tokens": 3903466596.0, + "step": 7637 + }, + { + "epoch": 2.065440778799351, + "grad_norm": 0.9773605465888977, + "learning_rate": 1.3918724243974249e-05, + "loss": 1.8203, + "mean_token_accuracy": 0.5614563226699829, + "num_tokens": 3903990845.0, + "step": 7638 + }, + { + "epoch": 2.0657111952406706, + "grad_norm": 1.2941404581069946, + "learning_rate": 1.3917232913941446e-05, + "loss": 2.0048, + "mean_token_accuracy": 0.5480475425720215, + "num_tokens": 3904486779.0, + "step": 7639 + }, + { + "epoch": 2.0659816116819902, + "grad_norm": 1.2872474193572998, + "learning_rate": 1.3915741494399355e-05, + "loss": 1.9281, + "mean_token_accuracy": 0.5858927965164185, + "num_tokens": 3904881749.0, + "step": 7640 + }, + { + "epoch": 2.06625202812331, + "grad_norm": 0.5816478729248047, + "learning_rate": 1.3914249985393743e-05, + "loss": 1.1292, + "mean_token_accuracy": 0.6774865984916687, + "num_tokens": 3905405821.0, + "step": 7641 + }, + { + "epoch": 2.0665224445646295, + "grad_norm": 1.862755537033081, + "learning_rate": 1.3912758386970373e-05, + "loss": 2.0424, + "mean_token_accuracy": 0.5324276685714722, + "num_tokens": 3905930076.0, + "step": 7642 + }, + { + "epoch": 2.066792861005949, + "grad_norm": 1.3741542100906372, + "learning_rate": 1.3911266699175009e-05, + "loss": 2.0154, + "mean_token_accuracy": 0.5489463210105896, + "num_tokens": 3906454251.0, + "step": 7643 + }, + { + "epoch": 2.067063277447269, + "grad_norm": 1.1727781295776367, + "learning_rate": 1.3909774922053418e-05, + "loss": 1.9189, + "mean_token_accuracy": 0.5738193988800049, + "num_tokens": 3906950014.0, + "step": 7644 + }, + { + "epoch": 2.0673336938885885, + "grad_norm": 1.3293248414993286, + "learning_rate": 1.3908283055651376e-05, + "loss": 1.8966, + "mean_token_accuracy": 0.5590672492980957, + "num_tokens": 3907474172.0, + "step": 7645 + }, + { + "epoch": 2.067604110329908, + "grad_norm": 1.2032979726791382, + "learning_rate": 1.3906791100014658e-05, + "loss": 1.8617, + "mean_token_accuracy": 0.5755646824836731, + "num_tokens": 3907955764.0, + "step": 7646 + }, + { + "epoch": 2.0678745267712277, + "grad_norm": 1.2292110919952393, + "learning_rate": 1.3905299055189041e-05, + "loss": 1.9349, + "mean_token_accuracy": 0.5716042518615723, + "num_tokens": 3908479947.0, + "step": 7647 + }, + { + "epoch": 2.0681449432125474, + "grad_norm": 1.2559973001480103, + "learning_rate": 1.3903806921220305e-05, + "loss": 1.9297, + "mean_token_accuracy": 0.5679627060890198, + "num_tokens": 3909004116.0, + "step": 7648 + }, + { + "epoch": 2.068415359653867, + "grad_norm": 1.5123646259307861, + "learning_rate": 1.3902314698154236e-05, + "loss": 1.8894, + "mean_token_accuracy": 0.5679234266281128, + "num_tokens": 3909528329.0, + "step": 7649 + }, + { + "epoch": 2.0686857760951867, + "grad_norm": 0.9692097902297974, + "learning_rate": 1.3900822386036612e-05, + "loss": 1.7799, + "mean_token_accuracy": 0.5863450765609741, + "num_tokens": 3910040711.0, + "step": 7650 + }, + { + "epoch": 2.0689561925365063, + "grad_norm": 1.1234415769577026, + "learning_rate": 1.3899329984913228e-05, + "loss": 1.9724, + "mean_token_accuracy": 0.5600762367248535, + "num_tokens": 3910540127.0, + "step": 7651 + }, + { + "epoch": 2.069226608977826, + "grad_norm": 1.0865848064422607, + "learning_rate": 1.3897837494829877e-05, + "loss": 1.9843, + "mean_token_accuracy": 0.5637008547782898, + "num_tokens": 3911064393.0, + "step": 7652 + }, + { + "epoch": 2.0694970254191456, + "grad_norm": 1.1316604614257812, + "learning_rate": 1.3896344915832344e-05, + "loss": 1.826, + "mean_token_accuracy": 0.579596996307373, + "num_tokens": 3911569474.0, + "step": 7653 + }, + { + "epoch": 2.0697674418604652, + "grad_norm": 1.2401260137557983, + "learning_rate": 1.3894852247966436e-05, + "loss": 1.7905, + "mean_token_accuracy": 0.575955867767334, + "num_tokens": 3912088349.0, + "step": 7654 + }, + { + "epoch": 2.070037858301785, + "grad_norm": 1.3828544616699219, + "learning_rate": 1.3893359491277947e-05, + "loss": 1.9403, + "mean_token_accuracy": 0.5530638694763184, + "num_tokens": 3912612624.0, + "step": 7655 + }, + { + "epoch": 2.0703082747431045, + "grad_norm": 1.3963345289230347, + "learning_rate": 1.3891866645812676e-05, + "loss": 1.8838, + "mean_token_accuracy": 0.5707833170890808, + "num_tokens": 3913094142.0, + "step": 7656 + }, + { + "epoch": 2.070578691184424, + "grad_norm": 1.2644591331481934, + "learning_rate": 1.3890373711616436e-05, + "loss": 2.0225, + "mean_token_accuracy": 0.5480011105537415, + "num_tokens": 3913561739.0, + "step": 7657 + }, + { + "epoch": 2.070849107625744, + "grad_norm": 1.2013602256774902, + "learning_rate": 1.388888068873503e-05, + "loss": 1.8855, + "mean_token_accuracy": 0.5597894191741943, + "num_tokens": 3914085940.0, + "step": 7658 + }, + { + "epoch": 2.0711195240670635, + "grad_norm": 1.3285694122314453, + "learning_rate": 1.3887387577214271e-05, + "loss": 1.8801, + "mean_token_accuracy": 0.5643705725669861, + "num_tokens": 3914610161.0, + "step": 7659 + }, + { + "epoch": 2.071389940508383, + "grad_norm": 1.4456615447998047, + "learning_rate": 1.3885894377099961e-05, + "loss": 2.0222, + "mean_token_accuracy": 0.555350124835968, + "num_tokens": 3915134373.0, + "step": 7660 + }, + { + "epoch": 2.0716603569497027, + "grad_norm": 0.4910213053226471, + "learning_rate": 1.3884401088437934e-05, + "loss": 1.0858, + "mean_token_accuracy": 0.7266967296600342, + "num_tokens": 3915658621.0, + "step": 7661 + }, + { + "epoch": 2.0719307733910224, + "grad_norm": 1.8753687143325806, + "learning_rate": 1.3882907711273996e-05, + "loss": 1.8547, + "mean_token_accuracy": 0.5728164911270142, + "num_tokens": 3916182825.0, + "step": 7662 + }, + { + "epoch": 2.0722011898323416, + "grad_norm": 1.6311386823654175, + "learning_rate": 1.388141424565397e-05, + "loss": 1.9643, + "mean_token_accuracy": 0.5401674509048462, + "num_tokens": 3916707008.0, + "step": 7663 + }, + { + "epoch": 2.0724716062736612, + "grad_norm": 1.0196000337600708, + "learning_rate": 1.3879920691623683e-05, + "loss": 1.8081, + "mean_token_accuracy": 0.5828148126602173, + "num_tokens": 3917231190.0, + "step": 7664 + }, + { + "epoch": 2.072742022714981, + "grad_norm": 1.289433240890503, + "learning_rate": 1.3878427049228962e-05, + "loss": 1.9995, + "mean_token_accuracy": 0.5347092747688293, + "num_tokens": 3917755289.0, + "step": 7665 + }, + { + "epoch": 2.0730124391563005, + "grad_norm": 1.4261393547058105, + "learning_rate": 1.3876933318515631e-05, + "loss": 1.9321, + "mean_token_accuracy": 0.5700228214263916, + "num_tokens": 3918279529.0, + "step": 7666 + }, + { + "epoch": 2.07328285559762, + "grad_norm": 1.156046986579895, + "learning_rate": 1.3875439499529523e-05, + "loss": 1.7768, + "mean_token_accuracy": 0.5838087797164917, + "num_tokens": 3918803682.0, + "step": 7667 + }, + { + "epoch": 2.07355327203894, + "grad_norm": 1.0982787609100342, + "learning_rate": 1.387394559231648e-05, + "loss": 1.8757, + "mean_token_accuracy": 0.5676990747451782, + "num_tokens": 3919327741.0, + "step": 7668 + }, + { + "epoch": 2.0738236884802594, + "grad_norm": 1.5511274337768555, + "learning_rate": 1.3872451596922332e-05, + "loss": 1.9661, + "mean_token_accuracy": 0.5535932779312134, + "num_tokens": 3919852012.0, + "step": 7669 + }, + { + "epoch": 2.074094104921579, + "grad_norm": 1.16556715965271, + "learning_rate": 1.3870957513392923e-05, + "loss": 1.8343, + "mean_token_accuracy": 0.5615146160125732, + "num_tokens": 3920352252.0, + "step": 7670 + }, + { + "epoch": 2.0743645213628987, + "grad_norm": 1.0458006858825684, + "learning_rate": 1.3869463341774091e-05, + "loss": 1.9278, + "mean_token_accuracy": 0.5650805830955505, + "num_tokens": 3920876458.0, + "step": 7671 + }, + { + "epoch": 2.0746349378042184, + "grad_norm": 1.4070881605148315, + "learning_rate": 1.3867969082111693e-05, + "loss": 1.8777, + "mean_token_accuracy": 0.574321985244751, + "num_tokens": 3921400705.0, + "step": 7672 + }, + { + "epoch": 2.074905354245538, + "grad_norm": 1.1223840713500977, + "learning_rate": 1.3866474734451563e-05, + "loss": 1.9385, + "mean_token_accuracy": 0.5667984485626221, + "num_tokens": 3921880698.0, + "step": 7673 + }, + { + "epoch": 2.0751757706868577, + "grad_norm": 1.313843846321106, + "learning_rate": 1.3864980298839558e-05, + "loss": 1.9341, + "mean_token_accuracy": 0.567163348197937, + "num_tokens": 3922404917.0, + "step": 7674 + }, + { + "epoch": 2.0754461871281773, + "grad_norm": 1.1664419174194336, + "learning_rate": 1.3863485775321535e-05, + "loss": 1.8913, + "mean_token_accuracy": 0.5770635604858398, + "num_tokens": 3922929130.0, + "step": 7675 + }, + { + "epoch": 2.075716603569497, + "grad_norm": 1.2434417009353638, + "learning_rate": 1.3861991163943345e-05, + "loss": 1.993, + "mean_token_accuracy": 0.5618865489959717, + "num_tokens": 3923453357.0, + "step": 7676 + }, + { + "epoch": 2.0759870200108166, + "grad_norm": 1.0180606842041016, + "learning_rate": 1.386049646475085e-05, + "loss": 1.9009, + "mean_token_accuracy": 0.5599392056465149, + "num_tokens": 3923977430.0, + "step": 7677 + }, + { + "epoch": 2.0762574364521362, + "grad_norm": 1.5662305355072021, + "learning_rate": 1.3859001677789913e-05, + "loss": 1.7207, + "mean_token_accuracy": 0.6147685050964355, + "num_tokens": 3924501678.0, + "step": 7678 + }, + { + "epoch": 2.076527852893456, + "grad_norm": 1.4407812356948853, + "learning_rate": 1.3857506803106395e-05, + "loss": 1.9551, + "mean_token_accuracy": 0.5747783184051514, + "num_tokens": 3924982617.0, + "step": 7679 + }, + { + "epoch": 2.0767982693347755, + "grad_norm": 1.0422025918960571, + "learning_rate": 1.3856011840746166e-05, + "loss": 1.9683, + "mean_token_accuracy": 0.5508599281311035, + "num_tokens": 3925506875.0, + "step": 7680 + }, + { + "epoch": 2.077068685776095, + "grad_norm": 0.5131911635398865, + "learning_rate": 1.3854516790755095e-05, + "loss": 1.1771, + "mean_token_accuracy": 0.69426029920578, + "num_tokens": 3926030998.0, + "step": 7681 + }, + { + "epoch": 2.077339102217415, + "grad_norm": 1.9281460046768188, + "learning_rate": 1.3853021653179052e-05, + "loss": 1.9928, + "mean_token_accuracy": 0.5533095002174377, + "num_tokens": 3926555161.0, + "step": 7682 + }, + { + "epoch": 2.0776095186587344, + "grad_norm": 1.9515637159347534, + "learning_rate": 1.3851526428063915e-05, + "loss": 1.8412, + "mean_token_accuracy": 0.5791723728179932, + "num_tokens": 3927059135.0, + "step": 7683 + }, + { + "epoch": 2.077879935100054, + "grad_norm": 1.4273475408554077, + "learning_rate": 1.3850031115455562e-05, + "loss": 1.8784, + "mean_token_accuracy": 0.5625392198562622, + "num_tokens": 3927583387.0, + "step": 7684 + }, + { + "epoch": 2.0781503515413737, + "grad_norm": 1.2807761430740356, + "learning_rate": 1.3848535715399875e-05, + "loss": 1.9173, + "mean_token_accuracy": 0.5508780479431152, + "num_tokens": 3928107634.0, + "step": 7685 + }, + { + "epoch": 2.0784207679826934, + "grad_norm": 1.2815041542053223, + "learning_rate": 1.3847040227942737e-05, + "loss": 1.9444, + "mean_token_accuracy": 0.5559443235397339, + "num_tokens": 3928631875.0, + "step": 7686 + }, + { + "epoch": 2.078691184424013, + "grad_norm": 1.67208993434906, + "learning_rate": 1.3845544653130028e-05, + "loss": 1.9737, + "mean_token_accuracy": 0.5656394362449646, + "num_tokens": 3929106042.0, + "step": 7687 + }, + { + "epoch": 2.0789616008653327, + "grad_norm": 1.3485559225082397, + "learning_rate": 1.384404899100764e-05, + "loss": 1.8698, + "mean_token_accuracy": 0.5808867812156677, + "num_tokens": 3929630315.0, + "step": 7688 + }, + { + "epoch": 2.0792320173066523, + "grad_norm": 1.0592814683914185, + "learning_rate": 1.3842553241621468e-05, + "loss": 1.9128, + "mean_token_accuracy": 0.5546181201934814, + "num_tokens": 3930154595.0, + "step": 7689 + }, + { + "epoch": 2.079502433747972, + "grad_norm": 1.378951072692871, + "learning_rate": 1.3841057405017402e-05, + "loss": 1.8983, + "mean_token_accuracy": 0.5601778030395508, + "num_tokens": 3930678793.0, + "step": 7690 + }, + { + "epoch": 2.0797728501892916, + "grad_norm": 1.2731605768203735, + "learning_rate": 1.3839561481241339e-05, + "loss": 1.9195, + "mean_token_accuracy": 0.5553194284439087, + "num_tokens": 3931195987.0, + "step": 7691 + }, + { + "epoch": 2.0800432666306112, + "grad_norm": 1.1748628616333008, + "learning_rate": 1.383806547033918e-05, + "loss": 1.9356, + "mean_token_accuracy": 0.5696491003036499, + "num_tokens": 3931720022.0, + "step": 7692 + }, + { + "epoch": 2.080313683071931, + "grad_norm": 1.0804755687713623, + "learning_rate": 1.3836569372356828e-05, + "loss": 1.8744, + "mean_token_accuracy": 0.5713038444519043, + "num_tokens": 3932244306.0, + "step": 7693 + }, + { + "epoch": 2.0805840995132505, + "grad_norm": 1.3308547735214233, + "learning_rate": 1.3835073187340183e-05, + "loss": 1.9924, + "mean_token_accuracy": 0.5442636013031006, + "num_tokens": 3932768391.0, + "step": 7694 + }, + { + "epoch": 2.08085451595457, + "grad_norm": 1.2448902130126953, + "learning_rate": 1.3833576915335157e-05, + "loss": 1.9802, + "mean_token_accuracy": 0.5697584748268127, + "num_tokens": 3933257543.0, + "step": 7695 + }, + { + "epoch": 2.08112493239589, + "grad_norm": 1.4100139141082764, + "learning_rate": 1.3832080556387655e-05, + "loss": 2.0314, + "mean_token_accuracy": 0.5483373403549194, + "num_tokens": 3933753511.0, + "step": 7696 + }, + { + "epoch": 2.0813953488372094, + "grad_norm": 1.115513801574707, + "learning_rate": 1.3830584110543597e-05, + "loss": 1.9416, + "mean_token_accuracy": 0.5658138990402222, + "num_tokens": 3934277630.0, + "step": 7697 + }, + { + "epoch": 2.081665765278529, + "grad_norm": 1.0490731000900269, + "learning_rate": 1.3829087577848894e-05, + "loss": 1.9167, + "mean_token_accuracy": 0.562608003616333, + "num_tokens": 3934794084.0, + "step": 7698 + }, + { + "epoch": 2.0819361817198487, + "grad_norm": 1.1193568706512451, + "learning_rate": 1.3827590958349462e-05, + "loss": 1.9361, + "mean_token_accuracy": 0.5587379932403564, + "num_tokens": 3935318185.0, + "step": 7699 + }, + { + "epoch": 2.0822065981611684, + "grad_norm": 1.1485041379928589, + "learning_rate": 1.3826094252091225e-05, + "loss": 2.0376, + "mean_token_accuracy": 0.5474433898925781, + "num_tokens": 3935802096.0, + "step": 7700 + }, + { + "epoch": 2.082477014602488, + "grad_norm": 0.4961792230606079, + "learning_rate": 1.3824597459120106e-05, + "loss": 1.1535, + "mean_token_accuracy": 0.6857680082321167, + "num_tokens": 3936326376.0, + "step": 7701 + }, + { + "epoch": 2.0827474310438077, + "grad_norm": 2.086944580078125, + "learning_rate": 1.3823100579482029e-05, + "loss": 1.9884, + "mean_token_accuracy": 0.5655977725982666, + "num_tokens": 3936823544.0, + "step": 7702 + }, + { + "epoch": 2.0830178474851273, + "grad_norm": 1.758107304573059, + "learning_rate": 1.3821603613222922e-05, + "loss": 1.9896, + "mean_token_accuracy": 0.5537566542625427, + "num_tokens": 3937347727.0, + "step": 7703 + }, + { + "epoch": 2.0832882639264465, + "grad_norm": 1.3666269779205322, + "learning_rate": 1.3820106560388724e-05, + "loss": 2.0292, + "mean_token_accuracy": 0.5614438056945801, + "num_tokens": 3937831973.0, + "step": 7704 + }, + { + "epoch": 2.083558680367766, + "grad_norm": 1.6571449041366577, + "learning_rate": 1.3818609421025361e-05, + "loss": 1.9495, + "mean_token_accuracy": 0.5579280853271484, + "num_tokens": 3938356250.0, + "step": 7705 + }, + { + "epoch": 2.083829096809086, + "grad_norm": 1.3263317346572876, + "learning_rate": 1.381711219517877e-05, + "loss": 1.8917, + "mean_token_accuracy": 0.5599952936172485, + "num_tokens": 3938775404.0, + "step": 7706 + }, + { + "epoch": 2.0840995132504054, + "grad_norm": 1.250435471534729, + "learning_rate": 1.3815614882894894e-05, + "loss": 1.8932, + "mean_token_accuracy": 0.5528914332389832, + "num_tokens": 3939299692.0, + "step": 7707 + }, + { + "epoch": 2.084369929691725, + "grad_norm": 1.5098681449890137, + "learning_rate": 1.3814117484219676e-05, + "loss": 1.9059, + "mean_token_accuracy": 0.5477792620658875, + "num_tokens": 3939823869.0, + "step": 7708 + }, + { + "epoch": 2.0846403461330447, + "grad_norm": 1.5009907484054565, + "learning_rate": 1.3812619999199053e-05, + "loss": 1.8304, + "mean_token_accuracy": 0.5758590698242188, + "num_tokens": 3940343769.0, + "step": 7709 + }, + { + "epoch": 2.0849107625743644, + "grad_norm": 1.634648323059082, + "learning_rate": 1.3811122427878975e-05, + "loss": 1.9268, + "mean_token_accuracy": 0.5539003610610962, + "num_tokens": 3940867983.0, + "step": 7710 + }, + { + "epoch": 2.085181179015684, + "grad_norm": 1.4422396421432495, + "learning_rate": 1.3809624770305401e-05, + "loss": 1.8276, + "mean_token_accuracy": 0.5758121013641357, + "num_tokens": 3941392202.0, + "step": 7711 + }, + { + "epoch": 2.0854515954570036, + "grad_norm": 1.308256983757019, + "learning_rate": 1.3808127026524272e-05, + "loss": 1.9951, + "mean_token_accuracy": 0.5461969375610352, + "num_tokens": 3941916391.0, + "step": 7712 + }, + { + "epoch": 2.0857220118983233, + "grad_norm": 1.3530396223068237, + "learning_rate": 1.380662919658155e-05, + "loss": 1.8466, + "mean_token_accuracy": 0.565356969833374, + "num_tokens": 3942440537.0, + "step": 7713 + }, + { + "epoch": 2.085992428339643, + "grad_norm": 1.6551011800765991, + "learning_rate": 1.380513128052319e-05, + "loss": 1.9195, + "mean_token_accuracy": 0.5692743062973022, + "num_tokens": 3942964782.0, + "step": 7714 + }, + { + "epoch": 2.0862628447809626, + "grad_norm": 1.4494541883468628, + "learning_rate": 1.3803633278395152e-05, + "loss": 1.9748, + "mean_token_accuracy": 0.5579090118408203, + "num_tokens": 3943481261.0, + "step": 7715 + }, + { + "epoch": 2.086533261222282, + "grad_norm": 1.2976073026657104, + "learning_rate": 1.38021351902434e-05, + "loss": 1.8455, + "mean_token_accuracy": 0.5716140270233154, + "num_tokens": 3944005455.0, + "step": 7716 + }, + { + "epoch": 2.086803677663602, + "grad_norm": 1.095604419708252, + "learning_rate": 1.3800637016113898e-05, + "loss": 1.7674, + "mean_token_accuracy": 0.5927696228027344, + "num_tokens": 3944529613.0, + "step": 7717 + }, + { + "epoch": 2.0870740941049215, + "grad_norm": 1.3471159934997559, + "learning_rate": 1.3799138756052622e-05, + "loss": 1.8782, + "mean_token_accuracy": 0.5681049823760986, + "num_tokens": 3945049602.0, + "step": 7718 + }, + { + "epoch": 2.087344510546241, + "grad_norm": 1.0555943250656128, + "learning_rate": 1.379764041010553e-05, + "loss": 1.9641, + "mean_token_accuracy": 0.5582990646362305, + "num_tokens": 3945573886.0, + "step": 7719 + }, + { + "epoch": 2.087614926987561, + "grad_norm": 1.4528543949127197, + "learning_rate": 1.3796141978318608e-05, + "loss": 1.9351, + "mean_token_accuracy": 0.5951266884803772, + "num_tokens": 3946098165.0, + "step": 7720 + }, + { + "epoch": 2.0878853434288804, + "grad_norm": 0.6120310425758362, + "learning_rate": 1.3794643460737826e-05, + "loss": 1.1488, + "mean_token_accuracy": 0.6996930241584778, + "num_tokens": 3946618756.0, + "step": 7721 + }, + { + "epoch": 2.0881557598702, + "grad_norm": 1.8278963565826416, + "learning_rate": 1.3793144857409164e-05, + "loss": 1.9179, + "mean_token_accuracy": 0.5515750646591187, + "num_tokens": 3947142983.0, + "step": 7722 + }, + { + "epoch": 2.0884261763115197, + "grad_norm": 1.1989600658416748, + "learning_rate": 1.3791646168378603e-05, + "loss": 1.6864, + "mean_token_accuracy": 0.602710485458374, + "num_tokens": 3947667084.0, + "step": 7723 + }, + { + "epoch": 2.0886965927528394, + "grad_norm": 1.1646839380264282, + "learning_rate": 1.3790147393692129e-05, + "loss": 1.8684, + "mean_token_accuracy": 0.5622256398200989, + "num_tokens": 3948191363.0, + "step": 7724 + }, + { + "epoch": 2.088967009194159, + "grad_norm": 4.37584114074707, + "learning_rate": 1.3788648533395727e-05, + "loss": 1.6859, + "mean_token_accuracy": 0.6088802814483643, + "num_tokens": 3948688345.0, + "step": 7725 + }, + { + "epoch": 2.0892374256354787, + "grad_norm": 2.0754270553588867, + "learning_rate": 1.3787149587535384e-05, + "loss": 2.0077, + "mean_token_accuracy": 0.5522553324699402, + "num_tokens": 3949153067.0, + "step": 7726 + }, + { + "epoch": 2.0895078420767983, + "grad_norm": 1.4693529605865479, + "learning_rate": 1.3785650556157099e-05, + "loss": 2.0151, + "mean_token_accuracy": 0.5470916032791138, + "num_tokens": 3949677264.0, + "step": 7727 + }, + { + "epoch": 2.089778258518118, + "grad_norm": 1.147879958152771, + "learning_rate": 1.3784151439306865e-05, + "loss": 1.8856, + "mean_token_accuracy": 0.5627909898757935, + "num_tokens": 3950201543.0, + "step": 7728 + }, + { + "epoch": 2.0900486749594376, + "grad_norm": 1.585992455482483, + "learning_rate": 1.3782652237030674e-05, + "loss": 1.8164, + "mean_token_accuracy": 0.5710963010787964, + "num_tokens": 3950713205.0, + "step": 7729 + }, + { + "epoch": 2.0903190914007572, + "grad_norm": 1.5597095489501953, + "learning_rate": 1.3781152949374527e-05, + "loss": 1.9751, + "mean_token_accuracy": 0.5538075566291809, + "num_tokens": 3951202326.0, + "step": 7730 + }, + { + "epoch": 2.090589507842077, + "grad_norm": 1.3558560609817505, + "learning_rate": 1.3779653576384432e-05, + "loss": 1.9356, + "mean_token_accuracy": 0.5532873868942261, + "num_tokens": 3951726504.0, + "step": 7731 + }, + { + "epoch": 2.0908599242833965, + "grad_norm": 1.3843311071395874, + "learning_rate": 1.377815411810639e-05, + "loss": 1.9224, + "mean_token_accuracy": 0.550881028175354, + "num_tokens": 3952250758.0, + "step": 7732 + }, + { + "epoch": 2.091130340724716, + "grad_norm": 1.387366533279419, + "learning_rate": 1.377665457458641e-05, + "loss": 1.8276, + "mean_token_accuracy": 0.5867054462432861, + "num_tokens": 3952700869.0, + "step": 7733 + }, + { + "epoch": 2.091400757166036, + "grad_norm": 1.174113154411316, + "learning_rate": 1.3775154945870502e-05, + "loss": 1.9328, + "mean_token_accuracy": 0.5671676397323608, + "num_tokens": 3953225103.0, + "step": 7734 + }, + { + "epoch": 2.0916711736073554, + "grad_norm": 1.1969808340072632, + "learning_rate": 1.377365523200468e-05, + "loss": 1.8476, + "mean_token_accuracy": 0.5716167688369751, + "num_tokens": 3953749352.0, + "step": 7735 + }, + { + "epoch": 2.091941590048675, + "grad_norm": 1.1668869256973267, + "learning_rate": 1.3772155433034958e-05, + "loss": 1.9495, + "mean_token_accuracy": 0.5470263957977295, + "num_tokens": 3954236029.0, + "step": 7736 + }, + { + "epoch": 2.0922120064899947, + "grad_norm": 1.1659398078918457, + "learning_rate": 1.3770655549007355e-05, + "loss": 1.9776, + "mean_token_accuracy": 0.5634108781814575, + "num_tokens": 3954760269.0, + "step": 7737 + }, + { + "epoch": 2.0924824229313144, + "grad_norm": 1.2072327136993408, + "learning_rate": 1.3769155579967888e-05, + "loss": 1.9777, + "mean_token_accuracy": 0.5530871748924255, + "num_tokens": 3955242977.0, + "step": 7738 + }, + { + "epoch": 2.092752839372634, + "grad_norm": 1.247228980064392, + "learning_rate": 1.3767655525962587e-05, + "loss": 1.9432, + "mean_token_accuracy": 0.5594582557678223, + "num_tokens": 3955767227.0, + "step": 7739 + }, + { + "epoch": 2.0930232558139537, + "grad_norm": 1.2285189628601074, + "learning_rate": 1.3766155387037474e-05, + "loss": 1.9195, + "mean_token_accuracy": 0.5668489933013916, + "num_tokens": 3956291502.0, + "step": 7740 + }, + { + "epoch": 2.0932936722552733, + "grad_norm": 0.6142182946205139, + "learning_rate": 1.3764655163238584e-05, + "loss": 1.1136, + "mean_token_accuracy": 0.6899435520172119, + "num_tokens": 3956815684.0, + "step": 7741 + }, + { + "epoch": 2.093564088696593, + "grad_norm": 1.6206343173980713, + "learning_rate": 1.376315485461194e-05, + "loss": 1.8941, + "mean_token_accuracy": 0.5666830539703369, + "num_tokens": 3957339787.0, + "step": 7742 + }, + { + "epoch": 2.0938345051379126, + "grad_norm": 1.308962345123291, + "learning_rate": 1.3761654461203578e-05, + "loss": 1.9516, + "mean_token_accuracy": 0.5442405343055725, + "num_tokens": 3957812205.0, + "step": 7743 + }, + { + "epoch": 2.0941049215792322, + "grad_norm": 1.0673891305923462, + "learning_rate": 1.376015398305954e-05, + "loss": 1.8426, + "mean_token_accuracy": 0.5748322010040283, + "num_tokens": 3958336401.0, + "step": 7744 + }, + { + "epoch": 2.0943753380205514, + "grad_norm": 1.2953972816467285, + "learning_rate": 1.3758653420225853e-05, + "loss": 1.8477, + "mean_token_accuracy": 0.5921489000320435, + "num_tokens": 3958806089.0, + "step": 7745 + }, + { + "epoch": 2.094645754461871, + "grad_norm": 1.270106315612793, + "learning_rate": 1.3757152772748569e-05, + "loss": 1.8736, + "mean_token_accuracy": 0.5765718221664429, + "num_tokens": 3959330313.0, + "step": 7746 + }, + { + "epoch": 2.0949161709031907, + "grad_norm": 1.214619517326355, + "learning_rate": 1.3755652040673734e-05, + "loss": 1.8351, + "mean_token_accuracy": 0.555959939956665, + "num_tokens": 3959836425.0, + "step": 7747 + }, + { + "epoch": 2.0951865873445104, + "grad_norm": 1.5583243370056152, + "learning_rate": 1.375415122404739e-05, + "loss": 1.9902, + "mean_token_accuracy": 0.5347723960876465, + "num_tokens": 3960360597.0, + "step": 7748 + }, + { + "epoch": 2.09545700378583, + "grad_norm": 1.4937814474105835, + "learning_rate": 1.3752650322915585e-05, + "loss": 1.9162, + "mean_token_accuracy": 0.555344820022583, + "num_tokens": 3960884788.0, + "step": 7749 + }, + { + "epoch": 2.0957274202271496, + "grad_norm": 1.082588791847229, + "learning_rate": 1.3751149337324371e-05, + "loss": 1.9314, + "mean_token_accuracy": 0.5546090602874756, + "num_tokens": 3961409030.0, + "step": 7750 + }, + { + "epoch": 2.0959978366684693, + "grad_norm": 1.5947786569595337, + "learning_rate": 1.3749648267319811e-05, + "loss": 1.9158, + "mean_token_accuracy": 0.5587524175643921, + "num_tokens": 3961933216.0, + "step": 7751 + }, + { + "epoch": 2.096268253109789, + "grad_norm": 1.3668102025985718, + "learning_rate": 1.3748147112947949e-05, + "loss": 1.9671, + "mean_token_accuracy": 0.5582062602043152, + "num_tokens": 3962441264.0, + "step": 7752 + }, + { + "epoch": 2.0965386695511086, + "grad_norm": 30.641489028930664, + "learning_rate": 1.3746645874254854e-05, + "loss": 1.8944, + "mean_token_accuracy": 0.5908316373825073, + "num_tokens": 3962919442.0, + "step": 7753 + }, + { + "epoch": 2.096809085992428, + "grad_norm": 2.0734758377075195, + "learning_rate": 1.3745144551286586e-05, + "loss": 1.8367, + "mean_token_accuracy": 0.5789270401000977, + "num_tokens": 3963443553.0, + "step": 7754 + }, + { + "epoch": 2.097079502433748, + "grad_norm": 2.1565282344818115, + "learning_rate": 1.3743643144089211e-05, + "loss": 2.0593, + "mean_token_accuracy": 0.5428861379623413, + "num_tokens": 3963950867.0, + "step": 7755 + }, + { + "epoch": 2.0973499188750675, + "grad_norm": 1.189426064491272, + "learning_rate": 1.3742141652708799e-05, + "loss": 1.9001, + "mean_token_accuracy": 0.5651595592498779, + "num_tokens": 3964459470.0, + "step": 7756 + }, + { + "epoch": 2.097620335316387, + "grad_norm": 1.371646761894226, + "learning_rate": 1.3740640077191413e-05, + "loss": 1.9571, + "mean_token_accuracy": 0.5428956747055054, + "num_tokens": 3964983729.0, + "step": 7757 + }, + { + "epoch": 2.097890751757707, + "grad_norm": 1.5220435857772827, + "learning_rate": 1.373913841758313e-05, + "loss": 1.9646, + "mean_token_accuracy": 0.5562348365783691, + "num_tokens": 3965507970.0, + "step": 7758 + }, + { + "epoch": 2.0981611681990264, + "grad_norm": 1.2785731554031372, + "learning_rate": 1.3737636673930025e-05, + "loss": 1.8968, + "mean_token_accuracy": 0.6001539826393127, + "num_tokens": 3966032236.0, + "step": 7759 + }, + { + "epoch": 2.098431584640346, + "grad_norm": 1.202939510345459, + "learning_rate": 1.373613484627818e-05, + "loss": 1.7934, + "mean_token_accuracy": 0.6022131443023682, + "num_tokens": 3966509713.0, + "step": 7760 + }, + { + "epoch": 2.0987020010816657, + "grad_norm": 0.550765872001648, + "learning_rate": 1.3734632934673666e-05, + "loss": 1.1051, + "mean_token_accuracy": 0.7087181806564331, + "num_tokens": 3967033929.0, + "step": 7761 + }, + { + "epoch": 2.0989724175229854, + "grad_norm": 1.7194576263427734, + "learning_rate": 1.3733130939162574e-05, + "loss": 1.8787, + "mean_token_accuracy": 0.5666279792785645, + "num_tokens": 3967540434.0, + "step": 7762 + }, + { + "epoch": 2.099242833964305, + "grad_norm": 1.8855319023132324, + "learning_rate": 1.3731628859790987e-05, + "loss": 1.8551, + "mean_token_accuracy": 0.5881518125534058, + "num_tokens": 3968031770.0, + "step": 7763 + }, + { + "epoch": 2.0995132504056246, + "grad_norm": 1.1859453916549683, + "learning_rate": 1.3730126696604994e-05, + "loss": 1.8791, + "mean_token_accuracy": 0.5744823217391968, + "num_tokens": 3968556006.0, + "step": 7764 + }, + { + "epoch": 2.0997836668469443, + "grad_norm": 1.311411738395691, + "learning_rate": 1.3728624449650682e-05, + "loss": 1.9421, + "mean_token_accuracy": 0.5756286978721619, + "num_tokens": 3969053968.0, + "step": 7765 + }, + { + "epoch": 2.100054083288264, + "grad_norm": 1.2912087440490723, + "learning_rate": 1.3727122118974152e-05, + "loss": 1.969, + "mean_token_accuracy": 0.5717380046844482, + "num_tokens": 3969570765.0, + "step": 7766 + }, + { + "epoch": 2.1003244997295836, + "grad_norm": 1.3446578979492188, + "learning_rate": 1.372561970462149e-05, + "loss": 1.9367, + "mean_token_accuracy": 0.563423752784729, + "num_tokens": 3970094900.0, + "step": 7767 + }, + { + "epoch": 2.100594916170903, + "grad_norm": 1.3303967714309692, + "learning_rate": 1.3724117206638805e-05, + "loss": 1.9802, + "mean_token_accuracy": 0.5707364082336426, + "num_tokens": 3970548396.0, + "step": 7768 + }, + { + "epoch": 2.100865332612223, + "grad_norm": 1.3883754014968872, + "learning_rate": 1.372261462507219e-05, + "loss": 1.9841, + "mean_token_accuracy": 0.5678038597106934, + "num_tokens": 3971019761.0, + "step": 7769 + }, + { + "epoch": 2.1011357490535425, + "grad_norm": 1.3729124069213867, + "learning_rate": 1.3721111959967757e-05, + "loss": 2.0007, + "mean_token_accuracy": 0.5438764095306396, + "num_tokens": 3971544045.0, + "step": 7770 + }, + { + "epoch": 2.101406165494862, + "grad_norm": 1.1837348937988281, + "learning_rate": 1.3719609211371601e-05, + "loss": 1.8685, + "mean_token_accuracy": 0.5644384622573853, + "num_tokens": 3972068200.0, + "step": 7771 + }, + { + "epoch": 2.101676581936182, + "grad_norm": 2.153153896331787, + "learning_rate": 1.371810637932984e-05, + "loss": 1.8282, + "mean_token_accuracy": 0.5935169458389282, + "num_tokens": 3972592468.0, + "step": 7772 + }, + { + "epoch": 2.1019469983775014, + "grad_norm": 1.6442043781280518, + "learning_rate": 1.3716603463888578e-05, + "loss": 1.9333, + "mean_token_accuracy": 0.5578895807266235, + "num_tokens": 3973077687.0, + "step": 7773 + }, + { + "epoch": 2.102217414818821, + "grad_norm": 1.175187587738037, + "learning_rate": 1.3715100465093933e-05, + "loss": 1.9127, + "mean_token_accuracy": 0.5682494640350342, + "num_tokens": 3973591270.0, + "step": 7774 + }, + { + "epoch": 2.1024878312601407, + "grad_norm": 1.1908537149429321, + "learning_rate": 1.3713597382992022e-05, + "loss": 1.9358, + "mean_token_accuracy": 0.56357342004776, + "num_tokens": 3974115490.0, + "step": 7775 + }, + { + "epoch": 2.1027582477014604, + "grad_norm": 1.2717407941818237, + "learning_rate": 1.3712094217628967e-05, + "loss": 1.8598, + "mean_token_accuracy": 0.5682163238525391, + "num_tokens": 3974639752.0, + "step": 7776 + }, + { + "epoch": 2.10302866414278, + "grad_norm": 1.0867486000061035, + "learning_rate": 1.3710590969050886e-05, + "loss": 1.9212, + "mean_token_accuracy": 0.5645542144775391, + "num_tokens": 3975163988.0, + "step": 7777 + }, + { + "epoch": 2.1032990805840996, + "grad_norm": 1.367794394493103, + "learning_rate": 1.3709087637303896e-05, + "loss": 1.8751, + "mean_token_accuracy": 0.5690726637840271, + "num_tokens": 3975688206.0, + "step": 7778 + }, + { + "epoch": 2.1035694970254193, + "grad_norm": 1.341370701789856, + "learning_rate": 1.3707584222434135e-05, + "loss": 1.9086, + "mean_token_accuracy": 0.549211859703064, + "num_tokens": 3976159923.0, + "step": 7779 + }, + { + "epoch": 2.103839913466739, + "grad_norm": 1.1214076280593872, + "learning_rate": 1.3706080724487725e-05, + "loss": 1.9285, + "mean_token_accuracy": 0.5638494491577148, + "num_tokens": 3976666824.0, + "step": 7780 + }, + { + "epoch": 2.1041103299080586, + "grad_norm": 0.4736476242542267, + "learning_rate": 1.3704577143510801e-05, + "loss": 1.1122, + "mean_token_accuracy": 0.6894822716712952, + "num_tokens": 3977191083.0, + "step": 7781 + }, + { + "epoch": 2.104380746349378, + "grad_norm": 1.6510136127471924, + "learning_rate": 1.3703073479549498e-05, + "loss": 1.8894, + "mean_token_accuracy": 0.5811178684234619, + "num_tokens": 3977715290.0, + "step": 7782 + }, + { + "epoch": 2.104651162790698, + "grad_norm": 1.453174352645874, + "learning_rate": 1.3701569732649951e-05, + "loss": 1.8665, + "mean_token_accuracy": 0.5676026344299316, + "num_tokens": 3978239340.0, + "step": 7783 + }, + { + "epoch": 2.1049215792320175, + "grad_norm": 1.0836490392684937, + "learning_rate": 1.3700065902858302e-05, + "loss": 1.9113, + "mean_token_accuracy": 0.5635491609573364, + "num_tokens": 3978763621.0, + "step": 7784 + }, + { + "epoch": 2.105191995673337, + "grad_norm": 1.2019882202148438, + "learning_rate": 1.3698561990220687e-05, + "loss": 1.9982, + "mean_token_accuracy": 0.5530800223350525, + "num_tokens": 3979287897.0, + "step": 7785 + }, + { + "epoch": 2.1054624121146563, + "grad_norm": 1.4844467639923096, + "learning_rate": 1.3697057994783256e-05, + "loss": 1.9279, + "mean_token_accuracy": 0.5678879022598267, + "num_tokens": 3979812170.0, + "step": 7786 + }, + { + "epoch": 2.105732828555976, + "grad_norm": 1.399871826171875, + "learning_rate": 1.3695553916592155e-05, + "loss": 1.8662, + "mean_token_accuracy": 0.5671939849853516, + "num_tokens": 3980313153.0, + "step": 7787 + }, + { + "epoch": 2.1060032449972956, + "grad_norm": 1.203649878501892, + "learning_rate": 1.3694049755693531e-05, + "loss": 1.8859, + "mean_token_accuracy": 0.5811871290206909, + "num_tokens": 3980837414.0, + "step": 7788 + }, + { + "epoch": 2.1062736614386153, + "grad_norm": 1.5105419158935547, + "learning_rate": 1.3692545512133538e-05, + "loss": 1.8916, + "mean_token_accuracy": 0.5669680833816528, + "num_tokens": 3981361389.0, + "step": 7789 + }, + { + "epoch": 2.106544077879935, + "grad_norm": 1.5061193704605103, + "learning_rate": 1.3691041185958329e-05, + "loss": 1.7848, + "mean_token_accuracy": 0.5872184634208679, + "num_tokens": 3981885580.0, + "step": 7790 + }, + { + "epoch": 2.1068144943212546, + "grad_norm": 1.2649904489517212, + "learning_rate": 1.3689536777214065e-05, + "loss": 1.8831, + "mean_token_accuracy": 0.5659006834030151, + "num_tokens": 3982409861.0, + "step": 7791 + }, + { + "epoch": 2.107084910762574, + "grad_norm": 1.5099658966064453, + "learning_rate": 1.3688032285946901e-05, + "loss": 1.9239, + "mean_token_accuracy": 0.5634490251541138, + "num_tokens": 3982934124.0, + "step": 7792 + }, + { + "epoch": 2.107355327203894, + "grad_norm": 0.9920626878738403, + "learning_rate": 1.3686527712203006e-05, + "loss": 1.8248, + "mean_token_accuracy": 0.575700044631958, + "num_tokens": 3983458399.0, + "step": 7793 + }, + { + "epoch": 2.1076257436452135, + "grad_norm": 1.42494797706604, + "learning_rate": 1.3685023056028534e-05, + "loss": 1.9303, + "mean_token_accuracy": 0.5774437189102173, + "num_tokens": 3983982531.0, + "step": 7794 + }, + { + "epoch": 2.107896160086533, + "grad_norm": 1.1796410083770752, + "learning_rate": 1.3683518317469661e-05, + "loss": 1.8721, + "mean_token_accuracy": 0.53944331407547, + "num_tokens": 3984506671.0, + "step": 7795 + }, + { + "epoch": 2.1081665765278528, + "grad_norm": 1.1676455736160278, + "learning_rate": 1.3682013496572555e-05, + "loss": 1.8317, + "mean_token_accuracy": 0.5798739194869995, + "num_tokens": 3985030921.0, + "step": 7796 + }, + { + "epoch": 2.1084369929691724, + "grad_norm": 1.1444545984268188, + "learning_rate": 1.3680508593383387e-05, + "loss": 1.8959, + "mean_token_accuracy": 0.5687717199325562, + "num_tokens": 3985555187.0, + "step": 7797 + }, + { + "epoch": 2.108707409410492, + "grad_norm": 1.2195245027542114, + "learning_rate": 1.3679003607948335e-05, + "loss": 1.905, + "mean_token_accuracy": 0.5693948864936829, + "num_tokens": 3986079422.0, + "step": 7798 + }, + { + "epoch": 2.1089778258518117, + "grad_norm": 1.2742184400558472, + "learning_rate": 1.3677498540313573e-05, + "loss": 2.0567, + "mean_token_accuracy": 0.5397106409072876, + "num_tokens": 3986603619.0, + "step": 7799 + }, + { + "epoch": 2.1092482422931313, + "grad_norm": 1.2901326417922974, + "learning_rate": 1.3675993390525282e-05, + "loss": 1.9732, + "mean_token_accuracy": 0.5534751415252686, + "num_tokens": 3987127883.0, + "step": 7800 + }, + { + "epoch": 2.109518658734451, + "grad_norm": 0.5521053671836853, + "learning_rate": 1.3674488158629642e-05, + "loss": 1.2672, + "mean_token_accuracy": 0.664749264717102, + "num_tokens": 3987591518.0, + "step": 7801 + }, + { + "epoch": 2.1097890751757706, + "grad_norm": 1.502942442893982, + "learning_rate": 1.3672982844672843e-05, + "loss": 1.9698, + "mean_token_accuracy": 0.5387463569641113, + "num_tokens": 3988094419.0, + "step": 7802 + }, + { + "epoch": 2.1100594916170903, + "grad_norm": 1.422374963760376, + "learning_rate": 1.3671477448701065e-05, + "loss": 1.8755, + "mean_token_accuracy": 0.5917137861251831, + "num_tokens": 3988604605.0, + "step": 7803 + }, + { + "epoch": 2.11032990805841, + "grad_norm": 1.220054268836975, + "learning_rate": 1.3669971970760505e-05, + "loss": 1.9756, + "mean_token_accuracy": 0.5644829869270325, + "num_tokens": 3989077861.0, + "step": 7804 + }, + { + "epoch": 2.1106003244997296, + "grad_norm": 1.152917742729187, + "learning_rate": 1.3668466410897355e-05, + "loss": 1.9499, + "mean_token_accuracy": 0.5499259233474731, + "num_tokens": 3989528548.0, + "step": 7805 + }, + { + "epoch": 2.110870740941049, + "grad_norm": 1.2108618021011353, + "learning_rate": 1.3666960769157807e-05, + "loss": 1.8952, + "mean_token_accuracy": 0.5411596298217773, + "num_tokens": 3990052751.0, + "step": 7806 + }, + { + "epoch": 2.111141157382369, + "grad_norm": 1.1656372547149658, + "learning_rate": 1.3665455045588059e-05, + "loss": 1.8765, + "mean_token_accuracy": 0.5886439681053162, + "num_tokens": 3990481032.0, + "step": 7807 + }, + { + "epoch": 2.1114115738236885, + "grad_norm": 1.1504508256912231, + "learning_rate": 1.3663949240234311e-05, + "loss": 2.0061, + "mean_token_accuracy": 0.5376560091972351, + "num_tokens": 3991005122.0, + "step": 7808 + }, + { + "epoch": 2.111681990265008, + "grad_norm": 1.0316643714904785, + "learning_rate": 1.3662443353142766e-05, + "loss": 1.9925, + "mean_token_accuracy": 0.544188916683197, + "num_tokens": 3991529335.0, + "step": 7809 + }, + { + "epoch": 2.111952406706328, + "grad_norm": 0.8759720325469971, + "learning_rate": 1.3660937384359629e-05, + "loss": 1.9273, + "mean_token_accuracy": 0.5680639743804932, + "num_tokens": 3992053493.0, + "step": 7810 + }, + { + "epoch": 2.1122228231476474, + "grad_norm": 0.979157030582428, + "learning_rate": 1.3659431333931108e-05, + "loss": 2.0219, + "mean_token_accuracy": 0.5361102223396301, + "num_tokens": 3992577761.0, + "step": 7811 + }, + { + "epoch": 2.112493239588967, + "grad_norm": 0.9261279106140137, + "learning_rate": 1.3657925201903412e-05, + "loss": 1.7033, + "mean_token_accuracy": 0.6155185699462891, + "num_tokens": 3993102016.0, + "step": 7812 + }, + { + "epoch": 2.1127636560302867, + "grad_norm": 1.014069676399231, + "learning_rate": 1.3656418988322759e-05, + "loss": 1.8861, + "mean_token_accuracy": 0.5753074884414673, + "num_tokens": 3993626236.0, + "step": 7813 + }, + { + "epoch": 2.1130340724716064, + "grad_norm": 1.1218088865280151, + "learning_rate": 1.3654912693235355e-05, + "loss": 2.0753, + "mean_token_accuracy": 0.5331628322601318, + "num_tokens": 3994150417.0, + "step": 7814 + }, + { + "epoch": 2.113304488912926, + "grad_norm": 1.0809600353240967, + "learning_rate": 1.3653406316687425e-05, + "loss": 1.9772, + "mean_token_accuracy": 0.5527109503746033, + "num_tokens": 3994669353.0, + "step": 7815 + }, + { + "epoch": 2.1135749053542456, + "grad_norm": 1.1047791242599487, + "learning_rate": 1.3651899858725185e-05, + "loss": 1.9197, + "mean_token_accuracy": 0.5706145167350769, + "num_tokens": 3995144043.0, + "step": 7816 + }, + { + "epoch": 2.1138453217955653, + "grad_norm": 1.201422929763794, + "learning_rate": 1.3650393319394857e-05, + "loss": 1.9342, + "mean_token_accuracy": 0.5496747493743896, + "num_tokens": 3995668234.0, + "step": 7817 + }, + { + "epoch": 2.114115738236885, + "grad_norm": 1.3337587118148804, + "learning_rate": 1.3648886698742666e-05, + "loss": 1.8214, + "mean_token_accuracy": 0.580765962600708, + "num_tokens": 3996192348.0, + "step": 7818 + }, + { + "epoch": 2.1143861546782046, + "grad_norm": 1.060490369796753, + "learning_rate": 1.3647379996814844e-05, + "loss": 1.6988, + "mean_token_accuracy": 0.6048134565353394, + "num_tokens": 3996716520.0, + "step": 7819 + }, + { + "epoch": 2.114656571119524, + "grad_norm": 0.9918392896652222, + "learning_rate": 1.364587321365762e-05, + "loss": 1.9793, + "mean_token_accuracy": 0.5458837747573853, + "num_tokens": 3997240696.0, + "step": 7820 + }, + { + "epoch": 2.114926987560844, + "grad_norm": 0.6215873956680298, + "learning_rate": 1.3644366349317222e-05, + "loss": 1.1757, + "mean_token_accuracy": 0.6720359325408936, + "num_tokens": 3997764903.0, + "step": 7821 + }, + { + "epoch": 2.1151974040021635, + "grad_norm": 1.6291745901107788, + "learning_rate": 1.364285940383989e-05, + "loss": 1.9808, + "mean_token_accuracy": 0.539718508720398, + "num_tokens": 3998289139.0, + "step": 7822 + }, + { + "epoch": 2.115467820443483, + "grad_norm": 1.1377347707748413, + "learning_rate": 1.3641352377271858e-05, + "loss": 1.7015, + "mean_token_accuracy": 0.6159452199935913, + "num_tokens": 3998813313.0, + "step": 7823 + }, + { + "epoch": 2.115738236884803, + "grad_norm": 1.2319492101669312, + "learning_rate": 1.3639845269659367e-05, + "loss": 1.8506, + "mean_token_accuracy": 0.5756120085716248, + "num_tokens": 3999337556.0, + "step": 7824 + }, + { + "epoch": 2.1160086533261224, + "grad_norm": 1.13751220703125, + "learning_rate": 1.363833808104866e-05, + "loss": 1.9898, + "mean_token_accuracy": 0.529026210308075, + "num_tokens": 3999861795.0, + "step": 7825 + }, + { + "epoch": 2.116279069767442, + "grad_norm": 1.2401156425476074, + "learning_rate": 1.3636830811485981e-05, + "loss": 1.9244, + "mean_token_accuracy": 0.5619788765907288, + "num_tokens": 4000385978.0, + "step": 7826 + }, + { + "epoch": 2.1165494862087613, + "grad_norm": 1.2620805501937866, + "learning_rate": 1.363532346101758e-05, + "loss": 1.981, + "mean_token_accuracy": 0.5626541376113892, + "num_tokens": 4000879114.0, + "step": 7827 + }, + { + "epoch": 2.1168199026500814, + "grad_norm": 1.1116520166397095, + "learning_rate": 1.3633816029689706e-05, + "loss": 2.0278, + "mean_token_accuracy": 0.5532416701316833, + "num_tokens": 4001403327.0, + "step": 7828 + }, + { + "epoch": 2.1170903190914006, + "grad_norm": 1.352187991142273, + "learning_rate": 1.363230851754861e-05, + "loss": 1.861, + "mean_token_accuracy": 0.5911168456077576, + "num_tokens": 4001927518.0, + "step": 7829 + }, + { + "epoch": 2.11736073553272, + "grad_norm": 1.0302538871765137, + "learning_rate": 1.3630800924640544e-05, + "loss": 1.8518, + "mean_token_accuracy": 0.5760418176651001, + "num_tokens": 4002451797.0, + "step": 7830 + }, + { + "epoch": 2.11763115197404, + "grad_norm": 1.1733217239379883, + "learning_rate": 1.3629293251011772e-05, + "loss": 1.8734, + "mean_token_accuracy": 0.5680713653564453, + "num_tokens": 4002976018.0, + "step": 7831 + }, + { + "epoch": 2.1179015684153595, + "grad_norm": 1.089844822883606, + "learning_rate": 1.3627785496708549e-05, + "loss": 2.0067, + "mean_token_accuracy": 0.5422801971435547, + "num_tokens": 4003500187.0, + "step": 7832 + }, + { + "epoch": 2.118171984856679, + "grad_norm": 1.1775723695755005, + "learning_rate": 1.3626277661777144e-05, + "loss": 1.9244, + "mean_token_accuracy": 0.5641610622406006, + "num_tokens": 4004024293.0, + "step": 7833 + }, + { + "epoch": 2.1184424012979988, + "grad_norm": 1.2664190530776978, + "learning_rate": 1.3624769746263813e-05, + "loss": 1.9992, + "mean_token_accuracy": 0.5466177463531494, + "num_tokens": 4004548470.0, + "step": 7834 + }, + { + "epoch": 2.1187128177393184, + "grad_norm": 1.453779697418213, + "learning_rate": 1.3623261750214827e-05, + "loss": 1.9229, + "mean_token_accuracy": 0.5508779287338257, + "num_tokens": 4005072721.0, + "step": 7835 + }, + { + "epoch": 2.118983234180638, + "grad_norm": 1.3325804471969604, + "learning_rate": 1.3621753673676457e-05, + "loss": 1.9962, + "mean_token_accuracy": 0.5732454061508179, + "num_tokens": 4005495578.0, + "step": 7836 + }, + { + "epoch": 2.1192536506219577, + "grad_norm": 1.2282863855361938, + "learning_rate": 1.3620245516694974e-05, + "loss": 1.8999, + "mean_token_accuracy": 0.5730059146881104, + "num_tokens": 4006019830.0, + "step": 7837 + }, + { + "epoch": 2.1195240670632773, + "grad_norm": 1.3194383382797241, + "learning_rate": 1.3618737279316651e-05, + "loss": 1.9472, + "mean_token_accuracy": 0.5701619386672974, + "num_tokens": 4006544096.0, + "step": 7838 + }, + { + "epoch": 2.119794483504597, + "grad_norm": 1.096025824546814, + "learning_rate": 1.3617228961587772e-05, + "loss": 1.9652, + "mean_token_accuracy": 0.561628520488739, + "num_tokens": 4007068370.0, + "step": 7839 + }, + { + "epoch": 2.1200648999459166, + "grad_norm": 1.1778525114059448, + "learning_rate": 1.3615720563554608e-05, + "loss": 1.7597, + "mean_token_accuracy": 0.5904943346977234, + "num_tokens": 4007570983.0, + "step": 7840 + }, + { + "epoch": 2.1203353163872363, + "grad_norm": 0.6593084335327148, + "learning_rate": 1.3614212085263445e-05, + "loss": 1.0928, + "mean_token_accuracy": 0.7077604532241821, + "num_tokens": 4008095028.0, + "step": 7841 + }, + { + "epoch": 2.120605732828556, + "grad_norm": 1.3303906917572021, + "learning_rate": 1.3612703526760572e-05, + "loss": 1.8211, + "mean_token_accuracy": 0.5699315071105957, + "num_tokens": 4008619286.0, + "step": 7842 + }, + { + "epoch": 2.1208761492698756, + "grad_norm": 1.6334936618804932, + "learning_rate": 1.3611194888092266e-05, + "loss": 2.0935, + "mean_token_accuracy": 0.5343664884567261, + "num_tokens": 4009143512.0, + "step": 7843 + }, + { + "epoch": 2.121146565711195, + "grad_norm": 1.0325554609298706, + "learning_rate": 1.3609686169304823e-05, + "loss": 1.8825, + "mean_token_accuracy": 0.5538796186447144, + "num_tokens": 4009644623.0, + "step": 7844 + }, + { + "epoch": 2.121416982152515, + "grad_norm": 1.3139644861221313, + "learning_rate": 1.3608177370444532e-05, + "loss": 1.9654, + "mean_token_accuracy": 0.5596269369125366, + "num_tokens": 4010168825.0, + "step": 7845 + }, + { + "epoch": 2.1216873985938345, + "grad_norm": 1.035709261894226, + "learning_rate": 1.360666849155769e-05, + "loss": 1.8604, + "mean_token_accuracy": 0.563616931438446, + "num_tokens": 4010693101.0, + "step": 7846 + }, + { + "epoch": 2.121957815035154, + "grad_norm": 0.9899608492851257, + "learning_rate": 1.3605159532690595e-05, + "loss": 1.7703, + "mean_token_accuracy": 0.5904432535171509, + "num_tokens": 4011174955.0, + "step": 7847 + }, + { + "epoch": 2.1222282314764738, + "grad_norm": 1.0442012548446655, + "learning_rate": 1.3603650493889543e-05, + "loss": 1.8457, + "mean_token_accuracy": 0.5824175477027893, + "num_tokens": 4011699140.0, + "step": 7848 + }, + { + "epoch": 2.1224986479177934, + "grad_norm": 1.0141746997833252, + "learning_rate": 1.3602141375200837e-05, + "loss": 1.9429, + "mean_token_accuracy": 0.5507904291152954, + "num_tokens": 4012223337.0, + "step": 7849 + }, + { + "epoch": 2.122769064359113, + "grad_norm": 1.1581602096557617, + "learning_rate": 1.360063217667078e-05, + "loss": 1.9362, + "mean_token_accuracy": 0.564639687538147, + "num_tokens": 4012747579.0, + "step": 7850 + }, + { + "epoch": 2.1230394808004327, + "grad_norm": 1.2312639951705933, + "learning_rate": 1.3599122898345683e-05, + "loss": 2.0405, + "mean_token_accuracy": 0.5318196415901184, + "num_tokens": 4013271681.0, + "step": 7851 + }, + { + "epoch": 2.1233098972417523, + "grad_norm": 1.164852261543274, + "learning_rate": 1.3597613540271848e-05, + "loss": 1.9204, + "mean_token_accuracy": 0.5710274577140808, + "num_tokens": 4013795860.0, + "step": 7852 + }, + { + "epoch": 2.123580313683072, + "grad_norm": 1.2452071905136108, + "learning_rate": 1.3596104102495594e-05, + "loss": 1.9766, + "mean_token_accuracy": 0.5607090592384338, + "num_tokens": 4014320137.0, + "step": 7853 + }, + { + "epoch": 2.1238507301243916, + "grad_norm": 1.1680433750152588, + "learning_rate": 1.359459458506323e-05, + "loss": 2.0359, + "mean_token_accuracy": 0.5496360063552856, + "num_tokens": 4014844394.0, + "step": 7854 + }, + { + "epoch": 2.1241211465657113, + "grad_norm": 1.1791224479675293, + "learning_rate": 1.3593084988021073e-05, + "loss": 1.8957, + "mean_token_accuracy": 0.567772626876831, + "num_tokens": 4015368566.0, + "step": 7855 + }, + { + "epoch": 2.124391563007031, + "grad_norm": 1.521668791770935, + "learning_rate": 1.3591575311415443e-05, + "loss": 2.0975, + "mean_token_accuracy": 0.5327249765396118, + "num_tokens": 4015892849.0, + "step": 7856 + }, + { + "epoch": 2.1246619794483506, + "grad_norm": 1.2962878942489624, + "learning_rate": 1.3590065555292664e-05, + "loss": 2.0105, + "mean_token_accuracy": 0.5454656481742859, + "num_tokens": 4016417033.0, + "step": 7857 + }, + { + "epoch": 2.12493239588967, + "grad_norm": 1.3734428882598877, + "learning_rate": 1.3588555719699054e-05, + "loss": 1.8963, + "mean_token_accuracy": 0.5756524205207825, + "num_tokens": 4016842060.0, + "step": 7858 + }, + { + "epoch": 2.12520281233099, + "grad_norm": 1.1328312158584595, + "learning_rate": 1.3587045804680943e-05, + "loss": 2.0176, + "mean_token_accuracy": 0.5353078246116638, + "num_tokens": 4017366315.0, + "step": 7859 + }, + { + "epoch": 2.1254732287723095, + "grad_norm": 1.1635240316390991, + "learning_rate": 1.3585535810284658e-05, + "loss": 1.9608, + "mean_token_accuracy": 0.5586984753608704, + "num_tokens": 4017842859.0, + "step": 7860 + }, + { + "epoch": 2.125743645213629, + "grad_norm": 0.4337039589881897, + "learning_rate": 1.358402573655653e-05, + "loss": 1.1256, + "mean_token_accuracy": 0.6917062997817993, + "num_tokens": 4018346255.0, + "step": 7861 + }, + { + "epoch": 2.1260140616549488, + "grad_norm": 1.9669848680496216, + "learning_rate": 1.3582515583542897e-05, + "loss": 1.9467, + "mean_token_accuracy": 0.5707974433898926, + "num_tokens": 4018870482.0, + "step": 7862 + }, + { + "epoch": 2.1262844780962684, + "grad_norm": 1.8656504154205322, + "learning_rate": 1.358100535129009e-05, + "loss": 1.9045, + "mean_token_accuracy": 0.5686602592468262, + "num_tokens": 4019394661.0, + "step": 7863 + }, + { + "epoch": 2.126554894537588, + "grad_norm": 1.2579385042190552, + "learning_rate": 1.3579495039844447e-05, + "loss": 1.9279, + "mean_token_accuracy": 0.5498968958854675, + "num_tokens": 4019918806.0, + "step": 7864 + }, + { + "epoch": 2.1268253109789077, + "grad_norm": 1.2031182050704956, + "learning_rate": 1.3577984649252312e-05, + "loss": 1.8774, + "mean_token_accuracy": 0.5693763494491577, + "num_tokens": 4020443010.0, + "step": 7865 + }, + { + "epoch": 2.1270957274202273, + "grad_norm": 1.375484824180603, + "learning_rate": 1.3576474179560024e-05, + "loss": 1.8508, + "mean_token_accuracy": 0.5670091509819031, + "num_tokens": 4020967128.0, + "step": 7866 + }, + { + "epoch": 2.127366143861547, + "grad_norm": 1.362136960029602, + "learning_rate": 1.3574963630813932e-05, + "loss": 1.9936, + "mean_token_accuracy": 0.5425662994384766, + "num_tokens": 4021491298.0, + "step": 7867 + }, + { + "epoch": 2.127636560302866, + "grad_norm": 1.3020894527435303, + "learning_rate": 1.3573453003060382e-05, + "loss": 1.956, + "mean_token_accuracy": 0.566146731376648, + "num_tokens": 4021978324.0, + "step": 7868 + }, + { + "epoch": 2.1279069767441863, + "grad_norm": 1.196092128753662, + "learning_rate": 1.357194229634573e-05, + "loss": 1.9907, + "mean_token_accuracy": 0.5696051120758057, + "num_tokens": 4022483741.0, + "step": 7869 + }, + { + "epoch": 2.1281773931855055, + "grad_norm": 1.0858759880065918, + "learning_rate": 1.3570431510716321e-05, + "loss": 1.8995, + "mean_token_accuracy": 0.577178418636322, + "num_tokens": 4022931736.0, + "step": 7870 + }, + { + "epoch": 2.128447809626825, + "grad_norm": 1.0972659587860107, + "learning_rate": 1.3568920646218518e-05, + "loss": 1.8783, + "mean_token_accuracy": 0.5736473798751831, + "num_tokens": 4023456013.0, + "step": 7871 + }, + { + "epoch": 2.1287182260681448, + "grad_norm": 1.1794993877410889, + "learning_rate": 1.3567409702898675e-05, + "loss": 1.9029, + "mean_token_accuracy": 0.5711929798126221, + "num_tokens": 4023980259.0, + "step": 7872 + }, + { + "epoch": 2.1289886425094644, + "grad_norm": 1.1195945739746094, + "learning_rate": 1.3565898680803146e-05, + "loss": 1.9751, + "mean_token_accuracy": 0.5515146851539612, + "num_tokens": 4024444686.0, + "step": 7873 + }, + { + "epoch": 2.129259058950784, + "grad_norm": 1.1030550003051758, + "learning_rate": 1.3564387579978304e-05, + "loss": 1.8676, + "mean_token_accuracy": 0.5798789262771606, + "num_tokens": 4024968795.0, + "step": 7874 + }, + { + "epoch": 2.1295294753921037, + "grad_norm": 1.2282415628433228, + "learning_rate": 1.3562876400470511e-05, + "loss": 1.8611, + "mean_token_accuracy": 0.5799340009689331, + "num_tokens": 4025492973.0, + "step": 7875 + }, + { + "epoch": 2.1297998918334233, + "grad_norm": 1.1581459045410156, + "learning_rate": 1.3561365142326129e-05, + "loss": 1.9129, + "mean_token_accuracy": 0.5598902702331543, + "num_tokens": 4026017137.0, + "step": 7876 + }, + { + "epoch": 2.130070308274743, + "grad_norm": 1.136411190032959, + "learning_rate": 1.3559853805591534e-05, + "loss": 1.9575, + "mean_token_accuracy": 0.5530985593795776, + "num_tokens": 4026541314.0, + "step": 7877 + }, + { + "epoch": 2.1303407247160626, + "grad_norm": 1.0393450260162354, + "learning_rate": 1.3558342390313096e-05, + "loss": 1.8974, + "mean_token_accuracy": 0.5692770481109619, + "num_tokens": 4027040430.0, + "step": 7878 + }, + { + "epoch": 2.1306111411573823, + "grad_norm": 1.3186688423156738, + "learning_rate": 1.3556830896537192e-05, + "loss": 1.835, + "mean_token_accuracy": 0.5579742193222046, + "num_tokens": 4027564662.0, + "step": 7879 + }, + { + "epoch": 2.130881557598702, + "grad_norm": 1.1861908435821533, + "learning_rate": 1.3555319324310192e-05, + "loss": 1.8486, + "mean_token_accuracy": 0.5723854303359985, + "num_tokens": 4028047349.0, + "step": 7880 + }, + { + "epoch": 2.1311519740400215, + "grad_norm": 0.5885469317436218, + "learning_rate": 1.3553807673678483e-05, + "loss": 1.1109, + "mean_token_accuracy": 0.7040741443634033, + "num_tokens": 4028498486.0, + "step": 7881 + }, + { + "epoch": 2.131422390481341, + "grad_norm": 1.5510096549987793, + "learning_rate": 1.355229594468845e-05, + "loss": 1.8342, + "mean_token_accuracy": 0.5605970621109009, + "num_tokens": 4029022677.0, + "step": 7882 + }, + { + "epoch": 2.131692806922661, + "grad_norm": 1.563254475593567, + "learning_rate": 1.3550784137386464e-05, + "loss": 1.8586, + "mean_token_accuracy": 0.558495283126831, + "num_tokens": 4029546803.0, + "step": 7883 + }, + { + "epoch": 2.1319632233639805, + "grad_norm": 1.1389051675796509, + "learning_rate": 1.3549272251818922e-05, + "loss": 1.8862, + "mean_token_accuracy": 0.5830255746841431, + "num_tokens": 4030015340.0, + "step": 7884 + }, + { + "epoch": 2.1322336398053, + "grad_norm": 1.296831488609314, + "learning_rate": 1.3547760288032212e-05, + "loss": 1.9627, + "mean_token_accuracy": 0.5463591814041138, + "num_tokens": 4030476604.0, + "step": 7885 + }, + { + "epoch": 2.1325040562466198, + "grad_norm": 1.2884554862976074, + "learning_rate": 1.354624824607272e-05, + "loss": 1.896, + "mean_token_accuracy": 0.5754372477531433, + "num_tokens": 4031000882.0, + "step": 7886 + }, + { + "epoch": 2.1327744726879394, + "grad_norm": 1.0903576612472534, + "learning_rate": 1.3544736125986846e-05, + "loss": 1.9462, + "mean_token_accuracy": 0.5510503053665161, + "num_tokens": 4031525130.0, + "step": 7887 + }, + { + "epoch": 2.133044889129259, + "grad_norm": 1.1833208799362183, + "learning_rate": 1.3543223927820984e-05, + "loss": 1.9553, + "mean_token_accuracy": 0.5584594011306763, + "num_tokens": 4032049250.0, + "step": 7888 + }, + { + "epoch": 2.1333153055705787, + "grad_norm": 1.1650338172912598, + "learning_rate": 1.3541711651621531e-05, + "loss": 1.7452, + "mean_token_accuracy": 0.5931599140167236, + "num_tokens": 4032573400.0, + "step": 7889 + }, + { + "epoch": 2.1335857220118983, + "grad_norm": 1.1855131387710571, + "learning_rate": 1.3540199297434891e-05, + "loss": 1.9676, + "mean_token_accuracy": 0.5441267490386963, + "num_tokens": 4033097678.0, + "step": 7890 + }, + { + "epoch": 2.133856138453218, + "grad_norm": 1.2177176475524902, + "learning_rate": 1.353868686530747e-05, + "loss": 1.8985, + "mean_token_accuracy": 0.5717664957046509, + "num_tokens": 4033621839.0, + "step": 7891 + }, + { + "epoch": 2.1341265548945376, + "grad_norm": 1.2868869304656982, + "learning_rate": 1.3537174355285664e-05, + "loss": 1.8475, + "mean_token_accuracy": 0.592711329460144, + "num_tokens": 4034078568.0, + "step": 7892 + }, + { + "epoch": 2.1343969713358573, + "grad_norm": 1.317107915878296, + "learning_rate": 1.353566176741589e-05, + "loss": 1.9771, + "mean_token_accuracy": 0.5847630500793457, + "num_tokens": 4034602844.0, + "step": 7893 + }, + { + "epoch": 2.134667387777177, + "grad_norm": 1.079804539680481, + "learning_rate": 1.3534149101744558e-05, + "loss": 1.9494, + "mean_token_accuracy": 0.5697808265686035, + "num_tokens": 4035092524.0, + "step": 7894 + }, + { + "epoch": 2.1349378042184965, + "grad_norm": 1.1671037673950195, + "learning_rate": 1.353263635831808e-05, + "loss": 1.9188, + "mean_token_accuracy": 0.5665860176086426, + "num_tokens": 4035559659.0, + "step": 7895 + }, + { + "epoch": 2.135208220659816, + "grad_norm": 1.36293625831604, + "learning_rate": 1.3531123537182863e-05, + "loss": 1.9187, + "mean_token_accuracy": 0.576933741569519, + "num_tokens": 4036083901.0, + "step": 7896 + }, + { + "epoch": 2.135478637101136, + "grad_norm": 0.9645471572875977, + "learning_rate": 1.352961063838534e-05, + "loss": 1.8905, + "mean_token_accuracy": 0.565413236618042, + "num_tokens": 4036608175.0, + "step": 7897 + }, + { + "epoch": 2.1357490535424555, + "grad_norm": 1.1629713773727417, + "learning_rate": 1.3528097661971922e-05, + "loss": 1.9111, + "mean_token_accuracy": 0.5508247017860413, + "num_tokens": 4037132419.0, + "step": 7898 + }, + { + "epoch": 2.136019469983775, + "grad_norm": 1.4336928129196167, + "learning_rate": 1.3526584607989031e-05, + "loss": 1.9779, + "mean_token_accuracy": 0.5399559736251831, + "num_tokens": 4037656632.0, + "step": 7899 + }, + { + "epoch": 2.1362898864250948, + "grad_norm": 0.9735460877418518, + "learning_rate": 1.3525071476483093e-05, + "loss": 1.8903, + "mean_token_accuracy": 0.5582069158554077, + "num_tokens": 4038180648.0, + "step": 7900 + }, + { + "epoch": 2.1365603028664144, + "grad_norm": 0.6845473647117615, + "learning_rate": 1.352355826750054e-05, + "loss": 1.0863, + "mean_token_accuracy": 0.7132472991943359, + "num_tokens": 4038704421.0, + "step": 7901 + }, + { + "epoch": 2.136830719307734, + "grad_norm": 1.4448144435882568, + "learning_rate": 1.3522044981087796e-05, + "loss": 1.9883, + "mean_token_accuracy": 0.5696869492530823, + "num_tokens": 4039180296.0, + "step": 7902 + }, + { + "epoch": 2.1371011357490537, + "grad_norm": 1.4212994575500488, + "learning_rate": 1.3520531617291295e-05, + "loss": 1.891, + "mean_token_accuracy": 0.5549321174621582, + "num_tokens": 4039704562.0, + "step": 7903 + }, + { + "epoch": 2.1373715521903733, + "grad_norm": 1.3186900615692139, + "learning_rate": 1.3519018176157472e-05, + "loss": 1.927, + "mean_token_accuracy": 0.5689263343811035, + "num_tokens": 4040171122.0, + "step": 7904 + }, + { + "epoch": 2.137641968631693, + "grad_norm": 1.5552867650985718, + "learning_rate": 1.3517504657732767e-05, + "loss": 1.868, + "mean_token_accuracy": 0.5865925550460815, + "num_tokens": 4040662261.0, + "step": 7905 + }, + { + "epoch": 2.1379123850730126, + "grad_norm": 1.2779585123062134, + "learning_rate": 1.351599106206361e-05, + "loss": 1.841, + "mean_token_accuracy": 0.5679535865783691, + "num_tokens": 4041186347.0, + "step": 7906 + }, + { + "epoch": 2.1381828015143323, + "grad_norm": 1.1001667976379395, + "learning_rate": 1.351447738919645e-05, + "loss": 1.8621, + "mean_token_accuracy": 0.5677656531333923, + "num_tokens": 4041710605.0, + "step": 7907 + }, + { + "epoch": 2.138453217955652, + "grad_norm": 1.2533589601516724, + "learning_rate": 1.3512963639177728e-05, + "loss": 1.9583, + "mean_token_accuracy": 0.5593881607055664, + "num_tokens": 4042234767.0, + "step": 7908 + }, + { + "epoch": 2.138723634396971, + "grad_norm": 1.2987064123153687, + "learning_rate": 1.3511449812053893e-05, + "loss": 1.9116, + "mean_token_accuracy": 0.5728791356086731, + "num_tokens": 4042758950.0, + "step": 7909 + }, + { + "epoch": 2.138994050838291, + "grad_norm": 1.5644701719284058, + "learning_rate": 1.3509935907871394e-05, + "loss": 1.9661, + "mean_token_accuracy": 0.5457432270050049, + "num_tokens": 4043223774.0, + "step": 7910 + }, + { + "epoch": 2.1392644672796104, + "grad_norm": 1.371524453163147, + "learning_rate": 1.3508421926676679e-05, + "loss": 2.0282, + "mean_token_accuracy": 0.5304592251777649, + "num_tokens": 4043748038.0, + "step": 7911 + }, + { + "epoch": 2.13953488372093, + "grad_norm": 1.3045684099197388, + "learning_rate": 1.3506907868516198e-05, + "loss": 1.9937, + "mean_token_accuracy": 0.560356616973877, + "num_tokens": 4044272211.0, + "step": 7912 + }, + { + "epoch": 2.1398053001622497, + "grad_norm": 1.6551405191421509, + "learning_rate": 1.3505393733436419e-05, + "loss": 1.7875, + "mean_token_accuracy": 0.5881695747375488, + "num_tokens": 4044796306.0, + "step": 7913 + }, + { + "epoch": 2.1400757166035693, + "grad_norm": 1.5803974866867065, + "learning_rate": 1.3503879521483786e-05, + "loss": 1.9026, + "mean_token_accuracy": 0.5525611042976379, + "num_tokens": 4045320491.0, + "step": 7914 + }, + { + "epoch": 2.140346133044889, + "grad_norm": 1.3256558179855347, + "learning_rate": 1.3502365232704766e-05, + "loss": 1.9214, + "mean_token_accuracy": 0.5442632436752319, + "num_tokens": 4045844672.0, + "step": 7915 + }, + { + "epoch": 2.1406165494862086, + "grad_norm": 1.0290095806121826, + "learning_rate": 1.3500850867145826e-05, + "loss": 1.809, + "mean_token_accuracy": 0.5749816298484802, + "num_tokens": 4046368921.0, + "step": 7916 + }, + { + "epoch": 2.1408869659275283, + "grad_norm": 1.5444265604019165, + "learning_rate": 1.3499336424853422e-05, + "loss": 1.9766, + "mean_token_accuracy": 0.5288397073745728, + "num_tokens": 4046846198.0, + "step": 7917 + }, + { + "epoch": 2.141157382368848, + "grad_norm": 1.2483201026916504, + "learning_rate": 1.3497821905874033e-05, + "loss": 1.82, + "mean_token_accuracy": 0.5654720067977905, + "num_tokens": 4047370482.0, + "step": 7918 + }, + { + "epoch": 2.1414277988101675, + "grad_norm": 1.2502845525741577, + "learning_rate": 1.3496307310254115e-05, + "loss": 1.8768, + "mean_token_accuracy": 0.5569360852241516, + "num_tokens": 4047894648.0, + "step": 7919 + }, + { + "epoch": 2.141698215251487, + "grad_norm": 1.2947242259979248, + "learning_rate": 1.3494792638040152e-05, + "loss": 1.9743, + "mean_token_accuracy": 0.5440608263015747, + "num_tokens": 4048418876.0, + "step": 7920 + }, + { + "epoch": 2.141968631692807, + "grad_norm": 0.5462651252746582, + "learning_rate": 1.3493277889278616e-05, + "loss": 1.1072, + "mean_token_accuracy": 0.6999301910400391, + "num_tokens": 4048943108.0, + "step": 7921 + }, + { + "epoch": 2.1422390481341265, + "grad_norm": 1.4168639183044434, + "learning_rate": 1.3491763064015978e-05, + "loss": 1.9541, + "mean_token_accuracy": 0.55224609375, + "num_tokens": 4049443746.0, + "step": 7922 + }, + { + "epoch": 2.142509464575446, + "grad_norm": 1.3593186140060425, + "learning_rate": 1.349024816229872e-05, + "loss": 1.9908, + "mean_token_accuracy": 0.5509046316146851, + "num_tokens": 4049967958.0, + "step": 7923 + }, + { + "epoch": 2.1427798810167658, + "grad_norm": 0.9791920781135559, + "learning_rate": 1.3488733184173329e-05, + "loss": 2.0392, + "mean_token_accuracy": 0.5440245866775513, + "num_tokens": 4050492214.0, + "step": 7924 + }, + { + "epoch": 2.1430502974580854, + "grad_norm": 1.0741617679595947, + "learning_rate": 1.3487218129686283e-05, + "loss": 1.8119, + "mean_token_accuracy": 0.6035653948783875, + "num_tokens": 4050952641.0, + "step": 7925 + }, + { + "epoch": 2.143320713899405, + "grad_norm": 1.063751459121704, + "learning_rate": 1.3485702998884072e-05, + "loss": 1.9848, + "mean_token_accuracy": 0.5524991750717163, + "num_tokens": 4051476891.0, + "step": 7926 + }, + { + "epoch": 2.1435911303407247, + "grad_norm": 1.0731096267700195, + "learning_rate": 1.3484187791813182e-05, + "loss": 1.9958, + "mean_token_accuracy": 0.5499054193496704, + "num_tokens": 4052001051.0, + "step": 7927 + }, + { + "epoch": 2.1438615467820443, + "grad_norm": 0.9836122989654541, + "learning_rate": 1.3482672508520107e-05, + "loss": 1.9504, + "mean_token_accuracy": 0.5464056730270386, + "num_tokens": 4052525319.0, + "step": 7928 + }, + { + "epoch": 2.144131963223364, + "grad_norm": 1.1933876276016235, + "learning_rate": 1.3481157149051334e-05, + "loss": 1.8763, + "mean_token_accuracy": 0.5595134496688843, + "num_tokens": 4053049509.0, + "step": 7929 + }, + { + "epoch": 2.1444023796646836, + "grad_norm": 1.2511320114135742, + "learning_rate": 1.3479641713453364e-05, + "loss": 1.9533, + "mean_token_accuracy": 0.5630204677581787, + "num_tokens": 4053573731.0, + "step": 7930 + }, + { + "epoch": 2.1446727961060033, + "grad_norm": 1.1400012969970703, + "learning_rate": 1.3478126201772696e-05, + "loss": 1.9777, + "mean_token_accuracy": 0.5496072173118591, + "num_tokens": 4054097993.0, + "step": 7931 + }, + { + "epoch": 2.144943212547323, + "grad_norm": 1.03152596950531, + "learning_rate": 1.347661061405583e-05, + "loss": 1.87, + "mean_token_accuracy": 0.5571975111961365, + "num_tokens": 4054622122.0, + "step": 7932 + }, + { + "epoch": 2.1452136289886425, + "grad_norm": 1.135255217552185, + "learning_rate": 1.3475094950349264e-05, + "loss": 1.9879, + "mean_token_accuracy": 0.5500079393386841, + "num_tokens": 4055118373.0, + "step": 7933 + }, + { + "epoch": 2.145484045429962, + "grad_norm": 1.27588951587677, + "learning_rate": 1.3473579210699508e-05, + "loss": 1.9857, + "mean_token_accuracy": 0.5480414628982544, + "num_tokens": 4055642594.0, + "step": 7934 + }, + { + "epoch": 2.145754461871282, + "grad_norm": 1.3473899364471436, + "learning_rate": 1.3472063395153065e-05, + "loss": 2.0096, + "mean_token_accuracy": 0.5463886260986328, + "num_tokens": 4056166847.0, + "step": 7935 + }, + { + "epoch": 2.1460248783126015, + "grad_norm": 0.9568450450897217, + "learning_rate": 1.3470547503756448e-05, + "loss": 1.7302, + "mean_token_accuracy": 0.5876384973526001, + "num_tokens": 4056691097.0, + "step": 7936 + }, + { + "epoch": 2.146295294753921, + "grad_norm": 1.0611073970794678, + "learning_rate": 1.346903153655617e-05, + "loss": 1.9084, + "mean_token_accuracy": 0.5657138824462891, + "num_tokens": 4057215378.0, + "step": 7937 + }, + { + "epoch": 2.1465657111952408, + "grad_norm": 1.3155301809310913, + "learning_rate": 1.3467515493598743e-05, + "loss": 1.9168, + "mean_token_accuracy": 0.5539135336875916, + "num_tokens": 4057739652.0, + "step": 7938 + }, + { + "epoch": 2.1468361276365604, + "grad_norm": 1.1935100555419922, + "learning_rate": 1.3465999374930682e-05, + "loss": 1.9922, + "mean_token_accuracy": 0.5430953502655029, + "num_tokens": 4058263875.0, + "step": 7939 + }, + { + "epoch": 2.14710654407788, + "grad_norm": 1.1055198907852173, + "learning_rate": 1.3464483180598508e-05, + "loss": 1.8663, + "mean_token_accuracy": 0.5622702836990356, + "num_tokens": 4058788089.0, + "step": 7940 + }, + { + "epoch": 2.1473769605191997, + "grad_norm": 0.49283552169799805, + "learning_rate": 1.3462966910648749e-05, + "loss": 1.13, + "mean_token_accuracy": 0.7128648161888123, + "num_tokens": 4059312306.0, + "step": 7941 + }, + { + "epoch": 2.1476473769605193, + "grad_norm": 1.7194015979766846, + "learning_rate": 1.3461450565127915e-05, + "loss": 1.9224, + "mean_token_accuracy": 0.5719832181930542, + "num_tokens": 4059790500.0, + "step": 7942 + }, + { + "epoch": 2.147917793401839, + "grad_norm": 1.676129698753357, + "learning_rate": 1.3459934144082543e-05, + "loss": 1.8386, + "mean_token_accuracy": 0.5804938077926636, + "num_tokens": 4060263479.0, + "step": 7943 + }, + { + "epoch": 2.1481882098431586, + "grad_norm": 1.2093172073364258, + "learning_rate": 1.3458417647559154e-05, + "loss": 1.9518, + "mean_token_accuracy": 0.5651431083679199, + "num_tokens": 4060787673.0, + "step": 7944 + }, + { + "epoch": 2.1484586262844783, + "grad_norm": 1.2586734294891357, + "learning_rate": 1.3456901075604283e-05, + "loss": 1.9382, + "mean_token_accuracy": 0.5644808411598206, + "num_tokens": 4061263314.0, + "step": 7945 + }, + { + "epoch": 2.148729042725798, + "grad_norm": 1.137191653251648, + "learning_rate": 1.3455384428264462e-05, + "loss": 1.87, + "mean_token_accuracy": 0.568290114402771, + "num_tokens": 4061753843.0, + "step": 7946 + }, + { + "epoch": 2.1489994591671175, + "grad_norm": 2.614769220352173, + "learning_rate": 1.3453867705586227e-05, + "loss": 1.8024, + "mean_token_accuracy": 0.5903600454330444, + "num_tokens": 4062278113.0, + "step": 7947 + }, + { + "epoch": 2.149269875608437, + "grad_norm": 1.6799814701080322, + "learning_rate": 1.3452350907616115e-05, + "loss": 1.8827, + "mean_token_accuracy": 0.5817780494689941, + "num_tokens": 4062802313.0, + "step": 7948 + }, + { + "epoch": 2.149540292049757, + "grad_norm": 1.3710473775863647, + "learning_rate": 1.3450834034400664e-05, + "loss": 1.9382, + "mean_token_accuracy": 0.5755626559257507, + "num_tokens": 4063272614.0, + "step": 7949 + }, + { + "epoch": 2.149810708491076, + "grad_norm": 1.4153326749801636, + "learning_rate": 1.3449317085986416e-05, + "loss": 1.8779, + "mean_token_accuracy": 0.5775704383850098, + "num_tokens": 4063796896.0, + "step": 7950 + }, + { + "epoch": 2.150081124932396, + "grad_norm": 1.3203363418579102, + "learning_rate": 1.3447800062419918e-05, + "loss": 1.8236, + "mean_token_accuracy": 0.5792198181152344, + "num_tokens": 4064321130.0, + "step": 7951 + }, + { + "epoch": 2.1503515413737153, + "grad_norm": 1.1836402416229248, + "learning_rate": 1.3446282963747714e-05, + "loss": 1.8918, + "mean_token_accuracy": 0.5699769258499146, + "num_tokens": 4064771466.0, + "step": 7952 + }, + { + "epoch": 2.150621957815035, + "grad_norm": 1.64222252368927, + "learning_rate": 1.3444765790016356e-05, + "loss": 1.9249, + "mean_token_accuracy": 0.5530561208724976, + "num_tokens": 4065295737.0, + "step": 7953 + }, + { + "epoch": 2.1508923742563546, + "grad_norm": 1.3362904787063599, + "learning_rate": 1.3443248541272398e-05, + "loss": 1.8998, + "mean_token_accuracy": 0.5736573338508606, + "num_tokens": 4065819996.0, + "step": 7954 + }, + { + "epoch": 2.1511627906976742, + "grad_norm": 1.149718999862671, + "learning_rate": 1.3441731217562386e-05, + "loss": 1.915, + "mean_token_accuracy": 0.5537973642349243, + "num_tokens": 4066344097.0, + "step": 7955 + }, + { + "epoch": 2.151433207138994, + "grad_norm": 1.2669602632522583, + "learning_rate": 1.344021381893288e-05, + "loss": 1.9175, + "mean_token_accuracy": 0.5779985189437866, + "num_tokens": 4066856676.0, + "step": 7956 + }, + { + "epoch": 2.1517036235803135, + "grad_norm": 1.175460696220398, + "learning_rate": 1.3438696345430435e-05, + "loss": 1.8926, + "mean_token_accuracy": 0.5630464553833008, + "num_tokens": 4067380952.0, + "step": 7957 + }, + { + "epoch": 2.151974040021633, + "grad_norm": 1.0546276569366455, + "learning_rate": 1.3437178797101616e-05, + "loss": 1.9243, + "mean_token_accuracy": 0.5631852149963379, + "num_tokens": 4067905115.0, + "step": 7958 + }, + { + "epoch": 2.152244456462953, + "grad_norm": 0.9835465550422668, + "learning_rate": 1.3435661173992984e-05, + "loss": 1.9153, + "mean_token_accuracy": 0.5507388114929199, + "num_tokens": 4068383682.0, + "step": 7959 + }, + { + "epoch": 2.1525148729042725, + "grad_norm": 1.098004698753357, + "learning_rate": 1.3434143476151104e-05, + "loss": 1.7751, + "mean_token_accuracy": 0.5940288305282593, + "num_tokens": 4068907927.0, + "step": 7960 + }, + { + "epoch": 2.152785289345592, + "grad_norm": 0.5310194492340088, + "learning_rate": 1.3432625703622549e-05, + "loss": 1.1159, + "mean_token_accuracy": 0.6973373889923096, + "num_tokens": 4069432133.0, + "step": 7961 + }, + { + "epoch": 2.1530557057869117, + "grad_norm": 1.3249001502990723, + "learning_rate": 1.3431107856453878e-05, + "loss": 1.9287, + "mean_token_accuracy": 0.5490338802337646, + "num_tokens": 4069956318.0, + "step": 7962 + }, + { + "epoch": 2.1533261222282314, + "grad_norm": 1.116147756576538, + "learning_rate": 1.342958993469167e-05, + "loss": 1.8229, + "mean_token_accuracy": 0.5742273330688477, + "num_tokens": 4070480505.0, + "step": 7963 + }, + { + "epoch": 2.153596538669551, + "grad_norm": 1.3512260913848877, + "learning_rate": 1.3428071938382498e-05, + "loss": 1.8798, + "mean_token_accuracy": 0.5698069334030151, + "num_tokens": 4070951352.0, + "step": 7964 + }, + { + "epoch": 2.1538669551108707, + "grad_norm": 1.427072286605835, + "learning_rate": 1.3426553867572936e-05, + "loss": 1.8236, + "mean_token_accuracy": 0.5838645696640015, + "num_tokens": 4071475438.0, + "step": 7965 + }, + { + "epoch": 2.1541373715521903, + "grad_norm": 1.2496349811553955, + "learning_rate": 1.342503572230957e-05, + "loss": 1.8988, + "mean_token_accuracy": 0.5718743205070496, + "num_tokens": 4071999711.0, + "step": 7966 + }, + { + "epoch": 2.15440778799351, + "grad_norm": 8.539299964904785, + "learning_rate": 1.3423517502638974e-05, + "loss": 2.0344, + "mean_token_accuracy": 0.5838415622711182, + "num_tokens": 4072523886.0, + "step": 7967 + }, + { + "epoch": 2.1546782044348296, + "grad_norm": 2.0111804008483887, + "learning_rate": 1.3421999208607735e-05, + "loss": 1.9978, + "mean_token_accuracy": 0.5532965064048767, + "num_tokens": 4073036093.0, + "step": 7968 + }, + { + "epoch": 2.1549486208761492, + "grad_norm": 1.4701576232910156, + "learning_rate": 1.3420480840262435e-05, + "loss": 1.8436, + "mean_token_accuracy": 0.5599567294120789, + "num_tokens": 4073560282.0, + "step": 7969 + }, + { + "epoch": 2.155219037317469, + "grad_norm": 1.4526771306991577, + "learning_rate": 1.3418962397649669e-05, + "loss": 1.8742, + "mean_token_accuracy": 0.5537927150726318, + "num_tokens": 4074084568.0, + "step": 7970 + }, + { + "epoch": 2.1554894537587885, + "grad_norm": 1.7855645418167114, + "learning_rate": 1.3417443880816018e-05, + "loss": 1.8724, + "mean_token_accuracy": 0.5847268104553223, + "num_tokens": 4074608615.0, + "step": 7971 + }, + { + "epoch": 2.155759870200108, + "grad_norm": 1.5320310592651367, + "learning_rate": 1.3415925289808084e-05, + "loss": 2.1092, + "mean_token_accuracy": 0.5470162630081177, + "num_tokens": 4075089059.0, + "step": 7972 + }, + { + "epoch": 2.156030286641428, + "grad_norm": 1.3467570543289185, + "learning_rate": 1.3414406624672453e-05, + "loss": 2.0101, + "mean_token_accuracy": 0.5505171418190002, + "num_tokens": 4075613161.0, + "step": 7973 + }, + { + "epoch": 2.1563007030827475, + "grad_norm": 1.219281792640686, + "learning_rate": 1.341288788545573e-05, + "loss": 2.0767, + "mean_token_accuracy": 0.5330890417098999, + "num_tokens": 4076137351.0, + "step": 7974 + }, + { + "epoch": 2.156571119524067, + "grad_norm": 1.3504948616027832, + "learning_rate": 1.3411369072204507e-05, + "loss": 1.944, + "mean_token_accuracy": 0.558285117149353, + "num_tokens": 4076661635.0, + "step": 7975 + }, + { + "epoch": 2.1568415359653867, + "grad_norm": 1.1587164402008057, + "learning_rate": 1.3409850184965392e-05, + "loss": 1.8299, + "mean_token_accuracy": 0.572569727897644, + "num_tokens": 4077134324.0, + "step": 7976 + }, + { + "epoch": 2.1571119524067064, + "grad_norm": 1.4438523054122925, + "learning_rate": 1.340833122378499e-05, + "loss": 1.9453, + "mean_token_accuracy": 0.5553064346313477, + "num_tokens": 4077658604.0, + "step": 7977 + }, + { + "epoch": 2.157382368848026, + "grad_norm": 0.9948777556419373, + "learning_rate": 1.3406812188709899e-05, + "loss": 1.7603, + "mean_token_accuracy": 0.5698953866958618, + "num_tokens": 4078182830.0, + "step": 7978 + }, + { + "epoch": 2.1576527852893457, + "grad_norm": 1.0566799640655518, + "learning_rate": 1.3405293079786732e-05, + "loss": 1.9919, + "mean_token_accuracy": 0.5683799386024475, + "num_tokens": 4078707065.0, + "step": 7979 + }, + { + "epoch": 2.1579232017306653, + "grad_norm": 1.078908920288086, + "learning_rate": 1.3403773897062103e-05, + "loss": 1.992, + "mean_token_accuracy": 0.5511951446533203, + "num_tokens": 4079231324.0, + "step": 7980 + }, + { + "epoch": 2.158193618171985, + "grad_norm": 0.5615342855453491, + "learning_rate": 1.3402254640582617e-05, + "loss": 1.1489, + "mean_token_accuracy": 0.695074200630188, + "num_tokens": 4079704375.0, + "step": 7981 + }, + { + "epoch": 2.1584640346133046, + "grad_norm": 1.4921314716339111, + "learning_rate": 1.3400735310394896e-05, + "loss": 1.9259, + "mean_token_accuracy": 0.5578078031539917, + "num_tokens": 4080228639.0, + "step": 7982 + }, + { + "epoch": 2.1587344510546242, + "grad_norm": 1.4687316417694092, + "learning_rate": 1.3399215906545555e-05, + "loss": 1.9184, + "mean_token_accuracy": 0.5807052254676819, + "num_tokens": 4080703532.0, + "step": 7983 + }, + { + "epoch": 2.159004867495944, + "grad_norm": 1.1445788145065308, + "learning_rate": 1.3397696429081219e-05, + "loss": 2.0599, + "mean_token_accuracy": 0.5206972360610962, + "num_tokens": 4081227637.0, + "step": 7984 + }, + { + "epoch": 2.1592752839372635, + "grad_norm": 1.2633399963378906, + "learning_rate": 1.3396176878048502e-05, + "loss": 2.0147, + "mean_token_accuracy": 0.5540672540664673, + "num_tokens": 4081751779.0, + "step": 7985 + }, + { + "epoch": 2.159545700378583, + "grad_norm": 1.43986177444458, + "learning_rate": 1.339465725349403e-05, + "loss": 1.8639, + "mean_token_accuracy": 0.5854839086532593, + "num_tokens": 4082276036.0, + "step": 7986 + }, + { + "epoch": 2.159816116819903, + "grad_norm": 1.0580289363861084, + "learning_rate": 1.3393137555464432e-05, + "loss": 1.883, + "mean_token_accuracy": 0.569679319858551, + "num_tokens": 4082800254.0, + "step": 7987 + }, + { + "epoch": 2.1600865332612225, + "grad_norm": 1.325410008430481, + "learning_rate": 1.3391617784006339e-05, + "loss": 1.6919, + "mean_token_accuracy": 0.6308340430259705, + "num_tokens": 4083300160.0, + "step": 7988 + }, + { + "epoch": 2.160356949702542, + "grad_norm": 1.5277689695358276, + "learning_rate": 1.3390097939166375e-05, + "loss": 1.9536, + "mean_token_accuracy": 0.5636898279190063, + "num_tokens": 4083824400.0, + "step": 7989 + }, + { + "epoch": 2.1606273661438617, + "grad_norm": 1.1255437135696411, + "learning_rate": 1.3388578020991183e-05, + "loss": 1.9349, + "mean_token_accuracy": 0.5455197691917419, + "num_tokens": 4084348665.0, + "step": 7990 + }, + { + "epoch": 2.160897782585181, + "grad_norm": 1.2972530126571655, + "learning_rate": 1.3387058029527388e-05, + "loss": 1.9405, + "mean_token_accuracy": 0.5644040703773499, + "num_tokens": 4084872916.0, + "step": 7991 + }, + { + "epoch": 2.161168199026501, + "grad_norm": 1.203116774559021, + "learning_rate": 1.3385537964821634e-05, + "loss": 1.8615, + "mean_token_accuracy": 0.5722926259040833, + "num_tokens": 4085397135.0, + "step": 7992 + }, + { + "epoch": 2.1614386154678202, + "grad_norm": 1.0815107822418213, + "learning_rate": 1.3384017826920559e-05, + "loss": 2.008, + "mean_token_accuracy": 0.5512270927429199, + "num_tokens": 4085885608.0, + "step": 7993 + }, + { + "epoch": 2.16170903190914, + "grad_norm": 1.3931068181991577, + "learning_rate": 1.3382497615870808e-05, + "loss": 1.9608, + "mean_token_accuracy": 0.556311845779419, + "num_tokens": 4086409838.0, + "step": 7994 + }, + { + "epoch": 2.1619794483504595, + "grad_norm": 1.1030659675598145, + "learning_rate": 1.3380977331719022e-05, + "loss": 1.9128, + "mean_token_accuracy": 0.5614761114120483, + "num_tokens": 4086913353.0, + "step": 7995 + }, + { + "epoch": 2.162249864791779, + "grad_norm": 1.367691993713379, + "learning_rate": 1.337945697451185e-05, + "loss": 1.9629, + "mean_token_accuracy": 0.5688464641571045, + "num_tokens": 4087377091.0, + "step": 7996 + }, + { + "epoch": 2.162520281233099, + "grad_norm": 1.2667579650878906, + "learning_rate": 1.3377936544295941e-05, + "loss": 2.0131, + "mean_token_accuracy": 0.5474159717559814, + "num_tokens": 4087901168.0, + "step": 7997 + }, + { + "epoch": 2.1627906976744184, + "grad_norm": 1.0905905961990356, + "learning_rate": 1.3376416041117947e-05, + "loss": 1.7644, + "mean_token_accuracy": 0.5976235866546631, + "num_tokens": 4088421410.0, + "step": 7998 + }, + { + "epoch": 2.163061114115738, + "grad_norm": 1.160191297531128, + "learning_rate": 1.3374895465024518e-05, + "loss": 1.8616, + "mean_token_accuracy": 0.5648024082183838, + "num_tokens": 4088945407.0, + "step": 7999 + }, + { + "epoch": 2.1633315305570577, + "grad_norm": 1.0536267757415771, + "learning_rate": 1.3373374816062313e-05, + "loss": 1.9099, + "mean_token_accuracy": 0.5493000745773315, + "num_tokens": 4089469417.0, + "step": 8000 + }, + { + "epoch": 2.1636019469983774, + "grad_norm": 0.6133881211280823, + "learning_rate": 1.337185409427799e-05, + "loss": 1.0394, + "mean_token_accuracy": 0.7105716466903687, + "num_tokens": 4089948750.0, + "step": 8001 + }, + { + "epoch": 2.163872363439697, + "grad_norm": 2.3542497158050537, + "learning_rate": 1.3370333299718205e-05, + "loss": 1.7876, + "mean_token_accuracy": 0.5845592617988586, + "num_tokens": 4090467662.0, + "step": 8002 + }, + { + "epoch": 2.1641427798810167, + "grad_norm": 1.8288156986236572, + "learning_rate": 1.3368812432429628e-05, + "loss": 1.8508, + "mean_token_accuracy": 0.5661832094192505, + "num_tokens": 4090991925.0, + "step": 8003 + }, + { + "epoch": 2.1644131963223363, + "grad_norm": 1.3430025577545166, + "learning_rate": 1.3367291492458916e-05, + "loss": 1.7951, + "mean_token_accuracy": 0.5762612819671631, + "num_tokens": 4091516116.0, + "step": 8004 + }, + { + "epoch": 2.164683612763656, + "grad_norm": 1.2543106079101562, + "learning_rate": 1.3365770479852743e-05, + "loss": 1.9442, + "mean_token_accuracy": 0.5586521625518799, + "num_tokens": 4092040277.0, + "step": 8005 + }, + { + "epoch": 2.1649540292049756, + "grad_norm": 1.3926231861114502, + "learning_rate": 1.3364249394657769e-05, + "loss": 2.0385, + "mean_token_accuracy": 0.5389188528060913, + "num_tokens": 4092564415.0, + "step": 8006 + }, + { + "epoch": 2.1652244456462952, + "grad_norm": 1.1863411664962769, + "learning_rate": 1.3362728236920675e-05, + "loss": 2.0236, + "mean_token_accuracy": 0.5582010746002197, + "num_tokens": 4093041609.0, + "step": 8007 + }, + { + "epoch": 2.165494862087615, + "grad_norm": 1.5314929485321045, + "learning_rate": 1.3361207006688129e-05, + "loss": 1.9283, + "mean_token_accuracy": 0.5626356601715088, + "num_tokens": 4093565812.0, + "step": 8008 + }, + { + "epoch": 2.1657652785289345, + "grad_norm": 1.3709090948104858, + "learning_rate": 1.3359685704006807e-05, + "loss": 1.9987, + "mean_token_accuracy": 0.5290619134902954, + "num_tokens": 4094089928.0, + "step": 8009 + }, + { + "epoch": 2.166035694970254, + "grad_norm": 1.4223121404647827, + "learning_rate": 1.335816432892339e-05, + "loss": 1.9343, + "mean_token_accuracy": 0.5517935752868652, + "num_tokens": 4094614192.0, + "step": 8010 + }, + { + "epoch": 2.166306111411574, + "grad_norm": 1.5119985342025757, + "learning_rate": 1.3356642881484556e-05, + "loss": 1.8844, + "mean_token_accuracy": 0.5735135674476624, + "num_tokens": 4095138450.0, + "step": 8011 + }, + { + "epoch": 2.1665765278528935, + "grad_norm": 1.218291163444519, + "learning_rate": 1.3355121361736985e-05, + "loss": 1.8257, + "mean_token_accuracy": 0.5735389590263367, + "num_tokens": 4095651001.0, + "step": 8012 + }, + { + "epoch": 2.166846944294213, + "grad_norm": 1.3029898405075073, + "learning_rate": 1.3353599769727372e-05, + "loss": 1.7859, + "mean_token_accuracy": 0.5791017413139343, + "num_tokens": 4096175098.0, + "step": 8013 + }, + { + "epoch": 2.1671173607355327, + "grad_norm": 1.4347987174987793, + "learning_rate": 1.335207810550239e-05, + "loss": 1.8788, + "mean_token_accuracy": 0.5536113977432251, + "num_tokens": 4096668720.0, + "step": 8014 + }, + { + "epoch": 2.1673877771768524, + "grad_norm": 1.248725175857544, + "learning_rate": 1.3350556369108734e-05, + "loss": 1.8893, + "mean_token_accuracy": 0.5591453313827515, + "num_tokens": 4097171074.0, + "step": 8015 + }, + { + "epoch": 2.167658193618172, + "grad_norm": 1.1641682386398315, + "learning_rate": 1.33490345605931e-05, + "loss": 1.9062, + "mean_token_accuracy": 0.5616760849952698, + "num_tokens": 4097695333.0, + "step": 8016 + }, + { + "epoch": 2.1679286100594917, + "grad_norm": 1.2514437437057495, + "learning_rate": 1.3347512680002177e-05, + "loss": 1.943, + "mean_token_accuracy": 0.5580916404724121, + "num_tokens": 4098219609.0, + "step": 8017 + }, + { + "epoch": 2.1681990265008113, + "grad_norm": 1.5009623765945435, + "learning_rate": 1.334599072738266e-05, + "loss": 2.0408, + "mean_token_accuracy": 0.5360375642776489, + "num_tokens": 4098739399.0, + "step": 8018 + }, + { + "epoch": 2.168469442942131, + "grad_norm": 1.213365912437439, + "learning_rate": 1.334446870278125e-05, + "loss": 1.88, + "mean_token_accuracy": 0.5722456574440002, + "num_tokens": 4099211005.0, + "step": 8019 + }, + { + "epoch": 2.1687398593834506, + "grad_norm": 1.1549590826034546, + "learning_rate": 1.3342946606244646e-05, + "loss": 1.9037, + "mean_token_accuracy": 0.5741125345230103, + "num_tokens": 4099715725.0, + "step": 8020 + }, + { + "epoch": 2.1690102758247702, + "grad_norm": 0.6508855223655701, + "learning_rate": 1.3341424437819548e-05, + "loss": 1.1303, + "mean_token_accuracy": 0.6941061019897461, + "num_tokens": 4100239794.0, + "step": 8021 + }, + { + "epoch": 2.16928069226609, + "grad_norm": 2.0897600650787354, + "learning_rate": 1.3339902197552664e-05, + "loss": 2.0006, + "mean_token_accuracy": 0.566620409488678, + "num_tokens": 4100706947.0, + "step": 8022 + }, + { + "epoch": 2.1695511087074095, + "grad_norm": 1.8611940145492554, + "learning_rate": 1.3338379885490698e-05, + "loss": 1.9467, + "mean_token_accuracy": 0.5541229844093323, + "num_tokens": 4101231228.0, + "step": 8023 + }, + { + "epoch": 2.169821525148729, + "grad_norm": 1.2172433137893677, + "learning_rate": 1.333685750168036e-05, + "loss": 1.9016, + "mean_token_accuracy": 0.5563552975654602, + "num_tokens": 4101755272.0, + "step": 8024 + }, + { + "epoch": 2.170091941590049, + "grad_norm": 1.7699577808380127, + "learning_rate": 1.333533504616836e-05, + "loss": 1.92, + "mean_token_accuracy": 0.5770280957221985, + "num_tokens": 4102241709.0, + "step": 8025 + }, + { + "epoch": 2.1703623580313685, + "grad_norm": 1.864343285560608, + "learning_rate": 1.3333812519001419e-05, + "loss": 1.7774, + "mean_token_accuracy": 0.5795835256576538, + "num_tokens": 4102741030.0, + "step": 8026 + }, + { + "epoch": 2.170632774472688, + "grad_norm": 1.644881010055542, + "learning_rate": 1.3332289920226243e-05, + "loss": 1.8955, + "mean_token_accuracy": 0.5946998596191406, + "num_tokens": 4103256399.0, + "step": 8027 + }, + { + "epoch": 2.1709031909140077, + "grad_norm": 1.6122108697891235, + "learning_rate": 1.3330767249889554e-05, + "loss": 2.0014, + "mean_token_accuracy": 0.5462908744812012, + "num_tokens": 4103780662.0, + "step": 8028 + }, + { + "epoch": 2.1711736073553274, + "grad_norm": 1.3334304094314575, + "learning_rate": 1.3329244508038071e-05, + "loss": 1.8625, + "mean_token_accuracy": 0.5818188786506653, + "num_tokens": 4104288553.0, + "step": 8029 + }, + { + "epoch": 2.171444023796647, + "grad_norm": 1.5505491495132446, + "learning_rate": 1.3327721694718516e-05, + "loss": 1.9315, + "mean_token_accuracy": 0.5582995414733887, + "num_tokens": 4104812828.0, + "step": 8030 + }, + { + "epoch": 2.1717144402379667, + "grad_norm": 1.1930313110351562, + "learning_rate": 1.3326198809977613e-05, + "loss": 1.825, + "mean_token_accuracy": 0.5666424036026001, + "num_tokens": 4105337080.0, + "step": 8031 + }, + { + "epoch": 2.171984856679286, + "grad_norm": 1.1233062744140625, + "learning_rate": 1.3324675853862092e-05, + "loss": 1.8757, + "mean_token_accuracy": 0.5750418305397034, + "num_tokens": 4105861160.0, + "step": 8032 + }, + { + "epoch": 2.172255273120606, + "grad_norm": 1.3527228832244873, + "learning_rate": 1.332315282641868e-05, + "loss": 2.0515, + "mean_token_accuracy": 0.5577138662338257, + "num_tokens": 4106345034.0, + "step": 8033 + }, + { + "epoch": 2.172525689561925, + "grad_norm": 1.3510407209396362, + "learning_rate": 1.3321629727694108e-05, + "loss": 1.8598, + "mean_token_accuracy": 0.5670450925827026, + "num_tokens": 4106869190.0, + "step": 8034 + }, + { + "epoch": 2.172796106003245, + "grad_norm": 1.4506440162658691, + "learning_rate": 1.3320106557735105e-05, + "loss": 1.9143, + "mean_token_accuracy": 0.562341570854187, + "num_tokens": 4107393413.0, + "step": 8035 + }, + { + "epoch": 2.1730665224445644, + "grad_norm": 1.2939752340316772, + "learning_rate": 1.3318583316588412e-05, + "loss": 1.827, + "mean_token_accuracy": 0.6184021234512329, + "num_tokens": 4107852709.0, + "step": 8036 + }, + { + "epoch": 2.173336938885884, + "grad_norm": 1.3047747611999512, + "learning_rate": 1.3317060004300761e-05, + "loss": 1.8631, + "mean_token_accuracy": 0.5904349684715271, + "num_tokens": 4108314295.0, + "step": 8037 + }, + { + "epoch": 2.1736073553272037, + "grad_norm": 1.37784743309021, + "learning_rate": 1.3315536620918901e-05, + "loss": 1.9042, + "mean_token_accuracy": 0.5777413845062256, + "num_tokens": 4108773706.0, + "step": 8038 + }, + { + "epoch": 2.1738777717685234, + "grad_norm": 1.5038813352584839, + "learning_rate": 1.3314013166489561e-05, + "loss": 1.9375, + "mean_token_accuracy": 0.5483798980712891, + "num_tokens": 4109297849.0, + "step": 8039 + }, + { + "epoch": 2.174148188209843, + "grad_norm": 1.2554677724838257, + "learning_rate": 1.3312489641059498e-05, + "loss": 1.9326, + "mean_token_accuracy": 0.574337363243103, + "num_tokens": 4109807634.0, + "step": 8040 + }, + { + "epoch": 2.1744186046511627, + "grad_norm": 0.5622175931930542, + "learning_rate": 1.3310966044675449e-05, + "loss": 1.1864, + "mean_token_accuracy": 0.6784334182739258, + "num_tokens": 4110331866.0, + "step": 8041 + }, + { + "epoch": 2.1746890210924823, + "grad_norm": 2.7188851833343506, + "learning_rate": 1.3309442377384166e-05, + "loss": 1.9638, + "mean_token_accuracy": 0.5444877743721008, + "num_tokens": 4110856140.0, + "step": 8042 + }, + { + "epoch": 2.174959437533802, + "grad_norm": 2.23783278465271, + "learning_rate": 1.33079186392324e-05, + "loss": 1.8686, + "mean_token_accuracy": 0.5708655118942261, + "num_tokens": 4111380419.0, + "step": 8043 + }, + { + "epoch": 2.1752298539751216, + "grad_norm": 1.6266697645187378, + "learning_rate": 1.3306394830266903e-05, + "loss": 1.9722, + "mean_token_accuracy": 0.5653523802757263, + "num_tokens": 4111825531.0, + "step": 8044 + }, + { + "epoch": 2.1755002704164412, + "grad_norm": 1.416090488433838, + "learning_rate": 1.3304870950534427e-05, + "loss": 1.9709, + "mean_token_accuracy": 0.565239667892456, + "num_tokens": 4112291094.0, + "step": 8045 + }, + { + "epoch": 2.175770686857761, + "grad_norm": 1.6642566919326782, + "learning_rate": 1.3303347000081739e-05, + "loss": 1.9428, + "mean_token_accuracy": 0.5624141097068787, + "num_tokens": 4112815366.0, + "step": 8046 + }, + { + "epoch": 2.1760411032990805, + "grad_norm": 1.1048099994659424, + "learning_rate": 1.3301822978955588e-05, + "loss": 1.6254, + "mean_token_accuracy": 0.6076128482818604, + "num_tokens": 4113339559.0, + "step": 8047 + }, + { + "epoch": 2.1763115197404, + "grad_norm": 1.3790336847305298, + "learning_rate": 1.3300298887202736e-05, + "loss": 1.7902, + "mean_token_accuracy": 0.6063113808631897, + "num_tokens": 4113863792.0, + "step": 8048 + }, + { + "epoch": 2.17658193618172, + "grad_norm": 1.4232046604156494, + "learning_rate": 1.3298774724869952e-05, + "loss": 1.8059, + "mean_token_accuracy": 0.5781865119934082, + "num_tokens": 4114388067.0, + "step": 8049 + }, + { + "epoch": 2.1768523526230394, + "grad_norm": 1.1159981489181519, + "learning_rate": 1.3297250492003997e-05, + "loss": 1.9298, + "mean_token_accuracy": 0.5788314342498779, + "num_tokens": 4114912234.0, + "step": 8050 + }, + { + "epoch": 2.177122769064359, + "grad_norm": 1.3011349439620972, + "learning_rate": 1.329572618865164e-05, + "loss": 1.8335, + "mean_token_accuracy": 0.56775963306427, + "num_tokens": 4115436504.0, + "step": 8051 + }, + { + "epoch": 2.1773931855056787, + "grad_norm": 1.5454789400100708, + "learning_rate": 1.3294201814859658e-05, + "loss": 2.0282, + "mean_token_accuracy": 0.5417498350143433, + "num_tokens": 4115960770.0, + "step": 8052 + }, + { + "epoch": 2.1776636019469984, + "grad_norm": 1.1922721862792969, + "learning_rate": 1.3292677370674816e-05, + "loss": 1.9505, + "mean_token_accuracy": 0.5548611283302307, + "num_tokens": 4116485022.0, + "step": 8053 + }, + { + "epoch": 2.177934018388318, + "grad_norm": 1.540360450744629, + "learning_rate": 1.3291152856143885e-05, + "loss": 1.9306, + "mean_token_accuracy": 0.5523531436920166, + "num_tokens": 4117009092.0, + "step": 8054 + }, + { + "epoch": 2.1782044348296377, + "grad_norm": 1.5198177099227905, + "learning_rate": 1.328962827131365e-05, + "loss": 1.9329, + "mean_token_accuracy": 0.5670430064201355, + "num_tokens": 4117432532.0, + "step": 8055 + }, + { + "epoch": 2.1784748512709573, + "grad_norm": 1.1543525457382202, + "learning_rate": 1.328810361623089e-05, + "loss": 1.8622, + "mean_token_accuracy": 0.5580241680145264, + "num_tokens": 4117956796.0, + "step": 8056 + }, + { + "epoch": 2.178745267712277, + "grad_norm": 1.2595744132995605, + "learning_rate": 1.3286578890942377e-05, + "loss": 1.8037, + "mean_token_accuracy": 0.585955798625946, + "num_tokens": 4118480973.0, + "step": 8057 + }, + { + "epoch": 2.1790156841535966, + "grad_norm": 1.4970672130584717, + "learning_rate": 1.3285054095494898e-05, + "loss": 2.0189, + "mean_token_accuracy": 0.5449939370155334, + "num_tokens": 4118969863.0, + "step": 8058 + }, + { + "epoch": 2.1792861005949162, + "grad_norm": 1.4325101375579834, + "learning_rate": 1.3283529229935244e-05, + "loss": 1.9873, + "mean_token_accuracy": 0.5514519214630127, + "num_tokens": 4119451704.0, + "step": 8059 + }, + { + "epoch": 2.179556517036236, + "grad_norm": 1.3558311462402344, + "learning_rate": 1.3282004294310193e-05, + "loss": 1.9271, + "mean_token_accuracy": 0.556458592414856, + "num_tokens": 4119975974.0, + "step": 8060 + }, + { + "epoch": 2.1798269334775555, + "grad_norm": 0.5333354473114014, + "learning_rate": 1.3280479288666545e-05, + "loss": 1.014, + "mean_token_accuracy": 0.7235997915267944, + "num_tokens": 4120462377.0, + "step": 8061 + }, + { + "epoch": 2.180097349918875, + "grad_norm": 1.6352510452270508, + "learning_rate": 1.327895421305108e-05, + "loss": 1.8846, + "mean_token_accuracy": 0.5672029852867126, + "num_tokens": 4120986619.0, + "step": 8062 + }, + { + "epoch": 2.180367766360195, + "grad_norm": 1.5251226425170898, + "learning_rate": 1.3277429067510603e-05, + "loss": 1.9133, + "mean_token_accuracy": 0.5785123109817505, + "num_tokens": 4121503456.0, + "step": 8063 + }, + { + "epoch": 2.1806381828015144, + "grad_norm": 1.3974741697311401, + "learning_rate": 1.3275903852091905e-05, + "loss": 1.9403, + "mean_token_accuracy": 0.5708114504814148, + "num_tokens": 4122027649.0, + "step": 8064 + }, + { + "epoch": 2.180908599242834, + "grad_norm": 1.2352427244186401, + "learning_rate": 1.327437856684178e-05, + "loss": 1.8624, + "mean_token_accuracy": 0.5557045936584473, + "num_tokens": 4122551861.0, + "step": 8065 + }, + { + "epoch": 2.1811790156841537, + "grad_norm": 1.4956551790237427, + "learning_rate": 1.327285321180703e-05, + "loss": 1.9324, + "mean_token_accuracy": 0.5545955300331116, + "num_tokens": 4123075991.0, + "step": 8066 + }, + { + "epoch": 2.1814494321254734, + "grad_norm": 1.3693734407424927, + "learning_rate": 1.3271327787034466e-05, + "loss": 1.8592, + "mean_token_accuracy": 0.5781394243240356, + "num_tokens": 4123600214.0, + "step": 8067 + }, + { + "epoch": 2.181719848566793, + "grad_norm": 1.2334386110305786, + "learning_rate": 1.326980229257088e-05, + "loss": 1.7855, + "mean_token_accuracy": 0.5870826840400696, + "num_tokens": 4124124494.0, + "step": 8068 + }, + { + "epoch": 2.1819902650081127, + "grad_norm": 1.4643571376800537, + "learning_rate": 1.3268276728463092e-05, + "loss": 1.8519, + "mean_token_accuracy": 0.5526375770568848, + "num_tokens": 4124648761.0, + "step": 8069 + }, + { + "epoch": 2.1822606814494323, + "grad_norm": 1.513065218925476, + "learning_rate": 1.3266751094757897e-05, + "loss": 1.9566, + "mean_token_accuracy": 0.5546592473983765, + "num_tokens": 4125172916.0, + "step": 8070 + }, + { + "epoch": 2.182531097890752, + "grad_norm": 1.5503877401351929, + "learning_rate": 1.3265225391502114e-05, + "loss": 1.78, + "mean_token_accuracy": 0.5931316018104553, + "num_tokens": 4125687216.0, + "step": 8071 + }, + { + "epoch": 2.1828015143320716, + "grad_norm": 1.48318612575531, + "learning_rate": 1.3263699618742554e-05, + "loss": 1.9923, + "mean_token_accuracy": 0.5560083985328674, + "num_tokens": 4126211349.0, + "step": 8072 + }, + { + "epoch": 2.183071930773391, + "grad_norm": 1.3011831045150757, + "learning_rate": 1.3262173776526032e-05, + "loss": 1.841, + "mean_token_accuracy": 0.5811431407928467, + "num_tokens": 4126735634.0, + "step": 8073 + }, + { + "epoch": 2.183342347214711, + "grad_norm": 1.284864068031311, + "learning_rate": 1.3260647864899367e-05, + "loss": 1.8922, + "mean_token_accuracy": 0.5640233159065247, + "num_tokens": 4127259914.0, + "step": 8074 + }, + { + "epoch": 2.18361276365603, + "grad_norm": 1.3469651937484741, + "learning_rate": 1.3259121883909378e-05, + "loss": 1.9148, + "mean_token_accuracy": 0.5738621354103088, + "num_tokens": 4127784034.0, + "step": 8075 + }, + { + "epoch": 2.1838831800973497, + "grad_norm": 1.1351780891418457, + "learning_rate": 1.3257595833602887e-05, + "loss": 1.85, + "mean_token_accuracy": 0.5685074329376221, + "num_tokens": 4128308304.0, + "step": 8076 + }, + { + "epoch": 2.1841535965386694, + "grad_norm": 1.3786636590957642, + "learning_rate": 1.3256069714026713e-05, + "loss": 1.9343, + "mean_token_accuracy": 0.5691307783126831, + "num_tokens": 4128832510.0, + "step": 8077 + }, + { + "epoch": 2.184424012979989, + "grad_norm": 1.2710480690002441, + "learning_rate": 1.3254543525227688e-05, + "loss": 1.8674, + "mean_token_accuracy": 0.5793721675872803, + "num_tokens": 4129350707.0, + "step": 8078 + }, + { + "epoch": 2.1846944294213086, + "grad_norm": 1.1495435237884521, + "learning_rate": 1.3253017267252637e-05, + "loss": 1.881, + "mean_token_accuracy": 0.5839412212371826, + "num_tokens": 4129874907.0, + "step": 8079 + }, + { + "epoch": 2.1849648458626283, + "grad_norm": 1.201625943183899, + "learning_rate": 1.325149094014839e-05, + "loss": 2.003, + "mean_token_accuracy": 0.5570224523544312, + "num_tokens": 4130375780.0, + "step": 8080 + }, + { + "epoch": 2.185235262303948, + "grad_norm": 0.5507093667984009, + "learning_rate": 1.324996454396178e-05, + "loss": 1.1049, + "mean_token_accuracy": 0.7008159160614014, + "num_tokens": 4130900008.0, + "step": 8081 + }, + { + "epoch": 2.1855056787452676, + "grad_norm": 1.6088942289352417, + "learning_rate": 1.3248438078739645e-05, + "loss": 1.7205, + "mean_token_accuracy": 0.607007622718811, + "num_tokens": 4131424277.0, + "step": 8082 + }, + { + "epoch": 2.185776095186587, + "grad_norm": 1.2506104707717896, + "learning_rate": 1.3246911544528812e-05, + "loss": 1.8604, + "mean_token_accuracy": 0.5624240636825562, + "num_tokens": 4131948533.0, + "step": 8083 + }, + { + "epoch": 2.186046511627907, + "grad_norm": 1.4047964811325073, + "learning_rate": 1.3245384941376127e-05, + "loss": 1.9343, + "mean_token_accuracy": 0.5682365894317627, + "num_tokens": 4132472735.0, + "step": 8084 + }, + { + "epoch": 2.1863169280692265, + "grad_norm": 1.2730565071105957, + "learning_rate": 1.3243858269328429e-05, + "loss": 1.8552, + "mean_token_accuracy": 0.5657052993774414, + "num_tokens": 4132996806.0, + "step": 8085 + }, + { + "epoch": 2.186587344510546, + "grad_norm": 1.0144498348236084, + "learning_rate": 1.324233152843256e-05, + "loss": 1.913, + "mean_token_accuracy": 0.5664663314819336, + "num_tokens": 4133520982.0, + "step": 8086 + }, + { + "epoch": 2.186857760951866, + "grad_norm": 1.1830097436904907, + "learning_rate": 1.3240804718735365e-05, + "loss": 2.0346, + "mean_token_accuracy": 0.5419237613677979, + "num_tokens": 4134042214.0, + "step": 8087 + }, + { + "epoch": 2.1871281773931854, + "grad_norm": 1.2450767755508423, + "learning_rate": 1.3239277840283691e-05, + "loss": 1.817, + "mean_token_accuracy": 0.5648213624954224, + "num_tokens": 4134566289.0, + "step": 8088 + }, + { + "epoch": 2.187398593834505, + "grad_norm": 1.1480005979537964, + "learning_rate": 1.323775089312439e-05, + "loss": 1.9613, + "mean_token_accuracy": 0.566054105758667, + "num_tokens": 4135090538.0, + "step": 8089 + }, + { + "epoch": 2.1876690102758247, + "grad_norm": 1.2145476341247559, + "learning_rate": 1.323622387730431e-05, + "loss": 2.0174, + "mean_token_accuracy": 0.5464264154434204, + "num_tokens": 4135614451.0, + "step": 8090 + }, + { + "epoch": 2.1879394267171444, + "grad_norm": 1.0608549118041992, + "learning_rate": 1.3234696792870306e-05, + "loss": 1.948, + "mean_token_accuracy": 0.5512300133705139, + "num_tokens": 4136138708.0, + "step": 8091 + }, + { + "epoch": 2.188209843158464, + "grad_norm": 1.2723528146743774, + "learning_rate": 1.3233169639869231e-05, + "loss": 1.8868, + "mean_token_accuracy": 0.5828354358673096, + "num_tokens": 4136618219.0, + "step": 8092 + }, + { + "epoch": 2.1884802595997837, + "grad_norm": 1.2114641666412354, + "learning_rate": 1.3231642418347943e-05, + "loss": 1.8281, + "mean_token_accuracy": 0.5705346465110779, + "num_tokens": 4137142427.0, + "step": 8093 + }, + { + "epoch": 2.1887506760411033, + "grad_norm": 0.9515162706375122, + "learning_rate": 1.3230115128353303e-05, + "loss": 1.9287, + "mean_token_accuracy": 0.5606156587600708, + "num_tokens": 4137666616.0, + "step": 8094 + }, + { + "epoch": 2.189021092482423, + "grad_norm": 1.1483328342437744, + "learning_rate": 1.3228587769932175e-05, + "loss": 1.9063, + "mean_token_accuracy": 0.5609378814697266, + "num_tokens": 4138190715.0, + "step": 8095 + }, + { + "epoch": 2.1892915089237426, + "grad_norm": 1.1135889291763306, + "learning_rate": 1.3227060343131419e-05, + "loss": 1.9058, + "mean_token_accuracy": 0.5628846287727356, + "num_tokens": 4138714954.0, + "step": 8096 + }, + { + "epoch": 2.1895619253650622, + "grad_norm": 0.9751259088516235, + "learning_rate": 1.3225532847997902e-05, + "loss": 1.8754, + "mean_token_accuracy": 0.570436954498291, + "num_tokens": 4139239129.0, + "step": 8097 + }, + { + "epoch": 2.189832341806382, + "grad_norm": 1.259332537651062, + "learning_rate": 1.3224005284578494e-05, + "loss": 1.8528, + "mean_token_accuracy": 0.5449689626693726, + "num_tokens": 4139763235.0, + "step": 8098 + }, + { + "epoch": 2.1901027582477015, + "grad_norm": 1.0038089752197266, + "learning_rate": 1.3222477652920063e-05, + "loss": 1.8442, + "mean_token_accuracy": 0.5649415254592896, + "num_tokens": 4140243848.0, + "step": 8099 + }, + { + "epoch": 2.190373174689021, + "grad_norm": 1.1452263593673706, + "learning_rate": 1.3220949953069481e-05, + "loss": 1.9219, + "mean_token_accuracy": 0.5314699411392212, + "num_tokens": 4140748986.0, + "step": 8100 + }, + { + "epoch": 2.190643591130341, + "grad_norm": 0.6019266843795776, + "learning_rate": 1.321942218507362e-05, + "loss": 1.0768, + "mean_token_accuracy": 0.7091261148452759, + "num_tokens": 4141273216.0, + "step": 8101 + }, + { + "epoch": 2.1909140075716604, + "grad_norm": 1.8039202690124512, + "learning_rate": 1.3217894348979366e-05, + "loss": 1.8748, + "mean_token_accuracy": 0.5807390809059143, + "num_tokens": 4141797457.0, + "step": 8102 + }, + { + "epoch": 2.19118442401298, + "grad_norm": 1.7063937187194824, + "learning_rate": 1.3216366444833585e-05, + "loss": 1.9103, + "mean_token_accuracy": 0.5721025466918945, + "num_tokens": 4142321682.0, + "step": 8103 + }, + { + "epoch": 2.1914548404542997, + "grad_norm": 1.1456772089004517, + "learning_rate": 1.3214838472683165e-05, + "loss": 1.9644, + "mean_token_accuracy": 0.5551273822784424, + "num_tokens": 4142845766.0, + "step": 8104 + }, + { + "epoch": 2.1917252568956194, + "grad_norm": 1.3865677118301392, + "learning_rate": 1.3213310432574991e-05, + "loss": 1.9689, + "mean_token_accuracy": 0.5570439100265503, + "num_tokens": 4143369957.0, + "step": 8105 + }, + { + "epoch": 2.191995673336939, + "grad_norm": 1.417508602142334, + "learning_rate": 1.321178232455594e-05, + "loss": 1.735, + "mean_token_accuracy": 0.6225507259368896, + "num_tokens": 4143894181.0, + "step": 8106 + }, + { + "epoch": 2.1922660897782587, + "grad_norm": 1.480896234512329, + "learning_rate": 1.3210254148672905e-05, + "loss": 1.9219, + "mean_token_accuracy": 0.5485265254974365, + "num_tokens": 4144418449.0, + "step": 8107 + }, + { + "epoch": 2.1925365062195783, + "grad_norm": 1.458484172821045, + "learning_rate": 1.320872590497277e-05, + "loss": 1.8889, + "mean_token_accuracy": 0.564630389213562, + "num_tokens": 4144942732.0, + "step": 8108 + }, + { + "epoch": 2.192806922660898, + "grad_norm": 1.2855722904205322, + "learning_rate": 1.3207197593502432e-05, + "loss": 1.9199, + "mean_token_accuracy": 0.5659034252166748, + "num_tokens": 4145467001.0, + "step": 8109 + }, + { + "epoch": 2.1930773391022176, + "grad_norm": 1.3631469011306763, + "learning_rate": 1.3205669214308778e-05, + "loss": 2.0186, + "mean_token_accuracy": 0.5562156438827515, + "num_tokens": 4145991257.0, + "step": 8110 + }, + { + "epoch": 2.1933477555435372, + "grad_norm": 1.4676105976104736, + "learning_rate": 1.3204140767438709e-05, + "loss": 1.946, + "mean_token_accuracy": 0.5664293766021729, + "num_tokens": 4146446772.0, + "step": 8111 + }, + { + "epoch": 2.193618171984857, + "grad_norm": 1.1682227849960327, + "learning_rate": 1.320261225293912e-05, + "loss": 1.9342, + "mean_token_accuracy": 0.5672762393951416, + "num_tokens": 4146971050.0, + "step": 8112 + }, + { + "epoch": 2.1938885884261765, + "grad_norm": 1.242397665977478, + "learning_rate": 1.3201083670856908e-05, + "loss": 1.9561, + "mean_token_accuracy": 0.565325140953064, + "num_tokens": 4147495315.0, + "step": 8113 + }, + { + "epoch": 2.1941590048674957, + "grad_norm": 1.5619783401489258, + "learning_rate": 1.3199555021238972e-05, + "loss": 1.8437, + "mean_token_accuracy": 0.5802838206291199, + "num_tokens": 4147966162.0, + "step": 8114 + }, + { + "epoch": 2.194429421308816, + "grad_norm": 1.5983080863952637, + "learning_rate": 1.3198026304132225e-05, + "loss": 2.0353, + "mean_token_accuracy": 0.5532523393630981, + "num_tokens": 4148469369.0, + "step": 8115 + }, + { + "epoch": 2.194699837750135, + "grad_norm": 1.1293774843215942, + "learning_rate": 1.3196497519583564e-05, + "loss": 1.9226, + "mean_token_accuracy": 0.5641503930091858, + "num_tokens": 4148993627.0, + "step": 8116 + }, + { + "epoch": 2.1949702541914546, + "grad_norm": 1.210387110710144, + "learning_rate": 1.3194968667639901e-05, + "loss": 1.9012, + "mean_token_accuracy": 0.5665366649627686, + "num_tokens": 4149517768.0, + "step": 8117 + }, + { + "epoch": 2.1952406706327743, + "grad_norm": 1.3203891515731812, + "learning_rate": 1.3193439748348145e-05, + "loss": 1.9752, + "mean_token_accuracy": 0.5399349927902222, + "num_tokens": 4150041992.0, + "step": 8118 + }, + { + "epoch": 2.195511087074094, + "grad_norm": 1.2482913732528687, + "learning_rate": 1.3191910761755204e-05, + "loss": 1.909, + "mean_token_accuracy": 0.5502128005027771, + "num_tokens": 4150566223.0, + "step": 8119 + }, + { + "epoch": 2.1957815035154136, + "grad_norm": 1.1199867725372314, + "learning_rate": 1.3190381707907997e-05, + "loss": 1.7546, + "mean_token_accuracy": 0.5855124592781067, + "num_tokens": 4151090504.0, + "step": 8120 + }, + { + "epoch": 2.196051919956733, + "grad_norm": 0.5642635226249695, + "learning_rate": 1.3188852586853437e-05, + "loss": 1.1742, + "mean_token_accuracy": 0.6664593815803528, + "num_tokens": 4151603833.0, + "step": 8121 + }, + { + "epoch": 2.196322336398053, + "grad_norm": 1.679110050201416, + "learning_rate": 1.3187323398638445e-05, + "loss": 1.9905, + "mean_token_accuracy": 0.5535926818847656, + "num_tokens": 4152128100.0, + "step": 8122 + }, + { + "epoch": 2.1965927528393725, + "grad_norm": 1.5000720024108887, + "learning_rate": 1.3185794143309935e-05, + "loss": 2.0151, + "mean_token_accuracy": 0.5537258386611938, + "num_tokens": 4152597481.0, + "step": 8123 + }, + { + "epoch": 2.196863169280692, + "grad_norm": 1.373390793800354, + "learning_rate": 1.3184264820914835e-05, + "loss": 1.7946, + "mean_token_accuracy": 0.5819977521896362, + "num_tokens": 4153118997.0, + "step": 8124 + }, + { + "epoch": 2.197133585722012, + "grad_norm": 1.3137744665145874, + "learning_rate": 1.3182735431500067e-05, + "loss": 1.9357, + "mean_token_accuracy": 0.5627672672271729, + "num_tokens": 4153643229.0, + "step": 8125 + }, + { + "epoch": 2.1974040021633314, + "grad_norm": 1.1706026792526245, + "learning_rate": 1.3181205975112556e-05, + "loss": 1.8444, + "mean_token_accuracy": 0.5691574215888977, + "num_tokens": 4154167337.0, + "step": 8126 + }, + { + "epoch": 2.197674418604651, + "grad_norm": 1.431169867515564, + "learning_rate": 1.317967645179923e-05, + "loss": 1.8605, + "mean_token_accuracy": 0.5767225623130798, + "num_tokens": 4154691587.0, + "step": 8127 + }, + { + "epoch": 2.1979448350459707, + "grad_norm": 1.440301537513733, + "learning_rate": 1.3178146861607022e-05, + "loss": 2.0203, + "mean_token_accuracy": 0.5486193895339966, + "num_tokens": 4155215778.0, + "step": 8128 + }, + { + "epoch": 2.1982152514872904, + "grad_norm": 1.2514946460723877, + "learning_rate": 1.3176617204582865e-05, + "loss": 2.0217, + "mean_token_accuracy": 0.5409851670265198, + "num_tokens": 4155695339.0, + "step": 8129 + }, + { + "epoch": 2.19848566792861, + "grad_norm": 1.262183427810669, + "learning_rate": 1.3175087480773686e-05, + "loss": 1.95, + "mean_token_accuracy": 0.5600253343582153, + "num_tokens": 4156217686.0, + "step": 8130 + }, + { + "epoch": 2.1987560843699296, + "grad_norm": 1.6162278652191162, + "learning_rate": 1.3173557690226428e-05, + "loss": 1.9171, + "mean_token_accuracy": 0.565301239490509, + "num_tokens": 4156741913.0, + "step": 8131 + }, + { + "epoch": 2.1990265008112493, + "grad_norm": 1.2744678258895874, + "learning_rate": 1.3172027832988034e-05, + "loss": 1.8229, + "mean_token_accuracy": 0.5662966370582581, + "num_tokens": 4157236372.0, + "step": 8132 + }, + { + "epoch": 2.199296917252569, + "grad_norm": 1.4160058498382568, + "learning_rate": 1.3170497909105433e-05, + "loss": 1.8295, + "mean_token_accuracy": 0.5858749151229858, + "num_tokens": 4157760541.0, + "step": 8133 + }, + { + "epoch": 2.1995673336938886, + "grad_norm": 1.1526751518249512, + "learning_rate": 1.3168967918625573e-05, + "loss": 1.7967, + "mean_token_accuracy": 0.5861817598342896, + "num_tokens": 4158284818.0, + "step": 8134 + }, + { + "epoch": 2.199837750135208, + "grad_norm": 1.3250309228897095, + "learning_rate": 1.31674378615954e-05, + "loss": 1.9704, + "mean_token_accuracy": 0.5612837672233582, + "num_tokens": 4158808999.0, + "step": 8135 + }, + { + "epoch": 2.200108166576528, + "grad_norm": 1.4288774728775024, + "learning_rate": 1.3165907738061858e-05, + "loss": 1.9247, + "mean_token_accuracy": 0.5650839805603027, + "num_tokens": 4159309346.0, + "step": 8136 + }, + { + "epoch": 2.2003785830178475, + "grad_norm": 1.2830071449279785, + "learning_rate": 1.3164377548071899e-05, + "loss": 1.9035, + "mean_token_accuracy": 0.5674088001251221, + "num_tokens": 4159795804.0, + "step": 8137 + }, + { + "epoch": 2.200648999459167, + "grad_norm": 1.5353950262069702, + "learning_rate": 1.3162847291672474e-05, + "loss": 1.7972, + "mean_token_accuracy": 0.57415771484375, + "num_tokens": 4160319963.0, + "step": 8138 + }, + { + "epoch": 2.200919415900487, + "grad_norm": 1.2869235277175903, + "learning_rate": 1.3161316968910528e-05, + "loss": 1.9145, + "mean_token_accuracy": 0.5780191421508789, + "num_tokens": 4160813714.0, + "step": 8139 + }, + { + "epoch": 2.2011898323418064, + "grad_norm": 1.3920331001281738, + "learning_rate": 1.3159786579833025e-05, + "loss": 1.7641, + "mean_token_accuracy": 0.6011860370635986, + "num_tokens": 4161337956.0, + "step": 8140 + }, + { + "epoch": 2.201460248783126, + "grad_norm": 0.444046288728714, + "learning_rate": 1.3158256124486918e-05, + "loss": 1.0175, + "mean_token_accuracy": 0.7224814891815186, + "num_tokens": 4161862126.0, + "step": 8141 + }, + { + "epoch": 2.2017306652244457, + "grad_norm": 1.6921114921569824, + "learning_rate": 1.3156725602919167e-05, + "loss": 1.8728, + "mean_token_accuracy": 0.5569742918014526, + "num_tokens": 4162363353.0, + "step": 8142 + }, + { + "epoch": 2.2020010816657654, + "grad_norm": 1.5861101150512695, + "learning_rate": 1.3155195015176727e-05, + "loss": 1.8991, + "mean_token_accuracy": 0.554112434387207, + "num_tokens": 4162887514.0, + "step": 8143 + }, + { + "epoch": 2.202271498107085, + "grad_norm": 1.099871277809143, + "learning_rate": 1.3153664361306568e-05, + "loss": 1.9082, + "mean_token_accuracy": 0.5566306710243225, + "num_tokens": 4163411694.0, + "step": 8144 + }, + { + "epoch": 2.2025419145484046, + "grad_norm": 1.3328691720962524, + "learning_rate": 1.3152133641355656e-05, + "loss": 1.9089, + "mean_token_accuracy": 0.5638406276702881, + "num_tokens": 4163897153.0, + "step": 8145 + }, + { + "epoch": 2.2028123309897243, + "grad_norm": 1.5318585634231567, + "learning_rate": 1.3150602855370953e-05, + "loss": 2.0179, + "mean_token_accuracy": 0.5417574644088745, + "num_tokens": 4164421366.0, + "step": 8146 + }, + { + "epoch": 2.203082747431044, + "grad_norm": 1.2392827272415161, + "learning_rate": 1.314907200339943e-05, + "loss": 1.9719, + "mean_token_accuracy": 0.5585833787918091, + "num_tokens": 4164926904.0, + "step": 8147 + }, + { + "epoch": 2.2033531638723636, + "grad_norm": 1.1205064058303833, + "learning_rate": 1.3147541085488058e-05, + "loss": 1.8509, + "mean_token_accuracy": 0.5757560729980469, + "num_tokens": 4165451127.0, + "step": 8148 + }, + { + "epoch": 2.203623580313683, + "grad_norm": 0.9592485427856445, + "learning_rate": 1.314601010168381e-05, + "loss": 2.0533, + "mean_token_accuracy": 0.5243419408798218, + "num_tokens": 4165975400.0, + "step": 8149 + }, + { + "epoch": 2.203893996755003, + "grad_norm": 1.044542670249939, + "learning_rate": 1.3144479052033661e-05, + "loss": 1.6286, + "mean_token_accuracy": 0.616303563117981, + "num_tokens": 4166499535.0, + "step": 8150 + }, + { + "epoch": 2.2041644131963225, + "grad_norm": 1.183823585510254, + "learning_rate": 1.3142947936584588e-05, + "loss": 1.9277, + "mean_token_accuracy": 0.5630173683166504, + "num_tokens": 4167002369.0, + "step": 8151 + }, + { + "epoch": 2.204434829637642, + "grad_norm": 1.2258806228637695, + "learning_rate": 1.3141416755383569e-05, + "loss": 1.8989, + "mean_token_accuracy": 0.5540469288825989, + "num_tokens": 4167526424.0, + "step": 8152 + }, + { + "epoch": 2.204705246078962, + "grad_norm": 1.1471387147903442, + "learning_rate": 1.313988550847759e-05, + "loss": 2.0063, + "mean_token_accuracy": 0.5426040887832642, + "num_tokens": 4168050639.0, + "step": 8153 + }, + { + "epoch": 2.2049756625202814, + "grad_norm": 1.3718839883804321, + "learning_rate": 1.313835419591363e-05, + "loss": 1.8984, + "mean_token_accuracy": 0.5541787147521973, + "num_tokens": 4168574854.0, + "step": 8154 + }, + { + "epoch": 2.2052460789616006, + "grad_norm": 1.1567696332931519, + "learning_rate": 1.313682281773867e-05, + "loss": 1.8361, + "mean_token_accuracy": 0.5686042308807373, + "num_tokens": 4169099092.0, + "step": 8155 + }, + { + "epoch": 2.2055164954029207, + "grad_norm": 1.2234262228012085, + "learning_rate": 1.3135291373999708e-05, + "loss": 1.9055, + "mean_token_accuracy": 0.5707724094390869, + "num_tokens": 4169582346.0, + "step": 8156 + }, + { + "epoch": 2.20578691184424, + "grad_norm": 1.507058024406433, + "learning_rate": 1.3133759864743722e-05, + "loss": 1.9474, + "mean_token_accuracy": 0.5739390254020691, + "num_tokens": 4170044639.0, + "step": 8157 + }, + { + "epoch": 2.2060573282855596, + "grad_norm": 1.2893282175064087, + "learning_rate": 1.313222829001771e-05, + "loss": 1.9196, + "mean_token_accuracy": 0.5498659610748291, + "num_tokens": 4170568895.0, + "step": 8158 + }, + { + "epoch": 2.206327744726879, + "grad_norm": 1.0328925848007202, + "learning_rate": 1.3130696649868663e-05, + "loss": 1.8938, + "mean_token_accuracy": 0.5742125511169434, + "num_tokens": 4171065798.0, + "step": 8159 + }, + { + "epoch": 2.206598161168199, + "grad_norm": 1.1403145790100098, + "learning_rate": 1.312916494434358e-05, + "loss": 1.9565, + "mean_token_accuracy": 0.5910938382148743, + "num_tokens": 4171525888.0, + "step": 8160 + }, + { + "epoch": 2.2068685776095185, + "grad_norm": 0.49195441603660583, + "learning_rate": 1.3127633173489454e-05, + "loss": 1.0481, + "mean_token_accuracy": 0.733439028263092, + "num_tokens": 4172003068.0, + "step": 8161 + }, + { + "epoch": 2.207138994050838, + "grad_norm": 1.3388346433639526, + "learning_rate": 1.3126101337353286e-05, + "loss": 1.8514, + "mean_token_accuracy": 0.5701699256896973, + "num_tokens": 4172527348.0, + "step": 8162 + }, + { + "epoch": 2.2074094104921578, + "grad_norm": 1.2757089138031006, + "learning_rate": 1.3124569435982074e-05, + "loss": 1.9962, + "mean_token_accuracy": 0.5395890474319458, + "num_tokens": 4173013851.0, + "step": 8163 + }, + { + "epoch": 2.2076798269334774, + "grad_norm": 1.037895917892456, + "learning_rate": 1.3123037469422826e-05, + "loss": 1.69, + "mean_token_accuracy": 0.6256422996520996, + "num_tokens": 4173528361.0, + "step": 8164 + }, + { + "epoch": 2.207950243374797, + "grad_norm": 1.162319540977478, + "learning_rate": 1.3121505437722541e-05, + "loss": 1.9433, + "mean_token_accuracy": 0.5679569244384766, + "num_tokens": 4174050462.0, + "step": 8165 + }, + { + "epoch": 2.2082206598161167, + "grad_norm": 1.4457817077636719, + "learning_rate": 1.3119973340928231e-05, + "loss": 1.9145, + "mean_token_accuracy": 0.5625299215316772, + "num_tokens": 4174574664.0, + "step": 8166 + }, + { + "epoch": 2.2084910762574363, + "grad_norm": 1.333038091659546, + "learning_rate": 1.3118441179086909e-05, + "loss": 1.9004, + "mean_token_accuracy": 0.58150315284729, + "num_tokens": 4175098869.0, + "step": 8167 + }, + { + "epoch": 2.208761492698756, + "grad_norm": 1.2129420042037964, + "learning_rate": 1.311690895224558e-05, + "loss": 1.8712, + "mean_token_accuracy": 0.5599769353866577, + "num_tokens": 4175622996.0, + "step": 8168 + }, + { + "epoch": 2.2090319091400756, + "grad_norm": 1.5062741041183472, + "learning_rate": 1.3115376660451258e-05, + "loss": 2.0292, + "mean_token_accuracy": 0.5451735258102417, + "num_tokens": 4176109483.0, + "step": 8169 + }, + { + "epoch": 2.2093023255813953, + "grad_norm": 1.1952202320098877, + "learning_rate": 1.3113844303750958e-05, + "loss": 1.8155, + "mean_token_accuracy": 0.5999958515167236, + "num_tokens": 4176573093.0, + "step": 8170 + }, + { + "epoch": 2.209572742022715, + "grad_norm": 1.2796032428741455, + "learning_rate": 1.3112311882191699e-05, + "loss": 1.853, + "mean_token_accuracy": 0.5783206224441528, + "num_tokens": 4177039331.0, + "step": 8171 + }, + { + "epoch": 2.2098431584640346, + "grad_norm": 1.9016072750091553, + "learning_rate": 1.31107793958205e-05, + "loss": 1.9752, + "mean_token_accuracy": 0.5403062105178833, + "num_tokens": 4177563460.0, + "step": 8172 + }, + { + "epoch": 2.210113574905354, + "grad_norm": 1.623225212097168, + "learning_rate": 1.3109246844684383e-05, + "loss": 1.7702, + "mean_token_accuracy": 0.5881778001785278, + "num_tokens": 4178087621.0, + "step": 8173 + }, + { + "epoch": 2.210383991346674, + "grad_norm": 1.7008535861968994, + "learning_rate": 1.310771422883037e-05, + "loss": 1.9508, + "mean_token_accuracy": 0.5730193853378296, + "num_tokens": 4178611845.0, + "step": 8174 + }, + { + "epoch": 2.2106544077879935, + "grad_norm": 1.3853121995925903, + "learning_rate": 1.3106181548305484e-05, + "loss": 1.9649, + "mean_token_accuracy": 0.5495680570602417, + "num_tokens": 4179135929.0, + "step": 8175 + }, + { + "epoch": 2.210924824229313, + "grad_norm": 1.0344293117523193, + "learning_rate": 1.3104648803156754e-05, + "loss": 1.5452, + "mean_token_accuracy": 0.6365272998809814, + "num_tokens": 4179660069.0, + "step": 8176 + }, + { + "epoch": 2.2111952406706328, + "grad_norm": 1.534788966178894, + "learning_rate": 1.310311599343121e-05, + "loss": 1.865, + "mean_token_accuracy": 0.5590029954910278, + "num_tokens": 4180184277.0, + "step": 8177 + }, + { + "epoch": 2.2114656571119524, + "grad_norm": 1.3481823205947876, + "learning_rate": 1.3101583119175882e-05, + "loss": 1.8767, + "mean_token_accuracy": 0.5554187893867493, + "num_tokens": 4180708446.0, + "step": 8178 + }, + { + "epoch": 2.211736073553272, + "grad_norm": 1.3865926265716553, + "learning_rate": 1.3100050180437805e-05, + "loss": 1.9516, + "mean_token_accuracy": 0.5614598393440247, + "num_tokens": 4181232731.0, + "step": 8179 + }, + { + "epoch": 2.2120064899945917, + "grad_norm": 1.5988683700561523, + "learning_rate": 1.3098517177264013e-05, + "loss": 1.9808, + "mean_token_accuracy": 0.5457922220230103, + "num_tokens": 4181742816.0, + "step": 8180 + }, + { + "epoch": 2.2122769064359114, + "grad_norm": 0.49548569321632385, + "learning_rate": 1.309698410970154e-05, + "loss": 1.03, + "mean_token_accuracy": 0.7255919575691223, + "num_tokens": 4182188562.0, + "step": 8181 + }, + { + "epoch": 2.212547322877231, + "grad_norm": 1.6678698062896729, + "learning_rate": 1.309545097779743e-05, + "loss": 1.9076, + "mean_token_accuracy": 0.5637243986129761, + "num_tokens": 4182676819.0, + "step": 8182 + }, + { + "epoch": 2.2128177393185506, + "grad_norm": 1.583806037902832, + "learning_rate": 1.3093917781598722e-05, + "loss": 1.9076, + "mean_token_accuracy": 0.5689541697502136, + "num_tokens": 4183201099.0, + "step": 8183 + }, + { + "epoch": 2.2130881557598703, + "grad_norm": 1.2907254695892334, + "learning_rate": 1.3092384521152457e-05, + "loss": 1.9091, + "mean_token_accuracy": 0.5640401244163513, + "num_tokens": 4183701202.0, + "step": 8184 + }, + { + "epoch": 2.21335857220119, + "grad_norm": 1.2848237752914429, + "learning_rate": 1.3090851196505685e-05, + "loss": 1.8396, + "mean_token_accuracy": 0.5958292484283447, + "num_tokens": 4184162941.0, + "step": 8185 + }, + { + "epoch": 2.2136289886425096, + "grad_norm": 1.3553528785705566, + "learning_rate": 1.3089317807705445e-05, + "loss": 1.8937, + "mean_token_accuracy": 0.5471087694168091, + "num_tokens": 4184687181.0, + "step": 8186 + }, + { + "epoch": 2.213899405083829, + "grad_norm": 1.1899745464324951, + "learning_rate": 1.3087784354798791e-05, + "loss": 1.7544, + "mean_token_accuracy": 0.5786614418029785, + "num_tokens": 4185211434.0, + "step": 8187 + }, + { + "epoch": 2.214169821525149, + "grad_norm": 1.2925734519958496, + "learning_rate": 1.3086250837832774e-05, + "loss": 2.0761, + "mean_token_accuracy": 0.5583614706993103, + "num_tokens": 4185672023.0, + "step": 8188 + }, + { + "epoch": 2.2144402379664685, + "grad_norm": 1.4058805704116821, + "learning_rate": 1.3084717256854447e-05, + "loss": 1.9256, + "mean_token_accuracy": 0.568018913269043, + "num_tokens": 4186196122.0, + "step": 8189 + }, + { + "epoch": 2.214710654407788, + "grad_norm": 1.1377930641174316, + "learning_rate": 1.308318361191086e-05, + "loss": 1.8823, + "mean_token_accuracy": 0.5718091726303101, + "num_tokens": 4186720369.0, + "step": 8190 + }, + { + "epoch": 2.214981070849108, + "grad_norm": 1.3247828483581543, + "learning_rate": 1.3081649903049084e-05, + "loss": 2.0333, + "mean_token_accuracy": 0.5405163764953613, + "num_tokens": 4187244634.0, + "step": 8191 + }, + { + "epoch": 2.2152514872904274, + "grad_norm": 1.1176151037216187, + "learning_rate": 1.3080116130316157e-05, + "loss": 1.7708, + "mean_token_accuracy": 0.5867024660110474, + "num_tokens": 4187706201.0, + "step": 8192 + }, + { + "epoch": 2.215521903731747, + "grad_norm": 1.0859222412109375, + "learning_rate": 1.307858229375915e-05, + "loss": 1.8132, + "mean_token_accuracy": 0.5857741236686707, + "num_tokens": 4188230337.0, + "step": 8193 + }, + { + "epoch": 2.2157923201730667, + "grad_norm": 1.4836959838867188, + "learning_rate": 1.3077048393425128e-05, + "loss": 1.9982, + "mean_token_accuracy": 0.5533192157745361, + "num_tokens": 4188754590.0, + "step": 8194 + }, + { + "epoch": 2.2160627366143864, + "grad_norm": 1.0885214805603027, + "learning_rate": 1.3075514429361156e-05, + "loss": 1.8762, + "mean_token_accuracy": 0.5735320448875427, + "num_tokens": 4189278820.0, + "step": 8195 + }, + { + "epoch": 2.2163331530557056, + "grad_norm": 1.4487489461898804, + "learning_rate": 1.3073980401614295e-05, + "loss": 1.9711, + "mean_token_accuracy": 0.5609511137008667, + "num_tokens": 4189777567.0, + "step": 8196 + }, + { + "epoch": 2.2166035694970256, + "grad_norm": 1.6652638912200928, + "learning_rate": 1.3072446310231619e-05, + "loss": 1.9602, + "mean_token_accuracy": 0.5588949918746948, + "num_tokens": 4190301796.0, + "step": 8197 + }, + { + "epoch": 2.216873985938345, + "grad_norm": 1.1002535820007324, + "learning_rate": 1.3070912155260192e-05, + "loss": 1.903, + "mean_token_accuracy": 0.5624758005142212, + "num_tokens": 4190825900.0, + "step": 8198 + }, + { + "epoch": 2.2171444023796645, + "grad_norm": 1.3799513578414917, + "learning_rate": 1.3069377936747087e-05, + "loss": 1.9736, + "mean_token_accuracy": 0.583407998085022, + "num_tokens": 4191285958.0, + "step": 8199 + }, + { + "epoch": 2.217414818820984, + "grad_norm": 1.3049319982528687, + "learning_rate": 1.3067843654739385e-05, + "loss": 1.913, + "mean_token_accuracy": 0.5580338835716248, + "num_tokens": 4191778091.0, + "step": 8200 + }, + { + "epoch": 2.2176852352623038, + "grad_norm": 0.5692897439002991, + "learning_rate": 1.306630930928416e-05, + "loss": 1.1403, + "mean_token_accuracy": 0.701663076877594, + "num_tokens": 4192302066.0, + "step": 8201 + }, + { + "epoch": 2.2179556517036234, + "grad_norm": 1.555732011795044, + "learning_rate": 1.3064774900428486e-05, + "loss": 2.0306, + "mean_token_accuracy": 0.53365159034729, + "num_tokens": 4192826315.0, + "step": 8202 + }, + { + "epoch": 2.218226068144943, + "grad_norm": 1.378222107887268, + "learning_rate": 1.3063240428219446e-05, + "loss": 1.9248, + "mean_token_accuracy": 0.5681856870651245, + "num_tokens": 4193348914.0, + "step": 8203 + }, + { + "epoch": 2.2184964845862627, + "grad_norm": 1.2220889329910278, + "learning_rate": 1.3061705892704126e-05, + "loss": 1.9299, + "mean_token_accuracy": 0.5752984285354614, + "num_tokens": 4193812507.0, + "step": 8204 + }, + { + "epoch": 2.2187669010275823, + "grad_norm": 1.1121586561203003, + "learning_rate": 1.3060171293929604e-05, + "loss": 1.9553, + "mean_token_accuracy": 0.5531355738639832, + "num_tokens": 4194336525.0, + "step": 8205 + }, + { + "epoch": 2.219037317468902, + "grad_norm": 1.1061584949493408, + "learning_rate": 1.3058636631942965e-05, + "loss": 1.9238, + "mean_token_accuracy": 0.5626147389411926, + "num_tokens": 4194860717.0, + "step": 8206 + }, + { + "epoch": 2.2193077339102216, + "grad_norm": 1.2610149383544922, + "learning_rate": 1.3057101906791303e-05, + "loss": 1.889, + "mean_token_accuracy": 0.5738905668258667, + "num_tokens": 4195347823.0, + "step": 8207 + }, + { + "epoch": 2.2195781503515413, + "grad_norm": 1.402933120727539, + "learning_rate": 1.3055567118521704e-05, + "loss": 1.9318, + "mean_token_accuracy": 0.5608123540878296, + "num_tokens": 4195871977.0, + "step": 8208 + }, + { + "epoch": 2.219848566792861, + "grad_norm": 1.1581778526306152, + "learning_rate": 1.3054032267181261e-05, + "loss": 1.8521, + "mean_token_accuracy": 0.583389401435852, + "num_tokens": 4196366981.0, + "step": 8209 + }, + { + "epoch": 2.2201189832341806, + "grad_norm": 1.2198336124420166, + "learning_rate": 1.3052497352817069e-05, + "loss": 1.8838, + "mean_token_accuracy": 0.5587913990020752, + "num_tokens": 4196891203.0, + "step": 8210 + }, + { + "epoch": 2.2203893996755, + "grad_norm": 1.3598591089248657, + "learning_rate": 1.3050962375476221e-05, + "loss": 1.8607, + "mean_token_accuracy": 0.5885503888130188, + "num_tokens": 4197415460.0, + "step": 8211 + }, + { + "epoch": 2.22065981611682, + "grad_norm": 1.3560012578964233, + "learning_rate": 1.3049427335205818e-05, + "loss": 1.99, + "mean_token_accuracy": 0.5655525922775269, + "num_tokens": 4197939498.0, + "step": 8212 + }, + { + "epoch": 2.2209302325581395, + "grad_norm": 1.6155791282653809, + "learning_rate": 1.3047892232052954e-05, + "loss": 1.9823, + "mean_token_accuracy": 0.5736287832260132, + "num_tokens": 4198404052.0, + "step": 8213 + }, + { + "epoch": 2.221200648999459, + "grad_norm": 1.124234676361084, + "learning_rate": 1.3046357066064735e-05, + "loss": 1.9118, + "mean_token_accuracy": 0.5523207783699036, + "num_tokens": 4198928264.0, + "step": 8214 + }, + { + "epoch": 2.2214710654407788, + "grad_norm": 1.555478572845459, + "learning_rate": 1.3044821837288262e-05, + "loss": 1.9267, + "mean_token_accuracy": 0.5559618473052979, + "num_tokens": 4199417354.0, + "step": 8215 + }, + { + "epoch": 2.2217414818820984, + "grad_norm": 1.583222508430481, + "learning_rate": 1.3043286545770644e-05, + "loss": 1.9235, + "mean_token_accuracy": 0.5630348920822144, + "num_tokens": 4199891156.0, + "step": 8216 + }, + { + "epoch": 2.222011898323418, + "grad_norm": 1.0968737602233887, + "learning_rate": 1.3041751191558985e-05, + "loss": 1.7595, + "mean_token_accuracy": 0.5890430808067322, + "num_tokens": 4200384388.0, + "step": 8217 + }, + { + "epoch": 2.2222823147647377, + "grad_norm": 1.5766289234161377, + "learning_rate": 1.3040215774700396e-05, + "loss": 1.8134, + "mean_token_accuracy": 0.5760860443115234, + "num_tokens": 4200908575.0, + "step": 8218 + }, + { + "epoch": 2.2225527312060573, + "grad_norm": 1.0154794454574585, + "learning_rate": 1.3038680295241983e-05, + "loss": 1.7692, + "mean_token_accuracy": 0.5890101194381714, + "num_tokens": 4201393757.0, + "step": 8219 + }, + { + "epoch": 2.222823147647377, + "grad_norm": 1.0037775039672852, + "learning_rate": 1.303714475323087e-05, + "loss": 1.8266, + "mean_token_accuracy": 0.5744898319244385, + "num_tokens": 4201917981.0, + "step": 8220 + }, + { + "epoch": 2.2230935640886966, + "grad_norm": 0.5394905209541321, + "learning_rate": 1.3035609148714156e-05, + "loss": 1.1612, + "mean_token_accuracy": 0.6993415355682373, + "num_tokens": 4202442177.0, + "step": 8221 + }, + { + "epoch": 2.2233639805300163, + "grad_norm": 1.4501477479934692, + "learning_rate": 1.3034073481738973e-05, + "loss": 1.8618, + "mean_token_accuracy": 0.5898759365081787, + "num_tokens": 4202903407.0, + "step": 8222 + }, + { + "epoch": 2.223634396971336, + "grad_norm": 1.3171948194503784, + "learning_rate": 1.3032537752352435e-05, + "loss": 1.9288, + "mean_token_accuracy": 0.5725046992301941, + "num_tokens": 4203384625.0, + "step": 8223 + }, + { + "epoch": 2.2239048134126556, + "grad_norm": 1.381128191947937, + "learning_rate": 1.303100196060166e-05, + "loss": 1.9585, + "mean_token_accuracy": 0.5309476852416992, + "num_tokens": 4203908709.0, + "step": 8224 + }, + { + "epoch": 2.224175229853975, + "grad_norm": 1.2731009721755981, + "learning_rate": 1.3029466106533773e-05, + "loss": 1.9176, + "mean_token_accuracy": 0.5606016516685486, + "num_tokens": 4204407619.0, + "step": 8225 + }, + { + "epoch": 2.224445646295295, + "grad_norm": 1.0690743923187256, + "learning_rate": 1.3027930190195895e-05, + "loss": 1.9319, + "mean_token_accuracy": 0.5490855574607849, + "num_tokens": 4204931847.0, + "step": 8226 + }, + { + "epoch": 2.2247160627366145, + "grad_norm": 1.1721796989440918, + "learning_rate": 1.3026394211635158e-05, + "loss": 1.8684, + "mean_token_accuracy": 0.566612958908081, + "num_tokens": 4205456063.0, + "step": 8227 + }, + { + "epoch": 2.224986479177934, + "grad_norm": 1.2395496368408203, + "learning_rate": 1.3024858170898682e-05, + "loss": 1.818, + "mean_token_accuracy": 0.5644160509109497, + "num_tokens": 4205980330.0, + "step": 8228 + }, + { + "epoch": 2.2252568956192538, + "grad_norm": 1.2838789224624634, + "learning_rate": 1.3023322068033608e-05, + "loss": 1.8923, + "mean_token_accuracy": 0.5462796688079834, + "num_tokens": 4206504611.0, + "step": 8229 + }, + { + "epoch": 2.2255273120605734, + "grad_norm": 1.2161805629730225, + "learning_rate": 1.302178590308706e-05, + "loss": 1.9649, + "mean_token_accuracy": 0.549466073513031, + "num_tokens": 4207028806.0, + "step": 8230 + }, + { + "epoch": 2.225797728501893, + "grad_norm": 1.0058223009109497, + "learning_rate": 1.3020249676106176e-05, + "loss": 1.8512, + "mean_token_accuracy": 0.5769102573394775, + "num_tokens": 4207534404.0, + "step": 8231 + }, + { + "epoch": 2.2260681449432127, + "grad_norm": 1.1989266872406006, + "learning_rate": 1.3018713387138091e-05, + "loss": 1.9305, + "mean_token_accuracy": 0.5449748039245605, + "num_tokens": 4208058672.0, + "step": 8232 + }, + { + "epoch": 2.2263385613845323, + "grad_norm": 1.1153627634048462, + "learning_rate": 1.3017177036229942e-05, + "loss": 1.9181, + "mean_token_accuracy": 0.5682607889175415, + "num_tokens": 4208579894.0, + "step": 8233 + }, + { + "epoch": 2.226608977825852, + "grad_norm": 1.0860803127288818, + "learning_rate": 1.301564062342887e-05, + "loss": 1.9131, + "mean_token_accuracy": 0.5666787624359131, + "num_tokens": 4209104173.0, + "step": 8234 + }, + { + "epoch": 2.2268793942671716, + "grad_norm": 1.22254478931427, + "learning_rate": 1.3014104148782017e-05, + "loss": 2.0619, + "mean_token_accuracy": 0.5390294790267944, + "num_tokens": 4209628365.0, + "step": 8235 + }, + { + "epoch": 2.2271498107084913, + "grad_norm": 1.1776683330535889, + "learning_rate": 1.3012567612336522e-05, + "loss": 1.8353, + "mean_token_accuracy": 0.5716402530670166, + "num_tokens": 4210152563.0, + "step": 8236 + }, + { + "epoch": 2.227420227149811, + "grad_norm": 1.2264182567596436, + "learning_rate": 1.3011031014139537e-05, + "loss": 1.9089, + "mean_token_accuracy": 0.575629472732544, + "num_tokens": 4210647647.0, + "step": 8237 + }, + { + "epoch": 2.2276906435911306, + "grad_norm": 1.036281943321228, + "learning_rate": 1.3009494354238203e-05, + "loss": 1.8737, + "mean_token_accuracy": 0.5490994453430176, + "num_tokens": 4211171866.0, + "step": 8238 + }, + { + "epoch": 2.2279610600324498, + "grad_norm": 1.198648452758789, + "learning_rate": 1.3007957632679675e-05, + "loss": 1.9818, + "mean_token_accuracy": 0.5485597848892212, + "num_tokens": 4211643076.0, + "step": 8239 + }, + { + "epoch": 2.2282314764737694, + "grad_norm": 1.1391828060150146, + "learning_rate": 1.3006420849511106e-05, + "loss": 1.9212, + "mean_token_accuracy": 0.5586128830909729, + "num_tokens": 4212167346.0, + "step": 8240 + }, + { + "epoch": 2.228501892915089, + "grad_norm": 0.6498465538024902, + "learning_rate": 1.3004884004779639e-05, + "loss": 1.099, + "mean_token_accuracy": 0.6999359130859375, + "num_tokens": 4212691588.0, + "step": 8241 + }, + { + "epoch": 2.2287723093564087, + "grad_norm": 1.4971106052398682, + "learning_rate": 1.3003347098532437e-05, + "loss": 1.9205, + "mean_token_accuracy": 0.5742325186729431, + "num_tokens": 4213214843.0, + "step": 8242 + }, + { + "epoch": 2.2290427257977283, + "grad_norm": 1.4423996210098267, + "learning_rate": 1.3001810130816655e-05, + "loss": 1.8272, + "mean_token_accuracy": 0.5758976936340332, + "num_tokens": 4213645914.0, + "step": 8243 + }, + { + "epoch": 2.229313142239048, + "grad_norm": 1.1490962505340576, + "learning_rate": 1.3000273101679449e-05, + "loss": 1.9265, + "mean_token_accuracy": 0.5500521659851074, + "num_tokens": 4214169966.0, + "step": 8244 + }, + { + "epoch": 2.2295835586803676, + "grad_norm": 1.350866436958313, + "learning_rate": 1.2998736011167984e-05, + "loss": 2.0127, + "mean_token_accuracy": 0.5415053367614746, + "num_tokens": 4214694245.0, + "step": 8245 + }, + { + "epoch": 2.2298539751216873, + "grad_norm": 1.3042781352996826, + "learning_rate": 1.299719885932942e-05, + "loss": 1.9983, + "mean_token_accuracy": 0.5628823041915894, + "num_tokens": 4215177119.0, + "step": 8246 + }, + { + "epoch": 2.230124391563007, + "grad_norm": 1.1654857397079468, + "learning_rate": 1.2995661646210922e-05, + "loss": 1.8439, + "mean_token_accuracy": 0.5577383637428284, + "num_tokens": 4215701369.0, + "step": 8247 + }, + { + "epoch": 2.2303948080043265, + "grad_norm": 1.2837547063827515, + "learning_rate": 1.2994124371859653e-05, + "loss": 1.9758, + "mean_token_accuracy": 0.5515388250350952, + "num_tokens": 4216174664.0, + "step": 8248 + }, + { + "epoch": 2.230665224445646, + "grad_norm": 1.141121745109558, + "learning_rate": 1.2992587036322788e-05, + "loss": 1.7229, + "mean_token_accuracy": 0.6147659420967102, + "num_tokens": 4216698876.0, + "step": 8249 + }, + { + "epoch": 2.230935640886966, + "grad_norm": 1.2259182929992676, + "learning_rate": 1.2991049639647488e-05, + "loss": 1.8899, + "mean_token_accuracy": 0.5664716362953186, + "num_tokens": 4217223057.0, + "step": 8250 + }, + { + "epoch": 2.2312060573282855, + "grad_norm": 1.064316749572754, + "learning_rate": 1.2989512181880935e-05, + "loss": 1.9827, + "mean_token_accuracy": 0.558833658695221, + "num_tokens": 4217747193.0, + "step": 8251 + }, + { + "epoch": 2.231476473769605, + "grad_norm": 1.1542892456054688, + "learning_rate": 1.2987974663070295e-05, + "loss": 2.0407, + "mean_token_accuracy": 0.5472184419631958, + "num_tokens": 4218271463.0, + "step": 8252 + }, + { + "epoch": 2.2317468902109248, + "grad_norm": 1.2574143409729004, + "learning_rate": 1.2986437083262747e-05, + "loss": 1.77, + "mean_token_accuracy": 0.586820125579834, + "num_tokens": 4218731984.0, + "step": 8253 + }, + { + "epoch": 2.2320173066522444, + "grad_norm": 1.1652717590332031, + "learning_rate": 1.2984899442505465e-05, + "loss": 1.9506, + "mean_token_accuracy": 0.5630469918251038, + "num_tokens": 4219256268.0, + "step": 8254 + }, + { + "epoch": 2.232287723093564, + "grad_norm": 1.4683095216751099, + "learning_rate": 1.2983361740845634e-05, + "loss": 1.9042, + "mean_token_accuracy": 0.5661734938621521, + "num_tokens": 4219780532.0, + "step": 8255 + }, + { + "epoch": 2.2325581395348837, + "grad_norm": 1.6786850690841675, + "learning_rate": 1.2981823978330433e-05, + "loss": 2.0413, + "mean_token_accuracy": 0.5505362749099731, + "num_tokens": 4220304753.0, + "step": 8256 + }, + { + "epoch": 2.2328285559762033, + "grad_norm": 1.3894540071487427, + "learning_rate": 1.2980286155007038e-05, + "loss": 1.7787, + "mean_token_accuracy": 0.6095712780952454, + "num_tokens": 4220829015.0, + "step": 8257 + }, + { + "epoch": 2.233098972417523, + "grad_norm": 1.2187676429748535, + "learning_rate": 1.2978748270922646e-05, + "loss": 1.975, + "mean_token_accuracy": 0.5625728368759155, + "num_tokens": 4221261853.0, + "step": 8258 + }, + { + "epoch": 2.2333693888588426, + "grad_norm": 1.557077407836914, + "learning_rate": 1.2977210326124436e-05, + "loss": 2.068, + "mean_token_accuracy": 0.5433052182197571, + "num_tokens": 4221786031.0, + "step": 8259 + }, + { + "epoch": 2.2336398053001623, + "grad_norm": 1.0873194932937622, + "learning_rate": 1.2975672320659598e-05, + "loss": 1.8633, + "mean_token_accuracy": 0.5570138692855835, + "num_tokens": 4222310200.0, + "step": 8260 + }, + { + "epoch": 2.233910221741482, + "grad_norm": 0.6387686729431152, + "learning_rate": 1.2974134254575323e-05, + "loss": 1.0897, + "mean_token_accuracy": 0.7108978033065796, + "num_tokens": 4222818939.0, + "step": 8261 + }, + { + "epoch": 2.2341806381828015, + "grad_norm": 1.8277775049209595, + "learning_rate": 1.2972596127918806e-05, + "loss": 2.0103, + "mean_token_accuracy": 0.5347781777381897, + "num_tokens": 4223306253.0, + "step": 8262 + }, + { + "epoch": 2.234451054624121, + "grad_norm": 1.6885528564453125, + "learning_rate": 1.2971057940737235e-05, + "loss": 1.9689, + "mean_token_accuracy": 0.5528790950775146, + "num_tokens": 4223830406.0, + "step": 8263 + }, + { + "epoch": 2.234721471065441, + "grad_norm": 1.0333616733551025, + "learning_rate": 1.2969519693077813e-05, + "loss": 1.8292, + "mean_token_accuracy": 0.5903806686401367, + "num_tokens": 4224354627.0, + "step": 8264 + }, + { + "epoch": 2.2349918875067605, + "grad_norm": 1.7148311138153076, + "learning_rate": 1.2967981384987733e-05, + "loss": 1.8833, + "mean_token_accuracy": 0.5572496652603149, + "num_tokens": 4224878894.0, + "step": 8265 + }, + { + "epoch": 2.23526230394808, + "grad_norm": 1.8827359676361084, + "learning_rate": 1.2966443016514201e-05, + "loss": 1.9298, + "mean_token_accuracy": 0.5628108978271484, + "num_tokens": 4225401995.0, + "step": 8266 + }, + { + "epoch": 2.2355327203893998, + "grad_norm": 1.3256028890609741, + "learning_rate": 1.2964904587704408e-05, + "loss": 1.8737, + "mean_token_accuracy": 0.5807981491088867, + "num_tokens": 4225891369.0, + "step": 8267 + }, + { + "epoch": 2.2358031368307194, + "grad_norm": 1.7677122354507446, + "learning_rate": 1.2963366098605566e-05, + "loss": 1.9045, + "mean_token_accuracy": 0.5554138422012329, + "num_tokens": 4226397050.0, + "step": 8268 + }, + { + "epoch": 2.236073553272039, + "grad_norm": 1.9020473957061768, + "learning_rate": 1.296182754926488e-05, + "loss": 1.9397, + "mean_token_accuracy": 0.560312807559967, + "num_tokens": 4226921317.0, + "step": 8269 + }, + { + "epoch": 2.2363439697133587, + "grad_norm": 1.2714879512786865, + "learning_rate": 1.2960288939729555e-05, + "loss": 1.728, + "mean_token_accuracy": 0.5794507265090942, + "num_tokens": 4227445570.0, + "step": 8270 + }, + { + "epoch": 2.2366143861546783, + "grad_norm": 1.5691781044006348, + "learning_rate": 1.2958750270046797e-05, + "loss": 1.8787, + "mean_token_accuracy": 0.5695590972900391, + "num_tokens": 4227969848.0, + "step": 8271 + }, + { + "epoch": 2.236884802595998, + "grad_norm": 2.1592955589294434, + "learning_rate": 1.2957211540263823e-05, + "loss": 2.0174, + "mean_token_accuracy": 0.5557855367660522, + "num_tokens": 4228494124.0, + "step": 8272 + }, + { + "epoch": 2.2371552190373176, + "grad_norm": 1.422646403312683, + "learning_rate": 1.2955672750427846e-05, + "loss": 2.0481, + "mean_token_accuracy": 0.5410188436508179, + "num_tokens": 4229009413.0, + "step": 8273 + }, + { + "epoch": 2.2374256354786373, + "grad_norm": 1.3205517530441284, + "learning_rate": 1.2954133900586074e-05, + "loss": 1.864, + "mean_token_accuracy": 0.5750299692153931, + "num_tokens": 4229525278.0, + "step": 8274 + }, + { + "epoch": 2.237696051919957, + "grad_norm": 1.6358178853988647, + "learning_rate": 1.2952594990785727e-05, + "loss": 1.9126, + "mean_token_accuracy": 0.5570040345191956, + "num_tokens": 4230049553.0, + "step": 8275 + }, + { + "epoch": 2.2379664683612766, + "grad_norm": 1.6266415119171143, + "learning_rate": 1.2951056021074029e-05, + "loss": 1.8852, + "mean_token_accuracy": 0.5542510747909546, + "num_tokens": 4230573713.0, + "step": 8276 + }, + { + "epoch": 2.238236884802596, + "grad_norm": 1.4459939002990723, + "learning_rate": 1.294951699149819e-05, + "loss": 1.8408, + "mean_token_accuracy": 0.5545656681060791, + "num_tokens": 4231097940.0, + "step": 8277 + }, + { + "epoch": 2.238507301243916, + "grad_norm": 1.4556469917297363, + "learning_rate": 1.2947977902105433e-05, + "loss": 1.9581, + "mean_token_accuracy": 0.555062472820282, + "num_tokens": 4231622158.0, + "step": 8278 + }, + { + "epoch": 2.2387777176852355, + "grad_norm": 1.4249187707901, + "learning_rate": 1.2946438752942992e-05, + "loss": 1.8739, + "mean_token_accuracy": 0.5697653889656067, + "num_tokens": 4232146427.0, + "step": 8279 + }, + { + "epoch": 2.2390481341265547, + "grad_norm": 1.3858603239059448, + "learning_rate": 1.2944899544058082e-05, + "loss": 1.8932, + "mean_token_accuracy": 0.573523998260498, + "num_tokens": 4232670610.0, + "step": 8280 + }, + { + "epoch": 2.2393185505678743, + "grad_norm": 0.5099235773086548, + "learning_rate": 1.2943360275497936e-05, + "loss": 1.1662, + "mean_token_accuracy": 0.6822066903114319, + "num_tokens": 4233194814.0, + "step": 8281 + }, + { + "epoch": 2.239588967009194, + "grad_norm": 1.8049275875091553, + "learning_rate": 1.294182094730978e-05, + "loss": 1.87, + "mean_token_accuracy": 0.5617690086364746, + "num_tokens": 4233682366.0, + "step": 8282 + }, + { + "epoch": 2.2398593834505136, + "grad_norm": 1.5446465015411377, + "learning_rate": 1.2940281559540851e-05, + "loss": 1.869, + "mean_token_accuracy": 0.5641367435455322, + "num_tokens": 4234206636.0, + "step": 8283 + }, + { + "epoch": 2.2401297998918333, + "grad_norm": 1.3300493955612183, + "learning_rate": 1.2938742112238374e-05, + "loss": 1.9525, + "mean_token_accuracy": 0.5409917831420898, + "num_tokens": 4234730913.0, + "step": 8284 + }, + { + "epoch": 2.240400216333153, + "grad_norm": 1.1103070974349976, + "learning_rate": 1.2937202605449589e-05, + "loss": 1.9351, + "mean_token_accuracy": 0.5471078753471375, + "num_tokens": 4235254998.0, + "step": 8285 + }, + { + "epoch": 2.2406706327744725, + "grad_norm": 1.4613096714019775, + "learning_rate": 1.293566303922173e-05, + "loss": 1.8459, + "mean_token_accuracy": 0.5858023166656494, + "num_tokens": 4235742636.0, + "step": 8286 + }, + { + "epoch": 2.240941049215792, + "grad_norm": 1.5979421138763428, + "learning_rate": 1.2934123413602036e-05, + "loss": 1.8828, + "mean_token_accuracy": 0.5855076909065247, + "num_tokens": 4236224008.0, + "step": 8287 + }, + { + "epoch": 2.241211465657112, + "grad_norm": 1.1068533658981323, + "learning_rate": 1.2932583728637749e-05, + "loss": 1.9621, + "mean_token_accuracy": 0.5560474395751953, + "num_tokens": 4236748184.0, + "step": 8288 + }, + { + "epoch": 2.2414818820984315, + "grad_norm": 1.1972081661224365, + "learning_rate": 1.293104398437611e-05, + "loss": 1.7999, + "mean_token_accuracy": 0.5959382653236389, + "num_tokens": 4237272397.0, + "step": 8289 + }, + { + "epoch": 2.241752298539751, + "grad_norm": 1.4797327518463135, + "learning_rate": 1.2929504180864363e-05, + "loss": 1.8832, + "mean_token_accuracy": 0.5692770481109619, + "num_tokens": 4237727829.0, + "step": 8290 + }, + { + "epoch": 2.2420227149810708, + "grad_norm": 1.031281590461731, + "learning_rate": 1.2927964318149751e-05, + "loss": 1.9816, + "mean_token_accuracy": 0.5476232767105103, + "num_tokens": 4238252001.0, + "step": 8291 + }, + { + "epoch": 2.2422931314223904, + "grad_norm": 1.1085550785064697, + "learning_rate": 1.2926424396279527e-05, + "loss": 1.9768, + "mean_token_accuracy": 0.5374661087989807, + "num_tokens": 4238776242.0, + "step": 8292 + }, + { + "epoch": 2.24256354786371, + "grad_norm": 1.3259668350219727, + "learning_rate": 1.2924884415300933e-05, + "loss": 1.8896, + "mean_token_accuracy": 0.5604331493377686, + "num_tokens": 4239300430.0, + "step": 8293 + }, + { + "epoch": 2.2428339643050297, + "grad_norm": 1.1923362016677856, + "learning_rate": 1.292334437526123e-05, + "loss": 2.0404, + "mean_token_accuracy": 0.5480166673660278, + "num_tokens": 4239824616.0, + "step": 8294 + }, + { + "epoch": 2.2431043807463493, + "grad_norm": 1.4034327268600464, + "learning_rate": 1.292180427620766e-05, + "loss": 1.8689, + "mean_token_accuracy": 0.587444543838501, + "num_tokens": 4240348780.0, + "step": 8295 + }, + { + "epoch": 2.243374797187669, + "grad_norm": 1.1104190349578857, + "learning_rate": 1.2920264118187492e-05, + "loss": 1.8862, + "mean_token_accuracy": 0.5706323385238647, + "num_tokens": 4240872922.0, + "step": 8296 + }, + { + "epoch": 2.2436452136289886, + "grad_norm": 1.4576746225357056, + "learning_rate": 1.2918723901247965e-05, + "loss": 2.0082, + "mean_token_accuracy": 0.5409865379333496, + "num_tokens": 4241397139.0, + "step": 8297 + }, + { + "epoch": 2.2439156300703083, + "grad_norm": 1.5373992919921875, + "learning_rate": 1.2917183625436347e-05, + "loss": 1.8721, + "mean_token_accuracy": 0.5735929012298584, + "num_tokens": 4241921423.0, + "step": 8298 + }, + { + "epoch": 2.244186046511628, + "grad_norm": 1.1069960594177246, + "learning_rate": 1.29156432907999e-05, + "loss": 1.9124, + "mean_token_accuracy": 0.5598156452178955, + "num_tokens": 4242392696.0, + "step": 8299 + }, + { + "epoch": 2.2444564629529475, + "grad_norm": 1.0864379405975342, + "learning_rate": 1.2914102897385882e-05, + "loss": 1.7562, + "mean_token_accuracy": 0.5841059684753418, + "num_tokens": 4242855879.0, + "step": 8300 + }, + { + "epoch": 2.244726879394267, + "grad_norm": 0.5370120406150818, + "learning_rate": 1.291256244524156e-05, + "loss": 1.061, + "mean_token_accuracy": 0.7228753566741943, + "num_tokens": 4243380109.0, + "step": 8301 + }, + { + "epoch": 2.244997295835587, + "grad_norm": 1.40247642993927, + "learning_rate": 1.2911021934414196e-05, + "loss": 1.8535, + "mean_token_accuracy": 0.5646591186523438, + "num_tokens": 4243885640.0, + "step": 8302 + }, + { + "epoch": 2.2452677122769065, + "grad_norm": 1.22743558883667, + "learning_rate": 1.2909481364951057e-05, + "loss": 1.7979, + "mean_token_accuracy": 0.5953467488288879, + "num_tokens": 4244409862.0, + "step": 8303 + }, + { + "epoch": 2.245538128718226, + "grad_norm": 1.0691349506378174, + "learning_rate": 1.2907940736899418e-05, + "loss": 1.8856, + "mean_token_accuracy": 0.5808137059211731, + "num_tokens": 4244933983.0, + "step": 8304 + }, + { + "epoch": 2.2458085451595458, + "grad_norm": 1.321863055229187, + "learning_rate": 1.2906400050306544e-05, + "loss": 1.8725, + "mean_token_accuracy": 0.5782492756843567, + "num_tokens": 4245424987.0, + "step": 8305 + }, + { + "epoch": 2.2460789616008654, + "grad_norm": 1.376021146774292, + "learning_rate": 1.2904859305219712e-05, + "loss": 1.9487, + "mean_token_accuracy": 0.5706652402877808, + "num_tokens": 4245949223.0, + "step": 8306 + }, + { + "epoch": 2.246349378042185, + "grad_norm": 1.2293113470077515, + "learning_rate": 1.2903318501686193e-05, + "loss": 1.8821, + "mean_token_accuracy": 0.5544213056564331, + "num_tokens": 4246473472.0, + "step": 8307 + }, + { + "epoch": 2.2466197944835047, + "grad_norm": 1.216378927230835, + "learning_rate": 1.2901777639753266e-05, + "loss": 1.8267, + "mean_token_accuracy": 0.5722028017044067, + "num_tokens": 4246959493.0, + "step": 8308 + }, + { + "epoch": 2.2468902109248243, + "grad_norm": 1.0981557369232178, + "learning_rate": 1.2900236719468209e-05, + "loss": 1.9251, + "mean_token_accuracy": 0.5563353300094604, + "num_tokens": 4247457244.0, + "step": 8309 + }, + { + "epoch": 2.247160627366144, + "grad_norm": 1.39890718460083, + "learning_rate": 1.28986957408783e-05, + "loss": 1.8852, + "mean_token_accuracy": 0.5629903078079224, + "num_tokens": 4247981407.0, + "step": 8310 + }, + { + "epoch": 2.2474310438074636, + "grad_norm": 1.2189537286758423, + "learning_rate": 1.2897154704030823e-05, + "loss": 1.8978, + "mean_token_accuracy": 0.5698941946029663, + "num_tokens": 4248470964.0, + "step": 8311 + }, + { + "epoch": 2.2477014602487833, + "grad_norm": 1.094701886177063, + "learning_rate": 1.2895613608973058e-05, + "loss": 1.9321, + "mean_token_accuracy": 0.565658450126648, + "num_tokens": 4248995129.0, + "step": 8312 + }, + { + "epoch": 2.247971876690103, + "grad_norm": 1.2012287378311157, + "learning_rate": 1.2894072455752295e-05, + "loss": 1.9207, + "mean_token_accuracy": 0.5681493878364563, + "num_tokens": 4249519388.0, + "step": 8313 + }, + { + "epoch": 2.2482422931314225, + "grad_norm": 1.1394047737121582, + "learning_rate": 1.2892531244415818e-05, + "loss": 1.9997, + "mean_token_accuracy": 0.5445704460144043, + "num_tokens": 4250043593.0, + "step": 8314 + }, + { + "epoch": 2.248512709572742, + "grad_norm": 1.0876352787017822, + "learning_rate": 1.2890989975010919e-05, + "loss": 1.9278, + "mean_token_accuracy": 0.5707577466964722, + "num_tokens": 4250567859.0, + "step": 8315 + }, + { + "epoch": 2.248783126014062, + "grad_norm": 1.2393293380737305, + "learning_rate": 1.2889448647584886e-05, + "loss": 2.0206, + "mean_token_accuracy": 0.5418641567230225, + "num_tokens": 4251047432.0, + "step": 8316 + }, + { + "epoch": 2.2490535424553815, + "grad_norm": 1.0706133842468262, + "learning_rate": 1.2887907262185008e-05, + "loss": 1.8621, + "mean_token_accuracy": 0.5679813623428345, + "num_tokens": 4251571582.0, + "step": 8317 + }, + { + "epoch": 2.249323958896701, + "grad_norm": 1.0338464975357056, + "learning_rate": 1.2886365818858588e-05, + "loss": 1.9776, + "mean_token_accuracy": 0.545343279838562, + "num_tokens": 4252095847.0, + "step": 8318 + }, + { + "epoch": 2.2495943753380208, + "grad_norm": 1.1806132793426514, + "learning_rate": 1.2884824317652915e-05, + "loss": 1.9002, + "mean_token_accuracy": 0.5826611518859863, + "num_tokens": 4252619977.0, + "step": 8319 + }, + { + "epoch": 2.2498647917793404, + "grad_norm": 1.3003968000411987, + "learning_rate": 1.288328275861529e-05, + "loss": 1.9779, + "mean_token_accuracy": 0.5610705614089966, + "num_tokens": 4253144179.0, + "step": 8320 + }, + { + "epoch": 2.2501352082206596, + "grad_norm": 0.5627424716949463, + "learning_rate": 1.288174114179301e-05, + "loss": 1.0743, + "mean_token_accuracy": 0.7150809168815613, + "num_tokens": 4253668445.0, + "step": 8321 + }, + { + "epoch": 2.2504056246619797, + "grad_norm": 1.3662800788879395, + "learning_rate": 1.2880199467233379e-05, + "loss": 1.7721, + "mean_token_accuracy": 0.581653356552124, + "num_tokens": 4254187206.0, + "step": 8322 + }, + { + "epoch": 2.250676041103299, + "grad_norm": 1.4916086196899414, + "learning_rate": 1.28786577349837e-05, + "loss": 1.9385, + "mean_token_accuracy": 0.5598582029342651, + "num_tokens": 4254711469.0, + "step": 8323 + }, + { + "epoch": 2.2509464575446185, + "grad_norm": 1.069946527481079, + "learning_rate": 1.2877115945091277e-05, + "loss": 1.8284, + "mean_token_accuracy": 0.5693789124488831, + "num_tokens": 4255235753.0, + "step": 8324 + }, + { + "epoch": 2.251216873985938, + "grad_norm": 1.342935562133789, + "learning_rate": 1.2875574097603413e-05, + "loss": 2.0048, + "mean_token_accuracy": 0.5381234884262085, + "num_tokens": 4255760017.0, + "step": 8325 + }, + { + "epoch": 2.251487290427258, + "grad_norm": 1.2987816333770752, + "learning_rate": 1.2874032192567423e-05, + "loss": 1.8553, + "mean_token_accuracy": 0.5755715370178223, + "num_tokens": 4256284253.0, + "step": 8326 + }, + { + "epoch": 2.2517577068685775, + "grad_norm": 1.1935492753982544, + "learning_rate": 1.2872490230030615e-05, + "loss": 1.9067, + "mean_token_accuracy": 0.5672152042388916, + "num_tokens": 4256801477.0, + "step": 8327 + }, + { + "epoch": 2.252028123309897, + "grad_norm": 1.4478733539581299, + "learning_rate": 1.2870948210040295e-05, + "loss": 1.8752, + "mean_token_accuracy": 0.558542013168335, + "num_tokens": 4257325690.0, + "step": 8328 + }, + { + "epoch": 2.2522985397512167, + "grad_norm": 1.2743115425109863, + "learning_rate": 1.286940613264379e-05, + "loss": 1.892, + "mean_token_accuracy": 0.5712947845458984, + "num_tokens": 4257849918.0, + "step": 8329 + }, + { + "epoch": 2.2525689561925364, + "grad_norm": 1.205067753791809, + "learning_rate": 1.2867863997888398e-05, + "loss": 1.8566, + "mean_token_accuracy": 0.5808509588241577, + "num_tokens": 4258351707.0, + "step": 8330 + }, + { + "epoch": 2.252839372633856, + "grad_norm": 1.4422074556350708, + "learning_rate": 1.2866321805821452e-05, + "loss": 2.0426, + "mean_token_accuracy": 0.5542747974395752, + "num_tokens": 4258849180.0, + "step": 8331 + }, + { + "epoch": 2.2531097890751757, + "grad_norm": 1.2181938886642456, + "learning_rate": 1.2864779556490264e-05, + "loss": 1.9572, + "mean_token_accuracy": 0.5505800247192383, + "num_tokens": 4259350497.0, + "step": 8332 + }, + { + "epoch": 2.2533802055164953, + "grad_norm": 1.296356201171875, + "learning_rate": 1.2863237249942154e-05, + "loss": 1.9167, + "mean_token_accuracy": 0.5625589489936829, + "num_tokens": 4259874765.0, + "step": 8333 + }, + { + "epoch": 2.253650621957815, + "grad_norm": 1.1383004188537598, + "learning_rate": 1.2861694886224445e-05, + "loss": 1.8288, + "mean_token_accuracy": 0.5738557577133179, + "num_tokens": 4260398993.0, + "step": 8334 + }, + { + "epoch": 2.2539210383991346, + "grad_norm": 1.3416798114776611, + "learning_rate": 1.2860152465384463e-05, + "loss": 2.046, + "mean_token_accuracy": 0.5475735664367676, + "num_tokens": 4260923171.0, + "step": 8335 + }, + { + "epoch": 2.2541914548404542, + "grad_norm": 1.2974426746368408, + "learning_rate": 1.2858609987469533e-05, + "loss": 1.9167, + "mean_token_accuracy": 0.5852981805801392, + "num_tokens": 4261384461.0, + "step": 8336 + }, + { + "epoch": 2.254461871281774, + "grad_norm": 1.4156603813171387, + "learning_rate": 1.2857067452526982e-05, + "loss": 1.9293, + "mean_token_accuracy": 0.5764684677124023, + "num_tokens": 4261872285.0, + "step": 8337 + }, + { + "epoch": 2.2547322877230935, + "grad_norm": 1.2480883598327637, + "learning_rate": 1.2855524860604143e-05, + "loss": 1.8493, + "mean_token_accuracy": 0.5727816224098206, + "num_tokens": 4262396481.0, + "step": 8338 + }, + { + "epoch": 2.255002704164413, + "grad_norm": 1.2434700727462769, + "learning_rate": 1.2853982211748342e-05, + "loss": 1.8779, + "mean_token_accuracy": 0.5908809304237366, + "num_tokens": 4262860808.0, + "step": 8339 + }, + { + "epoch": 2.255273120605733, + "grad_norm": 1.2833331823349, + "learning_rate": 1.2852439506006913e-05, + "loss": 1.9775, + "mean_token_accuracy": 0.5491127371788025, + "num_tokens": 4263384893.0, + "step": 8340 + }, + { + "epoch": 2.2555435370470525, + "grad_norm": 0.5676349997520447, + "learning_rate": 1.2850896743427194e-05, + "loss": 1.1483, + "mean_token_accuracy": 0.6931790113449097, + "num_tokens": 4263909157.0, + "step": 8341 + }, + { + "epoch": 2.255813953488372, + "grad_norm": 1.6584981679916382, + "learning_rate": 1.284935392405652e-05, + "loss": 1.8101, + "mean_token_accuracy": 0.5693888664245605, + "num_tokens": 4264433440.0, + "step": 8342 + }, + { + "epoch": 2.2560843699296917, + "grad_norm": 1.6621521711349487, + "learning_rate": 1.2847811047942228e-05, + "loss": 1.7569, + "mean_token_accuracy": 0.6145913600921631, + "num_tokens": 4264957634.0, + "step": 8343 + }, + { + "epoch": 2.2563547863710114, + "grad_norm": 1.1018269062042236, + "learning_rate": 1.2846268115131661e-05, + "loss": 1.9057, + "mean_token_accuracy": 0.5731280446052551, + "num_tokens": 4265481904.0, + "step": 8344 + }, + { + "epoch": 2.256625202812331, + "grad_norm": 1.1732043027877808, + "learning_rate": 1.284472512567216e-05, + "loss": 1.8821, + "mean_token_accuracy": 0.5681041479110718, + "num_tokens": 4266006116.0, + "step": 8345 + }, + { + "epoch": 2.2568956192536507, + "grad_norm": 1.4678444862365723, + "learning_rate": 1.2843182079611063e-05, + "loss": 1.9075, + "mean_token_accuracy": 0.5782619714736938, + "num_tokens": 4266507451.0, + "step": 8346 + }, + { + "epoch": 2.2571660356949703, + "grad_norm": 1.219631314277649, + "learning_rate": 1.2841638976995723e-05, + "loss": 1.8415, + "mean_token_accuracy": 0.5813775062561035, + "num_tokens": 4267031614.0, + "step": 8347 + }, + { + "epoch": 2.25743645213629, + "grad_norm": 1.3193267583847046, + "learning_rate": 1.2840095817873481e-05, + "loss": 1.8613, + "mean_token_accuracy": 0.5736157894134521, + "num_tokens": 4267533093.0, + "step": 8348 + }, + { + "epoch": 2.2577068685776096, + "grad_norm": 1.422601342201233, + "learning_rate": 1.2838552602291688e-05, + "loss": 1.9257, + "mean_token_accuracy": 0.5448299646377563, + "num_tokens": 4268057154.0, + "step": 8349 + }, + { + "epoch": 2.2579772850189292, + "grad_norm": 1.1443899869918823, + "learning_rate": 1.2837009330297695e-05, + "loss": 1.9004, + "mean_token_accuracy": 0.572539210319519, + "num_tokens": 4268581351.0, + "step": 8350 + }, + { + "epoch": 2.258247701460249, + "grad_norm": 1.37739896774292, + "learning_rate": 1.283546600193885e-05, + "loss": 1.8785, + "mean_token_accuracy": 0.5821405649185181, + "num_tokens": 4269025412.0, + "step": 8351 + }, + { + "epoch": 2.2585181179015685, + "grad_norm": 1.3908036947250366, + "learning_rate": 1.2833922617262515e-05, + "loss": 1.9188, + "mean_token_accuracy": 0.5652329325675964, + "num_tokens": 4269549606.0, + "step": 8352 + }, + { + "epoch": 2.258788534342888, + "grad_norm": 1.1702656745910645, + "learning_rate": 1.2832379176316039e-05, + "loss": 1.9567, + "mean_token_accuracy": 0.5619524717330933, + "num_tokens": 4270048673.0, + "step": 8353 + }, + { + "epoch": 2.259058950784208, + "grad_norm": 1.1878479719161987, + "learning_rate": 1.2830835679146782e-05, + "loss": 1.799, + "mean_token_accuracy": 0.5806589126586914, + "num_tokens": 4270553770.0, + "step": 8354 + }, + { + "epoch": 2.2593293672255275, + "grad_norm": 1.6190847158432007, + "learning_rate": 1.2829292125802105e-05, + "loss": 1.7592, + "mean_token_accuracy": 0.6004149317741394, + "num_tokens": 4271023491.0, + "step": 8355 + }, + { + "epoch": 2.259599783666847, + "grad_norm": 1.3057880401611328, + "learning_rate": 1.2827748516329359e-05, + "loss": 1.9021, + "mean_token_accuracy": 0.5903960466384888, + "num_tokens": 4271547748.0, + "step": 8356 + }, + { + "epoch": 2.2598702001081667, + "grad_norm": 1.1656914949417114, + "learning_rate": 1.2826204850775917e-05, + "loss": 1.8822, + "mean_token_accuracy": 0.5736478567123413, + "num_tokens": 4272071942.0, + "step": 8357 + }, + { + "epoch": 2.2601406165494864, + "grad_norm": 1.212457299232483, + "learning_rate": 1.2824661129189143e-05, + "loss": 1.894, + "mean_token_accuracy": 0.5809508562088013, + "num_tokens": 4272596226.0, + "step": 8358 + }, + { + "epoch": 2.260411032990806, + "grad_norm": 1.1452155113220215, + "learning_rate": 1.2823117351616396e-05, + "loss": 1.9227, + "mean_token_accuracy": 0.5923753976821899, + "num_tokens": 4273055085.0, + "step": 8359 + }, + { + "epoch": 2.2606814494321252, + "grad_norm": 1.1306195259094238, + "learning_rate": 1.2821573518105048e-05, + "loss": 1.9173, + "mean_token_accuracy": 0.5575755834579468, + "num_tokens": 4273579369.0, + "step": 8360 + }, + { + "epoch": 2.2609518658734453, + "grad_norm": 0.47483041882514954, + "learning_rate": 1.2820029628702467e-05, + "loss": 1.1566, + "mean_token_accuracy": 0.6916427612304688, + "num_tokens": 4274103598.0, + "step": 8361 + }, + { + "epoch": 2.2612222823147645, + "grad_norm": 1.5551949739456177, + "learning_rate": 1.2818485683456026e-05, + "loss": 1.8175, + "mean_token_accuracy": 0.5760471820831299, + "num_tokens": 4274627750.0, + "step": 8362 + }, + { + "epoch": 2.2614926987560846, + "grad_norm": 1.480576515197754, + "learning_rate": 1.2816941682413093e-05, + "loss": 1.94, + "mean_token_accuracy": 0.5377088785171509, + "num_tokens": 4275151931.0, + "step": 8363 + }, + { + "epoch": 2.261763115197404, + "grad_norm": 1.3513281345367432, + "learning_rate": 1.2815397625621048e-05, + "loss": 1.9143, + "mean_token_accuracy": 0.5378594994544983, + "num_tokens": 4275676173.0, + "step": 8364 + }, + { + "epoch": 2.2620335316387234, + "grad_norm": 1.306665301322937, + "learning_rate": 1.2813853513127266e-05, + "loss": 1.9719, + "mean_token_accuracy": 0.5412135124206543, + "num_tokens": 4276200322.0, + "step": 8365 + }, + { + "epoch": 2.262303948080043, + "grad_norm": 1.3353010416030884, + "learning_rate": 1.2812309344979124e-05, + "loss": 1.9537, + "mean_token_accuracy": 0.545059084892273, + "num_tokens": 4276724590.0, + "step": 8366 + }, + { + "epoch": 2.2625743645213627, + "grad_norm": 1.2374464273452759, + "learning_rate": 1.2810765121224e-05, + "loss": 1.9268, + "mean_token_accuracy": 0.5494521856307983, + "num_tokens": 4277248866.0, + "step": 8367 + }, + { + "epoch": 2.2628447809626824, + "grad_norm": 1.0845507383346558, + "learning_rate": 1.280922084190928e-05, + "loss": 1.8905, + "mean_token_accuracy": 0.549796462059021, + "num_tokens": 4277709479.0, + "step": 8368 + }, + { + "epoch": 2.263115197404002, + "grad_norm": 1.350820541381836, + "learning_rate": 1.2807676507082342e-05, + "loss": 1.8502, + "mean_token_accuracy": 0.5608941316604614, + "num_tokens": 4278233661.0, + "step": 8369 + }, + { + "epoch": 2.2633856138453217, + "grad_norm": 1.1473133563995361, + "learning_rate": 1.280613211679057e-05, + "loss": 1.9283, + "mean_token_accuracy": 0.5686408281326294, + "num_tokens": 4278757793.0, + "step": 8370 + }, + { + "epoch": 2.2636560302866413, + "grad_norm": 1.1343408823013306, + "learning_rate": 1.2804587671081357e-05, + "loss": 1.9864, + "mean_token_accuracy": 0.5426304936408997, + "num_tokens": 4279281945.0, + "step": 8371 + }, + { + "epoch": 2.263926446727961, + "grad_norm": 1.2333365678787231, + "learning_rate": 1.2803043170002088e-05, + "loss": 1.9242, + "mean_token_accuracy": 0.5647094249725342, + "num_tokens": 4279806208.0, + "step": 8372 + }, + { + "epoch": 2.2641968631692806, + "grad_norm": 1.0543155670166016, + "learning_rate": 1.280149861360015e-05, + "loss": 1.9726, + "mean_token_accuracy": 0.5381574630737305, + "num_tokens": 4280330397.0, + "step": 8373 + }, + { + "epoch": 2.2644672796106002, + "grad_norm": 1.197050929069519, + "learning_rate": 1.2799954001922936e-05, + "loss": 1.8891, + "mean_token_accuracy": 0.5598903894424438, + "num_tokens": 4280854632.0, + "step": 8374 + }, + { + "epoch": 2.26473769605192, + "grad_norm": 1.2371104955673218, + "learning_rate": 1.279840933501784e-05, + "loss": 1.9063, + "mean_token_accuracy": 0.5540663003921509, + "num_tokens": 4281378829.0, + "step": 8375 + }, + { + "epoch": 2.2650081124932395, + "grad_norm": 1.0110142230987549, + "learning_rate": 1.2796864612932259e-05, + "loss": 1.7822, + "mean_token_accuracy": 0.5911031365394592, + "num_tokens": 4281903094.0, + "step": 8376 + }, + { + "epoch": 2.265278528934559, + "grad_norm": 1.0105030536651611, + "learning_rate": 1.2795319835713585e-05, + "loss": 1.9519, + "mean_token_accuracy": 0.5549880266189575, + "num_tokens": 4282427374.0, + "step": 8377 + }, + { + "epoch": 2.265548945375879, + "grad_norm": 1.260359287261963, + "learning_rate": 1.279377500340922e-05, + "loss": 1.9824, + "mean_token_accuracy": 0.5477505922317505, + "num_tokens": 4282951551.0, + "step": 8378 + }, + { + "epoch": 2.2658193618171985, + "grad_norm": 1.1854758262634277, + "learning_rate": 1.2792230116066561e-05, + "loss": 1.9256, + "mean_token_accuracy": 0.5672799348831177, + "num_tokens": 4283436864.0, + "step": 8379 + }, + { + "epoch": 2.266089778258518, + "grad_norm": 1.1066174507141113, + "learning_rate": 1.2790685173733015e-05, + "loss": 1.9435, + "mean_token_accuracy": 0.5564793348312378, + "num_tokens": 4283961135.0, + "step": 8380 + }, + { + "epoch": 2.2663601946998377, + "grad_norm": 0.47582173347473145, + "learning_rate": 1.278914017645598e-05, + "loss": 1.049, + "mean_token_accuracy": 0.7174765467643738, + "num_tokens": 4284485351.0, + "step": 8381 + }, + { + "epoch": 2.2666306111411574, + "grad_norm": 1.57986581325531, + "learning_rate": 1.2787595124282863e-05, + "loss": 1.8273, + "mean_token_accuracy": 0.556287944316864, + "num_tokens": 4285009610.0, + "step": 8382 + }, + { + "epoch": 2.266901027582477, + "grad_norm": 1.6293940544128418, + "learning_rate": 1.278605001726107e-05, + "loss": 1.8202, + "mean_token_accuracy": 0.569547176361084, + "num_tokens": 4285533838.0, + "step": 8383 + }, + { + "epoch": 2.2671714440237967, + "grad_norm": 1.2463147640228271, + "learning_rate": 1.2784504855438011e-05, + "loss": 1.8245, + "mean_token_accuracy": 0.574230432510376, + "num_tokens": 4286058038.0, + "step": 8384 + }, + { + "epoch": 2.2674418604651163, + "grad_norm": 1.4948184490203857, + "learning_rate": 1.2782959638861094e-05, + "loss": 1.9523, + "mean_token_accuracy": 0.5477719902992249, + "num_tokens": 4286544285.0, + "step": 8385 + }, + { + "epoch": 2.267712276906436, + "grad_norm": 1.1385048627853394, + "learning_rate": 1.2781414367577731e-05, + "loss": 1.9132, + "mean_token_accuracy": 0.5617715120315552, + "num_tokens": 4287063868.0, + "step": 8386 + }, + { + "epoch": 2.2679826933477556, + "grad_norm": 1.3360538482666016, + "learning_rate": 1.277986904163534e-05, + "loss": 1.877, + "mean_token_accuracy": 0.572509765625, + "num_tokens": 4287537695.0, + "step": 8387 + }, + { + "epoch": 2.2682531097890752, + "grad_norm": 1.5331851243972778, + "learning_rate": 1.2778323661081332e-05, + "loss": 1.8465, + "mean_token_accuracy": 0.5670672655105591, + "num_tokens": 4288061920.0, + "step": 8388 + }, + { + "epoch": 2.268523526230395, + "grad_norm": 1.2033919095993042, + "learning_rate": 1.277677822596312e-05, + "loss": 1.8047, + "mean_token_accuracy": 0.596551775932312, + "num_tokens": 4288472098.0, + "step": 8389 + }, + { + "epoch": 2.2687939426717145, + "grad_norm": 1.1963332891464233, + "learning_rate": 1.277523273632813e-05, + "loss": 1.8853, + "mean_token_accuracy": 0.5910776853561401, + "num_tokens": 4288996263.0, + "step": 8390 + }, + { + "epoch": 2.269064359113034, + "grad_norm": 1.2695358991622925, + "learning_rate": 1.2773687192223779e-05, + "loss": 1.977, + "mean_token_accuracy": 0.5512059926986694, + "num_tokens": 4289520389.0, + "step": 8391 + }, + { + "epoch": 2.269334775554354, + "grad_norm": 1.036263108253479, + "learning_rate": 1.2772141593697486e-05, + "loss": 1.8604, + "mean_token_accuracy": 0.5574911236763, + "num_tokens": 4290044656.0, + "step": 8392 + }, + { + "epoch": 2.2696051919956735, + "grad_norm": 1.1066399812698364, + "learning_rate": 1.2770595940796681e-05, + "loss": 1.9043, + "mean_token_accuracy": 0.5659502744674683, + "num_tokens": 4290568847.0, + "step": 8393 + }, + { + "epoch": 2.269875608436993, + "grad_norm": 1.1849324703216553, + "learning_rate": 1.2769050233568787e-05, + "loss": 1.9288, + "mean_token_accuracy": 0.5627657175064087, + "num_tokens": 4291093025.0, + "step": 8394 + }, + { + "epoch": 2.2701460248783127, + "grad_norm": 1.2555503845214844, + "learning_rate": 1.2767504472061228e-05, + "loss": 1.9552, + "mean_token_accuracy": 0.5571231842041016, + "num_tokens": 4291617234.0, + "step": 8395 + }, + { + "epoch": 2.2704164413196324, + "grad_norm": 1.0947482585906982, + "learning_rate": 1.276595865632143e-05, + "loss": 1.9129, + "mean_token_accuracy": 0.5630209445953369, + "num_tokens": 4292141446.0, + "step": 8396 + }, + { + "epoch": 2.270686857760952, + "grad_norm": 1.1433160305023193, + "learning_rate": 1.2764412786396832e-05, + "loss": 1.9139, + "mean_token_accuracy": 0.5671508312225342, + "num_tokens": 4292641313.0, + "step": 8397 + }, + { + "epoch": 2.2709572742022717, + "grad_norm": 1.1068124771118164, + "learning_rate": 1.2762866862334858e-05, + "loss": 1.895, + "mean_token_accuracy": 0.5698724389076233, + "num_tokens": 4293130902.0, + "step": 8398 + }, + { + "epoch": 2.2712276906435913, + "grad_norm": 1.2098910808563232, + "learning_rate": 1.2761320884182944e-05, + "loss": 2.0619, + "mean_token_accuracy": 0.5289865732192993, + "num_tokens": 4293638485.0, + "step": 8399 + }, + { + "epoch": 2.271498107084911, + "grad_norm": 1.027748703956604, + "learning_rate": 1.2759774851988527e-05, + "loss": 1.8867, + "mean_token_accuracy": 0.5794891119003296, + "num_tokens": 4294105601.0, + "step": 8400 + }, + { + "epoch": 2.27176852352623, + "grad_norm": 0.5998831987380981, + "learning_rate": 1.2758228765799043e-05, + "loss": 1.1282, + "mean_token_accuracy": 0.6677261590957642, + "num_tokens": 4294629780.0, + "step": 8401 + }, + { + "epoch": 2.2720389399675502, + "grad_norm": 1.4288941621780396, + "learning_rate": 1.2756682625661932e-05, + "loss": 1.8432, + "mean_token_accuracy": 0.5770691633224487, + "num_tokens": 4295153964.0, + "step": 8402 + }, + { + "epoch": 2.2723093564088694, + "grad_norm": 1.1739377975463867, + "learning_rate": 1.2755136431624626e-05, + "loss": 1.8638, + "mean_token_accuracy": 0.5846868753433228, + "num_tokens": 4295678137.0, + "step": 8403 + }, + { + "epoch": 2.2725797728501895, + "grad_norm": 1.1738065481185913, + "learning_rate": 1.2753590183734579e-05, + "loss": 1.8591, + "mean_token_accuracy": 0.5699320435523987, + "num_tokens": 4296202407.0, + "step": 8404 + }, + { + "epoch": 2.2728501892915087, + "grad_norm": 1.417393684387207, + "learning_rate": 1.2752043882039222e-05, + "loss": 1.8828, + "mean_token_accuracy": 0.573462724685669, + "num_tokens": 4296579479.0, + "step": 8405 + }, + { + "epoch": 2.2731206057328284, + "grad_norm": 1.3859148025512695, + "learning_rate": 1.2750497526586007e-05, + "loss": 1.9916, + "mean_token_accuracy": 0.5754347443580627, + "num_tokens": 4296957836.0, + "step": 8406 + }, + { + "epoch": 2.273391022174148, + "grad_norm": 1.1421681642532349, + "learning_rate": 1.2748951117422384e-05, + "loss": 1.9056, + "mean_token_accuracy": 0.5678517818450928, + "num_tokens": 4297462102.0, + "step": 8407 + }, + { + "epoch": 2.2736614386154677, + "grad_norm": 1.2939999103546143, + "learning_rate": 1.2747404654595796e-05, + "loss": 1.8846, + "mean_token_accuracy": 0.5621223449707031, + "num_tokens": 4297986192.0, + "step": 8408 + }, + { + "epoch": 2.2739318550567873, + "grad_norm": 1.1769007444381714, + "learning_rate": 1.2745858138153696e-05, + "loss": 1.9313, + "mean_token_accuracy": 0.5543825626373291, + "num_tokens": 4298510447.0, + "step": 8409 + }, + { + "epoch": 2.274202271498107, + "grad_norm": 1.247499942779541, + "learning_rate": 1.274431156814353e-05, + "loss": 1.8956, + "mean_token_accuracy": 0.5676741600036621, + "num_tokens": 4299034526.0, + "step": 8410 + }, + { + "epoch": 2.2744726879394266, + "grad_norm": 1.2890501022338867, + "learning_rate": 1.2742764944612758e-05, + "loss": 1.8485, + "mean_token_accuracy": 0.5681198835372925, + "num_tokens": 4299558759.0, + "step": 8411 + }, + { + "epoch": 2.2747431043807462, + "grad_norm": 1.3126102685928345, + "learning_rate": 1.2741218267608833e-05, + "loss": 1.9522, + "mean_token_accuracy": 0.5539640188217163, + "num_tokens": 4300082969.0, + "step": 8412 + }, + { + "epoch": 2.275013520822066, + "grad_norm": 1.0627623796463013, + "learning_rate": 1.2739671537179208e-05, + "loss": 1.9508, + "mean_token_accuracy": 0.5631659030914307, + "num_tokens": 4300607183.0, + "step": 8413 + }, + { + "epoch": 2.2752839372633855, + "grad_norm": 1.5385143756866455, + "learning_rate": 1.2738124753371349e-05, + "loss": 2.0226, + "mean_token_accuracy": 0.5453989505767822, + "num_tokens": 4301131380.0, + "step": 8414 + }, + { + "epoch": 2.275554353704705, + "grad_norm": 1.374329924583435, + "learning_rate": 1.2736577916232706e-05, + "loss": 1.878, + "mean_token_accuracy": 0.5658562779426575, + "num_tokens": 4301655659.0, + "step": 8415 + }, + { + "epoch": 2.275824770146025, + "grad_norm": 1.1479389667510986, + "learning_rate": 1.2735031025810746e-05, + "loss": 1.8769, + "mean_token_accuracy": 0.5727561116218567, + "num_tokens": 4302179933.0, + "step": 8416 + }, + { + "epoch": 2.2760951865873444, + "grad_norm": 1.1519577503204346, + "learning_rate": 1.2733484082152937e-05, + "loss": 1.8449, + "mean_token_accuracy": 0.5678742527961731, + "num_tokens": 4302691815.0, + "step": 8417 + }, + { + "epoch": 2.276365603028664, + "grad_norm": 1.1330287456512451, + "learning_rate": 1.2731937085306733e-05, + "loss": 1.8678, + "mean_token_accuracy": 0.5791786909103394, + "num_tokens": 4303213350.0, + "step": 8418 + }, + { + "epoch": 2.2766360194699837, + "grad_norm": 1.1908583641052246, + "learning_rate": 1.2730390035319607e-05, + "loss": 1.8911, + "mean_token_accuracy": 0.5733627080917358, + "num_tokens": 4303737545.0, + "step": 8419 + }, + { + "epoch": 2.2769064359113034, + "grad_norm": 1.4745324850082397, + "learning_rate": 1.2728842932239028e-05, + "loss": 1.7456, + "mean_token_accuracy": 0.6029908657073975, + "num_tokens": 4304261746.0, + "step": 8420 + }, + { + "epoch": 2.277176852352623, + "grad_norm": 0.59499591588974, + "learning_rate": 1.2727295776112463e-05, + "loss": 1.1913, + "mean_token_accuracy": 0.6880663633346558, + "num_tokens": 4304775928.0, + "step": 8421 + }, + { + "epoch": 2.2774472687939427, + "grad_norm": 1.6950541734695435, + "learning_rate": 1.2725748566987382e-05, + "loss": 1.7188, + "mean_token_accuracy": 0.6173105239868164, + "num_tokens": 4305209797.0, + "step": 8422 + }, + { + "epoch": 2.2777176852352623, + "grad_norm": 2.082853078842163, + "learning_rate": 1.2724201304911261e-05, + "loss": 1.9634, + "mean_token_accuracy": 0.5450136661529541, + "num_tokens": 4305733881.0, + "step": 8423 + }, + { + "epoch": 2.277988101676582, + "grad_norm": 1.170721411705017, + "learning_rate": 1.2722653989931575e-05, + "loss": 1.7686, + "mean_token_accuracy": 0.5740916728973389, + "num_tokens": 4306258025.0, + "step": 8424 + }, + { + "epoch": 2.2782585181179016, + "grad_norm": 1.3007127046585083, + "learning_rate": 1.2721106622095798e-05, + "loss": 1.9283, + "mean_token_accuracy": 0.5557985901832581, + "num_tokens": 4306782308.0, + "step": 8425 + }, + { + "epoch": 2.2785289345592212, + "grad_norm": 1.1422985792160034, + "learning_rate": 1.2719559201451407e-05, + "loss": 1.713, + "mean_token_accuracy": 0.6065127849578857, + "num_tokens": 4307306477.0, + "step": 8426 + }, + { + "epoch": 2.278799351000541, + "grad_norm": 1.1516201496124268, + "learning_rate": 1.2718011728045883e-05, + "loss": 1.9219, + "mean_token_accuracy": 0.5553558468818665, + "num_tokens": 4307818320.0, + "step": 8427 + }, + { + "epoch": 2.2790697674418605, + "grad_norm": 1.2945661544799805, + "learning_rate": 1.2716464201926708e-05, + "loss": 1.9991, + "mean_token_accuracy": 0.5520942211151123, + "num_tokens": 4308342502.0, + "step": 8428 + }, + { + "epoch": 2.27934018388318, + "grad_norm": 1.4736541509628296, + "learning_rate": 1.2714916623141363e-05, + "loss": 1.9926, + "mean_token_accuracy": 0.5350435972213745, + "num_tokens": 4308866559.0, + "step": 8429 + }, + { + "epoch": 2.2796106003245, + "grad_norm": 1.1588377952575684, + "learning_rate": 1.2713368991737334e-05, + "loss": 1.8651, + "mean_token_accuracy": 0.5712494254112244, + "num_tokens": 4309390576.0, + "step": 8430 + }, + { + "epoch": 2.2798810167658194, + "grad_norm": 1.0500353574752808, + "learning_rate": 1.2711821307762103e-05, + "loss": 1.7853, + "mean_token_accuracy": 0.5842376947402954, + "num_tokens": 4309914692.0, + "step": 8431 + }, + { + "epoch": 2.280151433207139, + "grad_norm": 1.404272437095642, + "learning_rate": 1.271027357126316e-05, + "loss": 1.9523, + "mean_token_accuracy": 0.5580464005470276, + "num_tokens": 4310438973.0, + "step": 8432 + }, + { + "epoch": 2.2804218496484587, + "grad_norm": 0.9521028995513916, + "learning_rate": 1.2708725782287996e-05, + "loss": 1.8064, + "mean_token_accuracy": 0.5768553018569946, + "num_tokens": 4310963186.0, + "step": 8433 + }, + { + "epoch": 2.2806922660897784, + "grad_norm": 1.1617478132247925, + "learning_rate": 1.2707177940884101e-05, + "loss": 1.8923, + "mean_token_accuracy": 0.5714498162269592, + "num_tokens": 4311487462.0, + "step": 8434 + }, + { + "epoch": 2.280962682531098, + "grad_norm": 1.1710405349731445, + "learning_rate": 1.2705630047098962e-05, + "loss": 1.9275, + "mean_token_accuracy": 0.5542550086975098, + "num_tokens": 4312011620.0, + "step": 8435 + }, + { + "epoch": 2.2812330989724177, + "grad_norm": 1.1828759908676147, + "learning_rate": 1.270408210098008e-05, + "loss": 1.9918, + "mean_token_accuracy": 0.5612381100654602, + "num_tokens": 4312535838.0, + "step": 8436 + }, + { + "epoch": 2.2815035154137373, + "grad_norm": 1.079044222831726, + "learning_rate": 1.270253410257495e-05, + "loss": 1.9455, + "mean_token_accuracy": 0.5599949955940247, + "num_tokens": 4313060035.0, + "step": 8437 + }, + { + "epoch": 2.281773931855057, + "grad_norm": 1.3333196640014648, + "learning_rate": 1.2700986051931066e-05, + "loss": 2.0702, + "mean_token_accuracy": 0.5237720012664795, + "num_tokens": 4313584299.0, + "step": 8438 + }, + { + "epoch": 2.2820443482963766, + "grad_norm": 1.117488145828247, + "learning_rate": 1.2699437949095925e-05, + "loss": 1.8426, + "mean_token_accuracy": 0.5659446120262146, + "num_tokens": 4314108579.0, + "step": 8439 + }, + { + "epoch": 2.2823147647376962, + "grad_norm": 0.9892113208770752, + "learning_rate": 1.2697889794117033e-05, + "loss": 1.9015, + "mean_token_accuracy": 0.5669752359390259, + "num_tokens": 4314632834.0, + "step": 8440 + }, + { + "epoch": 2.282585181179016, + "grad_norm": 0.5820308327674866, + "learning_rate": 1.2696341587041884e-05, + "loss": 1.2441, + "mean_token_accuracy": 0.6687441468238831, + "num_tokens": 4315157035.0, + "step": 8441 + }, + { + "epoch": 2.282855597620335, + "grad_norm": 1.440630316734314, + "learning_rate": 1.2694793327917989e-05, + "loss": 1.8936, + "mean_token_accuracy": 0.557475209236145, + "num_tokens": 4315681309.0, + "step": 8442 + }, + { + "epoch": 2.283126014061655, + "grad_norm": 1.1611322164535522, + "learning_rate": 1.2693245016792857e-05, + "loss": 1.7351, + "mean_token_accuracy": 0.6120346188545227, + "num_tokens": 4316205399.0, + "step": 8443 + }, + { + "epoch": 2.2833964305029744, + "grad_norm": 1.0242183208465576, + "learning_rate": 1.269169665371398e-05, + "loss": 1.8465, + "mean_token_accuracy": 0.5725042819976807, + "num_tokens": 4316729514.0, + "step": 8444 + }, + { + "epoch": 2.2836668469442944, + "grad_norm": 1.1991807222366333, + "learning_rate": 1.2690148238728876e-05, + "loss": 2.0237, + "mean_token_accuracy": 0.5471862554550171, + "num_tokens": 4317253690.0, + "step": 8445 + }, + { + "epoch": 2.2839372633856136, + "grad_norm": 1.1115795373916626, + "learning_rate": 1.2688599771885058e-05, + "loss": 2.0089, + "mean_token_accuracy": 0.5342916250228882, + "num_tokens": 4317777966.0, + "step": 8446 + }, + { + "epoch": 2.2842076798269333, + "grad_norm": 1.105795979499817, + "learning_rate": 1.2687051253230029e-05, + "loss": 1.8365, + "mean_token_accuracy": 0.540084958076477, + "num_tokens": 4318302148.0, + "step": 8447 + }, + { + "epoch": 2.284478096268253, + "grad_norm": 1.094470739364624, + "learning_rate": 1.2685502682811305e-05, + "loss": 1.909, + "mean_token_accuracy": 0.566825807094574, + "num_tokens": 4318820245.0, + "step": 8448 + }, + { + "epoch": 2.2847485127095726, + "grad_norm": 1.444370150566101, + "learning_rate": 1.2683954060676407e-05, + "loss": 1.8666, + "mean_token_accuracy": 0.5681402087211609, + "num_tokens": 4319344468.0, + "step": 8449 + }, + { + "epoch": 2.285018929150892, + "grad_norm": 1.2025928497314453, + "learning_rate": 1.2682405386872845e-05, + "loss": 1.8604, + "mean_token_accuracy": 0.5739361047744751, + "num_tokens": 4319868745.0, + "step": 8450 + }, + { + "epoch": 2.285289345592212, + "grad_norm": 1.182133436203003, + "learning_rate": 1.2680856661448136e-05, + "loss": 1.9836, + "mean_token_accuracy": 0.5528355836868286, + "num_tokens": 4320393009.0, + "step": 8451 + }, + { + "epoch": 2.2855597620335315, + "grad_norm": 1.0279468297958374, + "learning_rate": 1.2679307884449803e-05, + "loss": 1.8451, + "mean_token_accuracy": 0.5480226874351501, + "num_tokens": 4320917110.0, + "step": 8452 + }, + { + "epoch": 2.285830178474851, + "grad_norm": 1.1554096937179565, + "learning_rate": 1.2677759055925368e-05, + "loss": 1.8243, + "mean_token_accuracy": 0.557847261428833, + "num_tokens": 4321441347.0, + "step": 8453 + }, + { + "epoch": 2.286100594916171, + "grad_norm": 1.1340354681015015, + "learning_rate": 1.2676210175922346e-05, + "loss": 1.8066, + "mean_token_accuracy": 0.58804851770401, + "num_tokens": 4321965612.0, + "step": 8454 + }, + { + "epoch": 2.2863710113574904, + "grad_norm": 1.0738624334335327, + "learning_rate": 1.2674661244488268e-05, + "loss": 1.909, + "mean_token_accuracy": 0.568595290184021, + "num_tokens": 4322484639.0, + "step": 8455 + }, + { + "epoch": 2.28664142779881, + "grad_norm": 1.2099804878234863, + "learning_rate": 1.2673112261670656e-05, + "loss": 1.93, + "mean_token_accuracy": 0.5650397539138794, + "num_tokens": 4323008878.0, + "step": 8456 + }, + { + "epoch": 2.2869118442401297, + "grad_norm": 1.1684600114822388, + "learning_rate": 1.2671563227517044e-05, + "loss": 1.9701, + "mean_token_accuracy": 0.5597237348556519, + "num_tokens": 4323533161.0, + "step": 8457 + }, + { + "epoch": 2.2871822606814494, + "grad_norm": 1.2415351867675781, + "learning_rate": 1.2670014142074956e-05, + "loss": 1.7694, + "mean_token_accuracy": 0.5412680506706238, + "num_tokens": 4324057292.0, + "step": 8458 + }, + { + "epoch": 2.287452677122769, + "grad_norm": 1.430510401725769, + "learning_rate": 1.266846500539192e-05, + "loss": 1.9504, + "mean_token_accuracy": 0.5596306324005127, + "num_tokens": 4324581466.0, + "step": 8459 + }, + { + "epoch": 2.2877230935640886, + "grad_norm": 1.060327410697937, + "learning_rate": 1.2666915817515475e-05, + "loss": 1.9332, + "mean_token_accuracy": 0.5474858283996582, + "num_tokens": 4325105714.0, + "step": 8460 + }, + { + "epoch": 2.2879935100054083, + "grad_norm": 0.4305576980113983, + "learning_rate": 1.2665366578493148e-05, + "loss": 1.147, + "mean_token_accuracy": 0.7086777687072754, + "num_tokens": 4325567144.0, + "step": 8461 + }, + { + "epoch": 2.288263926446728, + "grad_norm": 1.3164801597595215, + "learning_rate": 1.2663817288372477e-05, + "loss": 1.8716, + "mean_token_accuracy": 0.587059736251831, + "num_tokens": 4326091238.0, + "step": 8462 + }, + { + "epoch": 2.2885343428880476, + "grad_norm": 1.4059760570526123, + "learning_rate": 1.2662267947200999e-05, + "loss": 1.8263, + "mean_token_accuracy": 0.5738547444343567, + "num_tokens": 4326615317.0, + "step": 8463 + }, + { + "epoch": 2.2888047593293672, + "grad_norm": 1.1007963418960571, + "learning_rate": 1.266071855502625e-05, + "loss": 1.8252, + "mean_token_accuracy": 0.5775922536849976, + "num_tokens": 4327139529.0, + "step": 8464 + }, + { + "epoch": 2.289075175770687, + "grad_norm": 1.0922231674194336, + "learning_rate": 1.2659169111895772e-05, + "loss": 1.9315, + "mean_token_accuracy": 0.5593912601470947, + "num_tokens": 4327661430.0, + "step": 8465 + }, + { + "epoch": 2.2893455922120065, + "grad_norm": 1.1670749187469482, + "learning_rate": 1.265761961785711e-05, + "loss": 1.8124, + "mean_token_accuracy": 0.582718014717102, + "num_tokens": 4328158964.0, + "step": 8466 + }, + { + "epoch": 2.289616008653326, + "grad_norm": 1.159077525138855, + "learning_rate": 1.2656070072957803e-05, + "loss": 1.9742, + "mean_token_accuracy": 0.5559507608413696, + "num_tokens": 4328683240.0, + "step": 8467 + }, + { + "epoch": 2.289886425094646, + "grad_norm": 1.0946447849273682, + "learning_rate": 1.2654520477245393e-05, + "loss": 2.0091, + "mean_token_accuracy": 0.5533144474029541, + "num_tokens": 4329196597.0, + "step": 8468 + }, + { + "epoch": 2.2901568415359654, + "grad_norm": 1.0500195026397705, + "learning_rate": 1.265297083076743e-05, + "loss": 1.725, + "mean_token_accuracy": 0.5965476036071777, + "num_tokens": 4329668383.0, + "step": 8469 + }, + { + "epoch": 2.290427257977285, + "grad_norm": 1.2175588607788086, + "learning_rate": 1.2651421133571463e-05, + "loss": 1.6807, + "mean_token_accuracy": 0.614090085029602, + "num_tokens": 4330174028.0, + "step": 8470 + }, + { + "epoch": 2.2906976744186047, + "grad_norm": 1.3290035724639893, + "learning_rate": 1.2649871385705038e-05, + "loss": 1.921, + "mean_token_accuracy": 0.5676029920578003, + "num_tokens": 4330635530.0, + "step": 8471 + }, + { + "epoch": 2.2909680908599244, + "grad_norm": 1.1849586963653564, + "learning_rate": 1.2648321587215704e-05, + "loss": 1.9265, + "mean_token_accuracy": 0.5625193119049072, + "num_tokens": 4331159719.0, + "step": 8472 + }, + { + "epoch": 2.291238507301244, + "grad_norm": 1.1247882843017578, + "learning_rate": 1.264677173815102e-05, + "loss": 2.0068, + "mean_token_accuracy": 0.5568743348121643, + "num_tokens": 4331664146.0, + "step": 8473 + }, + { + "epoch": 2.2915089237425637, + "grad_norm": 1.1477158069610596, + "learning_rate": 1.2645221838558535e-05, + "loss": 1.9757, + "mean_token_accuracy": 0.5610145330429077, + "num_tokens": 4332165044.0, + "step": 8474 + }, + { + "epoch": 2.2917793401838833, + "grad_norm": 1.2908308506011963, + "learning_rate": 1.2643671888485804e-05, + "loss": 1.8695, + "mean_token_accuracy": 0.5801206827163696, + "num_tokens": 4332689234.0, + "step": 8475 + }, + { + "epoch": 2.292049756625203, + "grad_norm": 1.0068769454956055, + "learning_rate": 1.2642121887980388e-05, + "loss": 1.8395, + "mean_token_accuracy": 0.5750728249549866, + "num_tokens": 4333213490.0, + "step": 8476 + }, + { + "epoch": 2.2923201730665226, + "grad_norm": 1.2769814729690552, + "learning_rate": 1.264057183708984e-05, + "loss": 1.9049, + "mean_token_accuracy": 0.5585300922393799, + "num_tokens": 4333737517.0, + "step": 8477 + }, + { + "epoch": 2.2925905895078422, + "grad_norm": 1.2383781671524048, + "learning_rate": 1.2639021735861727e-05, + "loss": 1.8015, + "mean_token_accuracy": 0.5964617133140564, + "num_tokens": 4334256988.0, + "step": 8478 + }, + { + "epoch": 2.292861005949162, + "grad_norm": 1.0865044593811035, + "learning_rate": 1.2637471584343608e-05, + "loss": 1.8551, + "mean_token_accuracy": 0.5557876825332642, + "num_tokens": 4334781231.0, + "step": 8479 + }, + { + "epoch": 2.2931314223904815, + "grad_norm": 1.0339266061782837, + "learning_rate": 1.2635921382583045e-05, + "loss": 1.9175, + "mean_token_accuracy": 0.5605607032775879, + "num_tokens": 4335305498.0, + "step": 8480 + }, + { + "epoch": 2.293401838831801, + "grad_norm": 0.48828113079071045, + "learning_rate": 1.2634371130627601e-05, + "loss": 1.1373, + "mean_token_accuracy": 0.6888746023178101, + "num_tokens": 4335829739.0, + "step": 8481 + }, + { + "epoch": 2.293672255273121, + "grad_norm": 1.2915065288543701, + "learning_rate": 1.2632820828524844e-05, + "loss": 1.9386, + "mean_token_accuracy": 0.5455226898193359, + "num_tokens": 4336353864.0, + "step": 8482 + }, + { + "epoch": 2.29394267171444, + "grad_norm": 1.2692968845367432, + "learning_rate": 1.263127047632234e-05, + "loss": 1.9333, + "mean_token_accuracy": 0.5632005929946899, + "num_tokens": 4336878049.0, + "step": 8483 + }, + { + "epoch": 2.29421308815576, + "grad_norm": 1.104641079902649, + "learning_rate": 1.2629720074067666e-05, + "loss": 1.8283, + "mean_token_accuracy": 0.5681933760643005, + "num_tokens": 4337402190.0, + "step": 8484 + }, + { + "epoch": 2.2944835045970793, + "grad_norm": 1.3241525888442993, + "learning_rate": 1.2628169621808383e-05, + "loss": 2.1036, + "mean_token_accuracy": 0.5167149305343628, + "num_tokens": 4337926400.0, + "step": 8485 + }, + { + "epoch": 2.2947539210383994, + "grad_norm": 1.0992594957351685, + "learning_rate": 1.2626619119592074e-05, + "loss": 1.8428, + "mean_token_accuracy": 0.5638524293899536, + "num_tokens": 4338450556.0, + "step": 8486 + }, + { + "epoch": 2.2950243374797186, + "grad_norm": 1.3857582807540894, + "learning_rate": 1.26250685674663e-05, + "loss": 1.9309, + "mean_token_accuracy": 0.5639257431030273, + "num_tokens": 4338974715.0, + "step": 8487 + }, + { + "epoch": 2.295294753921038, + "grad_norm": 1.3543589115142822, + "learning_rate": 1.2623517965478647e-05, + "loss": 1.9314, + "mean_token_accuracy": 0.567125141620636, + "num_tokens": 4339498929.0, + "step": 8488 + }, + { + "epoch": 2.295565170362358, + "grad_norm": 1.263251781463623, + "learning_rate": 1.2621967313676689e-05, + "loss": 1.8518, + "mean_token_accuracy": 0.5888983011245728, + "num_tokens": 4339900679.0, + "step": 8489 + }, + { + "epoch": 2.2958355868036775, + "grad_norm": 1.2448668479919434, + "learning_rate": 1.2620416612108003e-05, + "loss": 1.9201, + "mean_token_accuracy": 0.5742768049240112, + "num_tokens": 4340386111.0, + "step": 8490 + }, + { + "epoch": 2.296106003244997, + "grad_norm": 1.2425434589385986, + "learning_rate": 1.2618865860820167e-05, + "loss": 1.9927, + "mean_token_accuracy": 0.5484778881072998, + "num_tokens": 4340910318.0, + "step": 8491 + }, + { + "epoch": 2.296376419686317, + "grad_norm": 1.2651252746582031, + "learning_rate": 1.2617315059860767e-05, + "loss": 1.9156, + "mean_token_accuracy": 0.5546460151672363, + "num_tokens": 4341434489.0, + "step": 8492 + }, + { + "epoch": 2.2966468361276364, + "grad_norm": 1.316265344619751, + "learning_rate": 1.2615764209277388e-05, + "loss": 1.9202, + "mean_token_accuracy": 0.5823843479156494, + "num_tokens": 4341936029.0, + "step": 8493 + }, + { + "epoch": 2.296917252568956, + "grad_norm": 1.2048639059066772, + "learning_rate": 1.2614213309117609e-05, + "loss": 1.9343, + "mean_token_accuracy": 0.551325261592865, + "num_tokens": 4342460292.0, + "step": 8494 + }, + { + "epoch": 2.2971876690102757, + "grad_norm": 1.0679123401641846, + "learning_rate": 1.2612662359429015e-05, + "loss": 1.8022, + "mean_token_accuracy": 0.5841428637504578, + "num_tokens": 4342984480.0, + "step": 8495 + }, + { + "epoch": 2.2974580854515954, + "grad_norm": 1.2131526470184326, + "learning_rate": 1.2611111360259205e-05, + "loss": 1.954, + "mean_token_accuracy": 0.5376011729240417, + "num_tokens": 4343508637.0, + "step": 8496 + }, + { + "epoch": 2.297728501892915, + "grad_norm": 1.2283246517181396, + "learning_rate": 1.2609560311655755e-05, + "loss": 1.9049, + "mean_token_accuracy": 0.566681444644928, + "num_tokens": 4344032584.0, + "step": 8497 + }, + { + "epoch": 2.2979989183342346, + "grad_norm": 1.3293075561523438, + "learning_rate": 1.2608009213666261e-05, + "loss": 1.8925, + "mean_token_accuracy": 0.5695509910583496, + "num_tokens": 4344521009.0, + "step": 8498 + }, + { + "epoch": 2.2982693347755543, + "grad_norm": 1.0689730644226074, + "learning_rate": 1.2606458066338318e-05, + "loss": 1.8759, + "mean_token_accuracy": 0.5672854781150818, + "num_tokens": 4345045187.0, + "step": 8499 + }, + { + "epoch": 2.298539751216874, + "grad_norm": 1.1838723421096802, + "learning_rate": 1.2604906869719517e-05, + "loss": 1.9427, + "mean_token_accuracy": 0.5733364820480347, + "num_tokens": 4345482030.0, + "step": 8500 + }, + { + "epoch": 2.2988101676581936, + "grad_norm": 0.6373982429504395, + "learning_rate": 1.2603355623857454e-05, + "loss": 1.1327, + "mean_token_accuracy": 0.7028583288192749, + "num_tokens": 4346006225.0, + "step": 8501 + }, + { + "epoch": 2.299080584099513, + "grad_norm": 1.5536184310913086, + "learning_rate": 1.2601804328799723e-05, + "loss": 1.9188, + "mean_token_accuracy": 0.5688063502311707, + "num_tokens": 4346530500.0, + "step": 8502 + }, + { + "epoch": 2.299351000540833, + "grad_norm": 1.4649946689605713, + "learning_rate": 1.2600252984593926e-05, + "loss": 1.9185, + "mean_token_accuracy": 0.5636312365531921, + "num_tokens": 4347054755.0, + "step": 8503 + }, + { + "epoch": 2.2996214169821525, + "grad_norm": 1.0328623056411743, + "learning_rate": 1.2598701591287664e-05, + "loss": 1.9131, + "mean_token_accuracy": 0.5688153505325317, + "num_tokens": 4347578928.0, + "step": 8504 + }, + { + "epoch": 2.299891833423472, + "grad_norm": 1.5131160020828247, + "learning_rate": 1.259715014892853e-05, + "loss": 2.0468, + "mean_token_accuracy": 0.5506507754325867, + "num_tokens": 4348103166.0, + "step": 8505 + }, + { + "epoch": 2.300162249864792, + "grad_norm": 1.3368514776229858, + "learning_rate": 1.2595598657564136e-05, + "loss": 1.9152, + "mean_token_accuracy": 0.5747372508049011, + "num_tokens": 4348627327.0, + "step": 8506 + }, + { + "epoch": 2.3004326663061114, + "grad_norm": 1.014362096786499, + "learning_rate": 1.2594047117242081e-05, + "loss": 1.9286, + "mean_token_accuracy": 0.561421275138855, + "num_tokens": 4349151609.0, + "step": 8507 + }, + { + "epoch": 2.300703082747431, + "grad_norm": 1.4014521837234497, + "learning_rate": 1.2592495528009973e-05, + "loss": 1.9236, + "mean_token_accuracy": 0.5507797002792358, + "num_tokens": 4349675738.0, + "step": 8508 + }, + { + "epoch": 2.3009734991887507, + "grad_norm": 1.5168545246124268, + "learning_rate": 1.2590943889915423e-05, + "loss": 2.0265, + "mean_token_accuracy": 0.5461182594299316, + "num_tokens": 4350200013.0, + "step": 8509 + }, + { + "epoch": 2.3012439156300704, + "grad_norm": 1.22056245803833, + "learning_rate": 1.2589392203006032e-05, + "loss": 2.0083, + "mean_token_accuracy": 0.5364885926246643, + "num_tokens": 4350724060.0, + "step": 8510 + }, + { + "epoch": 2.30151433207139, + "grad_norm": 1.3661950826644897, + "learning_rate": 1.2587840467329415e-05, + "loss": 1.8761, + "mean_token_accuracy": 0.5681347846984863, + "num_tokens": 4351220145.0, + "step": 8511 + }, + { + "epoch": 2.3017847485127096, + "grad_norm": 1.4090405702590942, + "learning_rate": 1.258628868293318e-05, + "loss": 2.0263, + "mean_token_accuracy": 0.5429044365882874, + "num_tokens": 4351744419.0, + "step": 8512 + }, + { + "epoch": 2.3020551649540293, + "grad_norm": 1.3120440244674683, + "learning_rate": 1.258473684986495e-05, + "loss": 1.973, + "mean_token_accuracy": 0.5793676376342773, + "num_tokens": 4352133476.0, + "step": 8513 + }, + { + "epoch": 2.302325581395349, + "grad_norm": 1.077232003211975, + "learning_rate": 1.2583184968172326e-05, + "loss": 1.941, + "mean_token_accuracy": 0.5592367053031921, + "num_tokens": 4352657747.0, + "step": 8514 + }, + { + "epoch": 2.3025959978366686, + "grad_norm": 0.9871693849563599, + "learning_rate": 1.2581633037902934e-05, + "loss": 1.8123, + "mean_token_accuracy": 0.5933355689048767, + "num_tokens": 4353119657.0, + "step": 8515 + }, + { + "epoch": 2.302866414277988, + "grad_norm": 1.459942102432251, + "learning_rate": 1.2580081059104392e-05, + "loss": 1.9752, + "mean_token_accuracy": 0.5651063919067383, + "num_tokens": 4353545961.0, + "step": 8516 + }, + { + "epoch": 2.303136830719308, + "grad_norm": 1.3313114643096924, + "learning_rate": 1.2578529031824313e-05, + "loss": 1.9, + "mean_token_accuracy": 0.5829946994781494, + "num_tokens": 4354011425.0, + "step": 8517 + }, + { + "epoch": 2.3034072471606275, + "grad_norm": 1.2042770385742188, + "learning_rate": 1.2576976956110323e-05, + "loss": 1.9126, + "mean_token_accuracy": 0.563578188419342, + "num_tokens": 4354535523.0, + "step": 8518 + }, + { + "epoch": 2.303677663601947, + "grad_norm": 1.1608240604400635, + "learning_rate": 1.257542483201004e-05, + "loss": 1.8626, + "mean_token_accuracy": 0.5664837956428528, + "num_tokens": 4355021115.0, + "step": 8519 + }, + { + "epoch": 2.303948080043267, + "grad_norm": 1.3351876735687256, + "learning_rate": 1.2573872659571092e-05, + "loss": 1.8179, + "mean_token_accuracy": 0.583541214466095, + "num_tokens": 4355516397.0, + "step": 8520 + }, + { + "epoch": 2.3042184964845864, + "grad_norm": 0.5189459323883057, + "learning_rate": 1.2572320438841102e-05, + "loss": 0.9241, + "mean_token_accuracy": 0.7426179647445679, + "num_tokens": 4356040679.0, + "step": 8521 + }, + { + "epoch": 2.304488912925906, + "grad_norm": 1.7189785242080688, + "learning_rate": 1.2570768169867698e-05, + "loss": 1.8938, + "mean_token_accuracy": 0.5635128021240234, + "num_tokens": 4356564900.0, + "step": 8522 + }, + { + "epoch": 2.3047593293672257, + "grad_norm": 1.722865343093872, + "learning_rate": 1.2569215852698504e-05, + "loss": 1.9655, + "mean_token_accuracy": 0.5720717310905457, + "num_tokens": 4357089138.0, + "step": 8523 + }, + { + "epoch": 2.305029745808545, + "grad_norm": 1.274735689163208, + "learning_rate": 1.2567663487381154e-05, + "loss": 2.0298, + "mean_token_accuracy": 0.5419092178344727, + "num_tokens": 4357612114.0, + "step": 8524 + }, + { + "epoch": 2.305300162249865, + "grad_norm": 1.3289976119995117, + "learning_rate": 1.2566111073963281e-05, + "loss": 1.8518, + "mean_token_accuracy": 0.5783430337905884, + "num_tokens": 4358136395.0, + "step": 8525 + }, + { + "epoch": 2.305570578691184, + "grad_norm": 1.3841758966445923, + "learning_rate": 1.2564558612492513e-05, + "loss": 1.8678, + "mean_token_accuracy": 0.5721424221992493, + "num_tokens": 4358660666.0, + "step": 8526 + }, + { + "epoch": 2.3058409951325043, + "grad_norm": 1.2976114749908447, + "learning_rate": 1.2563006103016484e-05, + "loss": 1.9075, + "mean_token_accuracy": 0.5699889063835144, + "num_tokens": 4359184829.0, + "step": 8527 + }, + { + "epoch": 2.3061114115738235, + "grad_norm": 1.2288126945495605, + "learning_rate": 1.2561453545582832e-05, + "loss": 1.8839, + "mean_token_accuracy": 0.5650854110717773, + "num_tokens": 4359709104.0, + "step": 8528 + }, + { + "epoch": 2.306381828015143, + "grad_norm": 1.1426661014556885, + "learning_rate": 1.2559900940239196e-05, + "loss": 1.8283, + "mean_token_accuracy": 0.5965868830680847, + "num_tokens": 4360233259.0, + "step": 8529 + }, + { + "epoch": 2.3066522444564628, + "grad_norm": 1.4256500005722046, + "learning_rate": 1.255834828703321e-05, + "loss": 2.0101, + "mean_token_accuracy": 0.5663471221923828, + "num_tokens": 4360757476.0, + "step": 8530 + }, + { + "epoch": 2.3069226608977824, + "grad_norm": 1.4486477375030518, + "learning_rate": 1.2556795586012514e-05, + "loss": 1.8372, + "mean_token_accuracy": 0.5729100704193115, + "num_tokens": 4361247256.0, + "step": 8531 + }, + { + "epoch": 2.307193077339102, + "grad_norm": 1.232548475265503, + "learning_rate": 1.2555242837224754e-05, + "loss": 1.8984, + "mean_token_accuracy": 0.5728137493133545, + "num_tokens": 4361771506.0, + "step": 8532 + }, + { + "epoch": 2.3074634937804217, + "grad_norm": 1.3007220029830933, + "learning_rate": 1.2553690040717567e-05, + "loss": 1.9426, + "mean_token_accuracy": 0.5675221085548401, + "num_tokens": 4362295696.0, + "step": 8533 + }, + { + "epoch": 2.3077339102217413, + "grad_norm": 1.3642090559005737, + "learning_rate": 1.2552137196538602e-05, + "loss": 1.9507, + "mean_token_accuracy": 0.5602450370788574, + "num_tokens": 4362819935.0, + "step": 8534 + }, + { + "epoch": 2.308004326663061, + "grad_norm": 1.1517424583435059, + "learning_rate": 1.2550584304735505e-05, + "loss": 1.927, + "mean_token_accuracy": 0.5676543116569519, + "num_tokens": 4363344067.0, + "step": 8535 + }, + { + "epoch": 2.3082747431043806, + "grad_norm": 1.364645004272461, + "learning_rate": 1.254903136535592e-05, + "loss": 1.8176, + "mean_token_accuracy": 0.5979639291763306, + "num_tokens": 4363818910.0, + "step": 8536 + }, + { + "epoch": 2.3085451595457003, + "grad_norm": 1.108853816986084, + "learning_rate": 1.2547478378447497e-05, + "loss": 1.805, + "mean_token_accuracy": 0.5684927701950073, + "num_tokens": 4364343070.0, + "step": 8537 + }, + { + "epoch": 2.30881557598702, + "grad_norm": 1.1143594980239868, + "learning_rate": 1.2545925344057888e-05, + "loss": 1.7687, + "mean_token_accuracy": 0.573819100856781, + "num_tokens": 4364846135.0, + "step": 8538 + }, + { + "epoch": 2.3090859924283396, + "grad_norm": 1.5795854330062866, + "learning_rate": 1.2544372262234743e-05, + "loss": 1.871, + "mean_token_accuracy": 0.5608250498771667, + "num_tokens": 4365370388.0, + "step": 8539 + }, + { + "epoch": 2.309356408869659, + "grad_norm": 1.2315703630447388, + "learning_rate": 1.2542819133025713e-05, + "loss": 1.8311, + "mean_token_accuracy": 0.5906287431716919, + "num_tokens": 4365894648.0, + "step": 8540 + }, + { + "epoch": 2.309626825310979, + "grad_norm": 0.6033897995948792, + "learning_rate": 1.2541265956478453e-05, + "loss": 1.1431, + "mean_token_accuracy": 0.6972895860671997, + "num_tokens": 4366418908.0, + "step": 8541 + }, + { + "epoch": 2.3098972417522985, + "grad_norm": 1.9369666576385498, + "learning_rate": 1.2539712732640625e-05, + "loss": 1.9108, + "mean_token_accuracy": 0.5840528011322021, + "num_tokens": 4366883929.0, + "step": 8542 + }, + { + "epoch": 2.310167658193618, + "grad_norm": 1.8939592838287354, + "learning_rate": 1.2538159461559882e-05, + "loss": 1.9615, + "mean_token_accuracy": 0.5567395687103271, + "num_tokens": 4367408173.0, + "step": 8543 + }, + { + "epoch": 2.3104380746349378, + "grad_norm": 1.1033012866973877, + "learning_rate": 1.2536606143283883e-05, + "loss": 1.9189, + "mean_token_accuracy": 0.5577670335769653, + "num_tokens": 4367932349.0, + "step": 8544 + }, + { + "epoch": 2.3107084910762574, + "grad_norm": 1.2058601379394531, + "learning_rate": 1.2535052777860288e-05, + "loss": 1.9935, + "mean_token_accuracy": 0.538835883140564, + "num_tokens": 4368456504.0, + "step": 8545 + }, + { + "epoch": 2.310978907517577, + "grad_norm": 1.615903377532959, + "learning_rate": 1.253349936533676e-05, + "loss": 1.8073, + "mean_token_accuracy": 0.5873883962631226, + "num_tokens": 4368920127.0, + "step": 8546 + }, + { + "epoch": 2.3112493239588967, + "grad_norm": 1.5598649978637695, + "learning_rate": 1.2531945905760961e-05, + "loss": 1.9903, + "mean_token_accuracy": 0.5553687810897827, + "num_tokens": 4369444383.0, + "step": 8547 + }, + { + "epoch": 2.3115197404002163, + "grad_norm": 1.194605827331543, + "learning_rate": 1.2530392399180556e-05, + "loss": 1.9469, + "mean_token_accuracy": 0.5576481223106384, + "num_tokens": 4369968632.0, + "step": 8548 + }, + { + "epoch": 2.311790156841536, + "grad_norm": 1.5631897449493408, + "learning_rate": 1.2528838845643214e-05, + "loss": 1.9119, + "mean_token_accuracy": 0.5689929723739624, + "num_tokens": 4370442218.0, + "step": 8549 + }, + { + "epoch": 2.3120605732828556, + "grad_norm": 1.4030342102050781, + "learning_rate": 1.2527285245196599e-05, + "loss": 1.8323, + "mean_token_accuracy": 0.5829449892044067, + "num_tokens": 4370911429.0, + "step": 8550 + }, + { + "epoch": 2.3123309897241753, + "grad_norm": 1.518567681312561, + "learning_rate": 1.252573159788838e-05, + "loss": 1.9746, + "mean_token_accuracy": 0.5390150547027588, + "num_tokens": 4371435691.0, + "step": 8551 + }, + { + "epoch": 2.312601406165495, + "grad_norm": 1.2159323692321777, + "learning_rate": 1.2524177903766233e-05, + "loss": 1.7853, + "mean_token_accuracy": 0.6015232801437378, + "num_tokens": 4371940611.0, + "step": 8552 + }, + { + "epoch": 2.3128718226068146, + "grad_norm": 1.3286566734313965, + "learning_rate": 1.252262416287782e-05, + "loss": 1.9879, + "mean_token_accuracy": 0.5494808554649353, + "num_tokens": 4372456990.0, + "step": 8553 + }, + { + "epoch": 2.313142239048134, + "grad_norm": 1.382524847984314, + "learning_rate": 1.2521070375270823e-05, + "loss": 1.9628, + "mean_token_accuracy": 0.5569620728492737, + "num_tokens": 4372908425.0, + "step": 8554 + }, + { + "epoch": 2.313412655489454, + "grad_norm": 1.3843061923980713, + "learning_rate": 1.2519516540992913e-05, + "loss": 1.5945, + "mean_token_accuracy": 0.5933643579483032, + "num_tokens": 4373432553.0, + "step": 8555 + }, + { + "epoch": 2.3136830719307735, + "grad_norm": 1.6951712369918823, + "learning_rate": 1.2517962660091767e-05, + "loss": 1.9362, + "mean_token_accuracy": 0.5433070063591003, + "num_tokens": 4373956800.0, + "step": 8556 + }, + { + "epoch": 2.313953488372093, + "grad_norm": 1.3093065023422241, + "learning_rate": 1.251640873261506e-05, + "loss": 1.8591, + "mean_token_accuracy": 0.5698330402374268, + "num_tokens": 4374433118.0, + "step": 8557 + }, + { + "epoch": 2.314223904813413, + "grad_norm": 1.2559908628463745, + "learning_rate": 1.251485475861048e-05, + "loss": 1.9244, + "mean_token_accuracy": 0.5546725988388062, + "num_tokens": 4374947328.0, + "step": 8558 + }, + { + "epoch": 2.3144943212547324, + "grad_norm": 1.3263083696365356, + "learning_rate": 1.2513300738125695e-05, + "loss": 1.8592, + "mean_token_accuracy": 0.5550409555435181, + "num_tokens": 4375471473.0, + "step": 8559 + }, + { + "epoch": 2.314764737696052, + "grad_norm": 1.3482015132904053, + "learning_rate": 1.2511746671208394e-05, + "loss": 2.0109, + "mean_token_accuracy": 0.5494369268417358, + "num_tokens": 4375995675.0, + "step": 8560 + }, + { + "epoch": 2.3150351541373717, + "grad_norm": 0.5316922068595886, + "learning_rate": 1.2510192557906263e-05, + "loss": 1.103, + "mean_token_accuracy": 0.7061624526977539, + "num_tokens": 4376519917.0, + "step": 8561 + }, + { + "epoch": 2.3153055705786914, + "grad_norm": 1.9963650703430176, + "learning_rate": 1.2508638398266979e-05, + "loss": 1.95, + "mean_token_accuracy": 0.5780616998672485, + "num_tokens": 4377044052.0, + "step": 8562 + }, + { + "epoch": 2.315575987020011, + "grad_norm": 1.8252145051956177, + "learning_rate": 1.2507084192338233e-05, + "loss": 2.0027, + "mean_token_accuracy": 0.5501686334609985, + "num_tokens": 4377568255.0, + "step": 8563 + }, + { + "epoch": 2.3158464034613306, + "grad_norm": 1.3780120611190796, + "learning_rate": 1.2505529940167713e-05, + "loss": 1.8879, + "mean_token_accuracy": 0.5317123532295227, + "num_tokens": 4378092517.0, + "step": 8564 + }, + { + "epoch": 2.31611681990265, + "grad_norm": 1.2401573657989502, + "learning_rate": 1.2503975641803108e-05, + "loss": 1.8619, + "mean_token_accuracy": 0.559077799320221, + "num_tokens": 4378616759.0, + "step": 8565 + }, + { + "epoch": 2.31638723634397, + "grad_norm": 1.5613746643066406, + "learning_rate": 1.2502421297292107e-05, + "loss": 1.873, + "mean_token_accuracy": 0.5741708874702454, + "num_tokens": 4379140914.0, + "step": 8566 + }, + { + "epoch": 2.316657652785289, + "grad_norm": 1.603432536125183, + "learning_rate": 1.2500866906682401e-05, + "loss": 1.8607, + "mean_token_accuracy": 0.5798434019088745, + "num_tokens": 4379665165.0, + "step": 8567 + }, + { + "epoch": 2.316928069226609, + "grad_norm": 1.5864124298095703, + "learning_rate": 1.2499312470021685e-05, + "loss": 1.9091, + "mean_token_accuracy": 0.5627112984657288, + "num_tokens": 4380189276.0, + "step": 8568 + }, + { + "epoch": 2.3171984856679284, + "grad_norm": 1.4676488637924194, + "learning_rate": 1.2497757987357653e-05, + "loss": 1.8537, + "mean_token_accuracy": 0.5676624774932861, + "num_tokens": 4380713500.0, + "step": 8569 + }, + { + "epoch": 2.317468902109248, + "grad_norm": 1.5404040813446045, + "learning_rate": 1.2496203458738002e-05, + "loss": 1.8782, + "mean_token_accuracy": 0.5698052644729614, + "num_tokens": 4381237742.0, + "step": 8570 + }, + { + "epoch": 2.3177393185505677, + "grad_norm": 1.3217048645019531, + "learning_rate": 1.2494648884210428e-05, + "loss": 1.9736, + "mean_token_accuracy": 0.5622718334197998, + "num_tokens": 4381761886.0, + "step": 8571 + }, + { + "epoch": 2.3180097349918873, + "grad_norm": 1.2056660652160645, + "learning_rate": 1.2493094263822632e-05, + "loss": 1.9283, + "mean_token_accuracy": 0.5538609623908997, + "num_tokens": 4382286147.0, + "step": 8572 + }, + { + "epoch": 2.318280151433207, + "grad_norm": 1.6023685932159424, + "learning_rate": 1.2491539597622313e-05, + "loss": 1.9715, + "mean_token_accuracy": 0.5718896389007568, + "num_tokens": 4382748439.0, + "step": 8573 + }, + { + "epoch": 2.3185505678745266, + "grad_norm": 1.3434590101242065, + "learning_rate": 1.2489984885657172e-05, + "loss": 1.9168, + "mean_token_accuracy": 0.5632265210151672, + "num_tokens": 4383272634.0, + "step": 8574 + }, + { + "epoch": 2.3188209843158463, + "grad_norm": 1.4127146005630493, + "learning_rate": 1.2488430127974916e-05, + "loss": 1.9004, + "mean_token_accuracy": 0.5774121284484863, + "num_tokens": 4383796814.0, + "step": 8575 + }, + { + "epoch": 2.319091400757166, + "grad_norm": 1.198857069015503, + "learning_rate": 1.248687532462324e-05, + "loss": 1.9031, + "mean_token_accuracy": 0.5646055936813354, + "num_tokens": 4384320983.0, + "step": 8576 + }, + { + "epoch": 2.3193618171984856, + "grad_norm": 1.1573618650436401, + "learning_rate": 1.2485320475649858e-05, + "loss": 1.9149, + "mean_token_accuracy": 0.5688667893409729, + "num_tokens": 4384834943.0, + "step": 8577 + }, + { + "epoch": 2.319632233639805, + "grad_norm": 1.2167491912841797, + "learning_rate": 1.2483765581102478e-05, + "loss": 1.9386, + "mean_token_accuracy": 0.5614545345306396, + "num_tokens": 4385322329.0, + "step": 8578 + }, + { + "epoch": 2.319902650081125, + "grad_norm": 1.2065900564193726, + "learning_rate": 1.248221064102881e-05, + "loss": 1.9526, + "mean_token_accuracy": 0.5587664246559143, + "num_tokens": 4385846595.0, + "step": 8579 + }, + { + "epoch": 2.3201730665224445, + "grad_norm": 1.1554346084594727, + "learning_rate": 1.2480655655476554e-05, + "loss": 1.9082, + "mean_token_accuracy": 0.564630389213562, + "num_tokens": 4386370876.0, + "step": 8580 + }, + { + "epoch": 2.320443482963764, + "grad_norm": 0.6407864689826965, + "learning_rate": 1.2479100624493435e-05, + "loss": 1.1262, + "mean_token_accuracy": 0.7014826536178589, + "num_tokens": 4386895067.0, + "step": 8581 + }, + { + "epoch": 2.3207138994050838, + "grad_norm": 1.4406572580337524, + "learning_rate": 1.2477545548127155e-05, + "loss": 1.9321, + "mean_token_accuracy": 0.5498764514923096, + "num_tokens": 4387419254.0, + "step": 8582 + }, + { + "epoch": 2.3209843158464034, + "grad_norm": 1.2588578462600708, + "learning_rate": 1.2475990426425429e-05, + "loss": 1.9623, + "mean_token_accuracy": 0.5393825769424438, + "num_tokens": 4387943450.0, + "step": 8583 + }, + { + "epoch": 2.321254732287723, + "grad_norm": 1.087677240371704, + "learning_rate": 1.247443525943598e-05, + "loss": 1.9972, + "mean_token_accuracy": 0.5476899147033691, + "num_tokens": 4388467665.0, + "step": 8584 + }, + { + "epoch": 2.3215251487290427, + "grad_norm": 1.4711791276931763, + "learning_rate": 1.247288004720652e-05, + "loss": 1.9364, + "mean_token_accuracy": 0.548905074596405, + "num_tokens": 4388991834.0, + "step": 8585 + }, + { + "epoch": 2.3217955651703623, + "grad_norm": 1.0831749439239502, + "learning_rate": 1.2471324789784771e-05, + "loss": 1.8745, + "mean_token_accuracy": 0.5642117261886597, + "num_tokens": 4389490991.0, + "step": 8586 + }, + { + "epoch": 2.322065981611682, + "grad_norm": 1.1919296979904175, + "learning_rate": 1.2469769487218448e-05, + "loss": 2.057, + "mean_token_accuracy": 0.5588226318359375, + "num_tokens": 4389958579.0, + "step": 8587 + }, + { + "epoch": 2.3223363980530016, + "grad_norm": 1.054142713546753, + "learning_rate": 1.2468214139555277e-05, + "loss": 1.8259, + "mean_token_accuracy": 0.5763020515441895, + "num_tokens": 4390482715.0, + "step": 8588 + }, + { + "epoch": 2.3226068144943213, + "grad_norm": 1.1018812656402588, + "learning_rate": 1.2466658746842979e-05, + "loss": 1.9566, + "mean_token_accuracy": 0.5667819976806641, + "num_tokens": 4391006886.0, + "step": 8589 + }, + { + "epoch": 2.322877230935641, + "grad_norm": 1.137210726737976, + "learning_rate": 1.2465103309129275e-05, + "loss": 1.8156, + "mean_token_accuracy": 0.5877938270568848, + "num_tokens": 4391511749.0, + "step": 8590 + }, + { + "epoch": 2.3231476473769606, + "grad_norm": 1.2938485145568848, + "learning_rate": 1.2463547826461892e-05, + "loss": 1.9912, + "mean_token_accuracy": 0.535615086555481, + "num_tokens": 4391980610.0, + "step": 8591 + }, + { + "epoch": 2.32341806381828, + "grad_norm": 1.063415765762329, + "learning_rate": 1.2461992298888559e-05, + "loss": 1.8659, + "mean_token_accuracy": 0.5584559440612793, + "num_tokens": 4392504870.0, + "step": 8592 + }, + { + "epoch": 2.3236884802596, + "grad_norm": 1.2277470827102661, + "learning_rate": 1.2460436726457003e-05, + "loss": 1.9256, + "mean_token_accuracy": 0.5822427272796631, + "num_tokens": 4392999093.0, + "step": 8593 + }, + { + "epoch": 2.3239588967009195, + "grad_norm": 1.2271301746368408, + "learning_rate": 1.2458881109214953e-05, + "loss": 1.9142, + "mean_token_accuracy": 0.5665543675422668, + "num_tokens": 4393523333.0, + "step": 8594 + }, + { + "epoch": 2.324229313142239, + "grad_norm": 1.4363090991973877, + "learning_rate": 1.2457325447210145e-05, + "loss": 1.8973, + "mean_token_accuracy": 0.5705409646034241, + "num_tokens": 4394030914.0, + "step": 8595 + }, + { + "epoch": 2.3244997295835588, + "grad_norm": 1.3335694074630737, + "learning_rate": 1.2455769740490303e-05, + "loss": 1.8559, + "mean_token_accuracy": 0.5802698135375977, + "num_tokens": 4394514813.0, + "step": 8596 + }, + { + "epoch": 2.3247701460248784, + "grad_norm": 1.1152312755584717, + "learning_rate": 1.2454213989103162e-05, + "loss": 1.9671, + "mean_token_accuracy": 0.5509165525436401, + "num_tokens": 4394982145.0, + "step": 8597 + }, + { + "epoch": 2.325040562466198, + "grad_norm": 1.0811468362808228, + "learning_rate": 1.2452658193096461e-05, + "loss": 1.8657, + "mean_token_accuracy": 0.5725224018096924, + "num_tokens": 4395445107.0, + "step": 8598 + }, + { + "epoch": 2.3253109789075177, + "grad_norm": 1.1404205560684204, + "learning_rate": 1.2451102352517934e-05, + "loss": 1.9866, + "mean_token_accuracy": 0.5390576720237732, + "num_tokens": 4395923363.0, + "step": 8599 + }, + { + "epoch": 2.3255813953488373, + "grad_norm": 1.069000482559204, + "learning_rate": 1.244954646741532e-05, + "loss": 1.8014, + "mean_token_accuracy": 0.5990643501281738, + "num_tokens": 4396343560.0, + "step": 8600 + }, + { + "epoch": 2.325851811790157, + "grad_norm": 0.5552605986595154, + "learning_rate": 1.2447990537836358e-05, + "loss": 1.0724, + "mean_token_accuracy": 0.7155433893203735, + "num_tokens": 4396867537.0, + "step": 8601 + }, + { + "epoch": 2.3261222282314766, + "grad_norm": 1.3541908264160156, + "learning_rate": 1.2446434563828787e-05, + "loss": 1.8678, + "mean_token_accuracy": 0.5767920017242432, + "num_tokens": 4397391742.0, + "step": 8602 + }, + { + "epoch": 2.3263926446727963, + "grad_norm": 1.2579368352890015, + "learning_rate": 1.2444878545440351e-05, + "loss": 1.9374, + "mean_token_accuracy": 0.5654878616333008, + "num_tokens": 4397916009.0, + "step": 8603 + }, + { + "epoch": 2.326663061114116, + "grad_norm": 1.1936458349227905, + "learning_rate": 1.2443322482718791e-05, + "loss": 1.9979, + "mean_token_accuracy": 0.5385113954544067, + "num_tokens": 4398440231.0, + "step": 8604 + }, + { + "epoch": 2.3269334775554356, + "grad_norm": 1.22805917263031, + "learning_rate": 1.2441766375711854e-05, + "loss": 1.9047, + "mean_token_accuracy": 0.5381437540054321, + "num_tokens": 4398964319.0, + "step": 8605 + }, + { + "epoch": 2.3272038939967548, + "grad_norm": 1.5261542797088623, + "learning_rate": 1.2440210224467284e-05, + "loss": 1.8742, + "mean_token_accuracy": 0.5752710103988647, + "num_tokens": 4399392669.0, + "step": 8606 + }, + { + "epoch": 2.327474310438075, + "grad_norm": 1.4213862419128418, + "learning_rate": 1.2438654029032828e-05, + "loss": 1.9007, + "mean_token_accuracy": 0.5567673444747925, + "num_tokens": 4399916949.0, + "step": 8607 + }, + { + "epoch": 2.327744726879394, + "grad_norm": 1.3191159963607788, + "learning_rate": 1.2437097789456238e-05, + "loss": 1.7777, + "mean_token_accuracy": 0.5987226963043213, + "num_tokens": 4400441217.0, + "step": 8608 + }, + { + "epoch": 2.328015143320714, + "grad_norm": 1.2318342924118042, + "learning_rate": 1.243554150578526e-05, + "loss": 2.021, + "mean_token_accuracy": 0.5430859327316284, + "num_tokens": 4400965459.0, + "step": 8609 + }, + { + "epoch": 2.3282855597620333, + "grad_norm": 1.4732643365859985, + "learning_rate": 1.2433985178067648e-05, + "loss": 1.8176, + "mean_token_accuracy": 0.5742844343185425, + "num_tokens": 4401489659.0, + "step": 8610 + }, + { + "epoch": 2.328555976203353, + "grad_norm": 1.755447506904602, + "learning_rate": 1.243242880635115e-05, + "loss": 1.8847, + "mean_token_accuracy": 0.5917836427688599, + "num_tokens": 4401910919.0, + "step": 8611 + }, + { + "epoch": 2.3288263926446726, + "grad_norm": 1.4601727724075317, + "learning_rate": 1.2430872390683528e-05, + "loss": 1.9265, + "mean_token_accuracy": 0.5547927618026733, + "num_tokens": 4402435191.0, + "step": 8612 + }, + { + "epoch": 2.3290968090859923, + "grad_norm": 1.2156322002410889, + "learning_rate": 1.2429315931112528e-05, + "loss": 1.891, + "mean_token_accuracy": 0.5582960844039917, + "num_tokens": 4402959467.0, + "step": 8613 + }, + { + "epoch": 2.329367225527312, + "grad_norm": 1.1994187831878662, + "learning_rate": 1.242775942768592e-05, + "loss": 1.969, + "mean_token_accuracy": 0.5669720768928528, + "num_tokens": 4403434260.0, + "step": 8614 + }, + { + "epoch": 2.3296376419686315, + "grad_norm": 1.4296016693115234, + "learning_rate": 1.242620288045145e-05, + "loss": 1.8014, + "mean_token_accuracy": 0.5762932896614075, + "num_tokens": 4403958452.0, + "step": 8615 + }, + { + "epoch": 2.329908058409951, + "grad_norm": 1.183172583580017, + "learning_rate": 1.242464628945688e-05, + "loss": 1.8341, + "mean_token_accuracy": 0.5905159711837769, + "num_tokens": 4404482665.0, + "step": 8616 + }, + { + "epoch": 2.330178474851271, + "grad_norm": 1.188549518585205, + "learning_rate": 1.2423089654749971e-05, + "loss": 1.953, + "mean_token_accuracy": 0.5590909719467163, + "num_tokens": 4405006921.0, + "step": 8617 + }, + { + "epoch": 2.3304488912925905, + "grad_norm": 1.5780740976333618, + "learning_rate": 1.242153297637849e-05, + "loss": 1.8076, + "mean_token_accuracy": 0.5990058183670044, + "num_tokens": 4405531181.0, + "step": 8618 + }, + { + "epoch": 2.33071930773391, + "grad_norm": 1.3524119853973389, + "learning_rate": 1.2419976254390193e-05, + "loss": 1.9093, + "mean_token_accuracy": 0.5444142818450928, + "num_tokens": 4406055389.0, + "step": 8619 + }, + { + "epoch": 2.3309897241752298, + "grad_norm": 1.3135974407196045, + "learning_rate": 1.2418419488832851e-05, + "loss": 1.8585, + "mean_token_accuracy": 0.5594010353088379, + "num_tokens": 4406547202.0, + "step": 8620 + }, + { + "epoch": 2.3312601406165494, + "grad_norm": 0.41338181495666504, + "learning_rate": 1.2416862679754229e-05, + "loss": 1.0437, + "mean_token_accuracy": 0.7231453657150269, + "num_tokens": 4407033753.0, + "step": 8621 + }, + { + "epoch": 2.331530557057869, + "grad_norm": 1.723349928855896, + "learning_rate": 1.2415305827202093e-05, + "loss": 1.909, + "mean_token_accuracy": 0.5741008520126343, + "num_tokens": 4407501765.0, + "step": 8622 + }, + { + "epoch": 2.3318009734991887, + "grad_norm": 1.7898024320602417, + "learning_rate": 1.241374893122421e-05, + "loss": 2.0426, + "mean_token_accuracy": 0.5355842113494873, + "num_tokens": 4408026022.0, + "step": 8623 + }, + { + "epoch": 2.3320713899405083, + "grad_norm": 1.4562934637069702, + "learning_rate": 1.2412191991868354e-05, + "loss": 1.935, + "mean_token_accuracy": 0.5543055534362793, + "num_tokens": 4408496318.0, + "step": 8624 + }, + { + "epoch": 2.332341806381828, + "grad_norm": 1.0897040367126465, + "learning_rate": 1.2410635009182295e-05, + "loss": 1.8378, + "mean_token_accuracy": 0.5706522464752197, + "num_tokens": 4409020571.0, + "step": 8625 + }, + { + "epoch": 2.3326122228231476, + "grad_norm": 1.3869670629501343, + "learning_rate": 1.2409077983213802e-05, + "loss": 1.9149, + "mean_token_accuracy": 0.5669363737106323, + "num_tokens": 4409544752.0, + "step": 8626 + }, + { + "epoch": 2.3328826392644673, + "grad_norm": 1.2365363836288452, + "learning_rate": 1.2407520914010655e-05, + "loss": 1.8983, + "mean_token_accuracy": 0.5709927082061768, + "num_tokens": 4410054596.0, + "step": 8627 + }, + { + "epoch": 2.333153055705787, + "grad_norm": 1.177682638168335, + "learning_rate": 1.240596380162063e-05, + "loss": 1.8922, + "mean_token_accuracy": 0.5611857771873474, + "num_tokens": 4410578857.0, + "step": 8628 + }, + { + "epoch": 2.3334234721471065, + "grad_norm": 1.358277678489685, + "learning_rate": 1.2404406646091496e-05, + "loss": 1.9646, + "mean_token_accuracy": 0.5626789331436157, + "num_tokens": 4411077049.0, + "step": 8629 + }, + { + "epoch": 2.333693888588426, + "grad_norm": 1.0660736560821533, + "learning_rate": 1.2402849447471036e-05, + "loss": 1.7948, + "mean_token_accuracy": 0.5703203082084656, + "num_tokens": 4411560459.0, + "step": 8630 + }, + { + "epoch": 2.333964305029746, + "grad_norm": 1.3523283004760742, + "learning_rate": 1.2401292205807037e-05, + "loss": 1.9683, + "mean_token_accuracy": 0.5578846335411072, + "num_tokens": 4412084720.0, + "step": 8631 + }, + { + "epoch": 2.3342347214710655, + "grad_norm": 1.4497082233428955, + "learning_rate": 1.2399734921147263e-05, + "loss": 1.8979, + "mean_token_accuracy": 0.5576179027557373, + "num_tokens": 4412608993.0, + "step": 8632 + }, + { + "epoch": 2.334505137912385, + "grad_norm": 1.037351369857788, + "learning_rate": 1.2398177593539508e-05, + "loss": 1.8551, + "mean_token_accuracy": 0.5733404159545898, + "num_tokens": 4413114545.0, + "step": 8633 + }, + { + "epoch": 2.3347755543537048, + "grad_norm": 1.3423511981964111, + "learning_rate": 1.2396620223031555e-05, + "loss": 1.9934, + "mean_token_accuracy": 0.5470250844955444, + "num_tokens": 4413638771.0, + "step": 8634 + }, + { + "epoch": 2.3350459707950244, + "grad_norm": 1.3059360980987549, + "learning_rate": 1.2395062809671185e-05, + "loss": 1.912, + "mean_token_accuracy": 0.5501936674118042, + "num_tokens": 4414163051.0, + "step": 8635 + }, + { + "epoch": 2.335316387236344, + "grad_norm": 1.0571684837341309, + "learning_rate": 1.2393505353506186e-05, + "loss": 1.9466, + "mean_token_accuracy": 0.5734700560569763, + "num_tokens": 4414687311.0, + "step": 8636 + }, + { + "epoch": 2.3355868036776637, + "grad_norm": 1.076967477798462, + "learning_rate": 1.2391947854584345e-05, + "loss": 1.7959, + "mean_token_accuracy": 0.5867669582366943, + "num_tokens": 4415205413.0, + "step": 8637 + }, + { + "epoch": 2.3358572201189833, + "grad_norm": 1.217105507850647, + "learning_rate": 1.2390390312953451e-05, + "loss": 1.8859, + "mean_token_accuracy": 0.5519192218780518, + "num_tokens": 4415729674.0, + "step": 8638 + }, + { + "epoch": 2.336127636560303, + "grad_norm": 1.0563035011291504, + "learning_rate": 1.2388832728661289e-05, + "loss": 1.8621, + "mean_token_accuracy": 0.5767653584480286, + "num_tokens": 4416253933.0, + "step": 8639 + }, + { + "epoch": 2.3363980530016226, + "grad_norm": 1.1665250062942505, + "learning_rate": 1.2387275101755658e-05, + "loss": 1.7479, + "mean_token_accuracy": 0.6203376054763794, + "num_tokens": 4416778107.0, + "step": 8640 + }, + { + "epoch": 2.3366684694429423, + "grad_norm": 0.47640666365623474, + "learning_rate": 1.2385717432284346e-05, + "loss": 1.203, + "mean_token_accuracy": 0.6916290521621704, + "num_tokens": 4417206581.0, + "step": 8641 + }, + { + "epoch": 2.336938885884262, + "grad_norm": 1.4651144742965698, + "learning_rate": 1.2384159720295148e-05, + "loss": 1.8764, + "mean_token_accuracy": 0.5635972023010254, + "num_tokens": 4417730703.0, + "step": 8642 + }, + { + "epoch": 2.3372093023255816, + "grad_norm": 1.193850040435791, + "learning_rate": 1.238260196583586e-05, + "loss": 1.878, + "mean_token_accuracy": 0.5693392157554626, + "num_tokens": 4418201559.0, + "step": 8643 + }, + { + "epoch": 2.337479718766901, + "grad_norm": 1.34098219871521, + "learning_rate": 1.238104416895428e-05, + "loss": 1.9628, + "mean_token_accuracy": 0.5459651350975037, + "num_tokens": 4418725571.0, + "step": 8644 + }, + { + "epoch": 2.337750135208221, + "grad_norm": 0.9907608032226562, + "learning_rate": 1.23794863296982e-05, + "loss": 1.8227, + "mean_token_accuracy": 0.5866773128509521, + "num_tokens": 4419249827.0, + "step": 8645 + }, + { + "epoch": 2.3380205516495405, + "grad_norm": 1.0726550817489624, + "learning_rate": 1.2377928448115424e-05, + "loss": 1.8761, + "mean_token_accuracy": 0.5757648348808289, + "num_tokens": 4419774050.0, + "step": 8646 + }, + { + "epoch": 2.3382909680908597, + "grad_norm": 1.1550034284591675, + "learning_rate": 1.2376370524253752e-05, + "loss": 1.917, + "mean_token_accuracy": 0.5699527859687805, + "num_tokens": 4420298289.0, + "step": 8647 + }, + { + "epoch": 2.3385613845321798, + "grad_norm": 1.0620673894882202, + "learning_rate": 1.2374812558160982e-05, + "loss": 1.8427, + "mean_token_accuracy": 0.5720833539962769, + "num_tokens": 4420822351.0, + "step": 8648 + }, + { + "epoch": 2.338831800973499, + "grad_norm": 1.0014050006866455, + "learning_rate": 1.237325454988492e-05, + "loss": 1.903, + "mean_token_accuracy": 0.5611337423324585, + "num_tokens": 4421346581.0, + "step": 8649 + }, + { + "epoch": 2.339102217414819, + "grad_norm": 1.2596805095672607, + "learning_rate": 1.2371696499473373e-05, + "loss": 1.7977, + "mean_token_accuracy": 0.589836835861206, + "num_tokens": 4421870689.0, + "step": 8650 + }, + { + "epoch": 2.3393726338561383, + "grad_norm": 1.272337794303894, + "learning_rate": 1.2370138406974141e-05, + "loss": 1.9442, + "mean_token_accuracy": 0.5647163987159729, + "num_tokens": 4422394909.0, + "step": 8651 + }, + { + "epoch": 2.339643050297458, + "grad_norm": 1.1681541204452515, + "learning_rate": 1.2368580272435033e-05, + "loss": 1.9384, + "mean_token_accuracy": 0.5580642223358154, + "num_tokens": 4422919010.0, + "step": 8652 + }, + { + "epoch": 2.3399134667387775, + "grad_norm": 1.13765287399292, + "learning_rate": 1.2367022095903858e-05, + "loss": 1.9113, + "mean_token_accuracy": 0.5636236667633057, + "num_tokens": 4423443184.0, + "step": 8653 + }, + { + "epoch": 2.340183883180097, + "grad_norm": 1.1059181690216064, + "learning_rate": 1.2365463877428426e-05, + "loss": 1.9481, + "mean_token_accuracy": 0.5610679984092712, + "num_tokens": 4423967331.0, + "step": 8654 + }, + { + "epoch": 2.340454299621417, + "grad_norm": 1.1966112852096558, + "learning_rate": 1.2363905617056544e-05, + "loss": 1.8654, + "mean_token_accuracy": 0.548914909362793, + "num_tokens": 4424491545.0, + "step": 8655 + }, + { + "epoch": 2.3407247160627365, + "grad_norm": 1.2636138200759888, + "learning_rate": 1.236234731483603e-05, + "loss": 2.0042, + "mean_token_accuracy": 0.5606868267059326, + "num_tokens": 4425011427.0, + "step": 8656 + }, + { + "epoch": 2.340995132504056, + "grad_norm": 0.9414071440696716, + "learning_rate": 1.2360788970814694e-05, + "loss": 1.8768, + "mean_token_accuracy": 0.5678679943084717, + "num_tokens": 4425535639.0, + "step": 8657 + }, + { + "epoch": 2.3412655489453758, + "grad_norm": 1.1727036237716675, + "learning_rate": 1.235923058504035e-05, + "loss": 1.8596, + "mean_token_accuracy": 0.5884008407592773, + "num_tokens": 4426059850.0, + "step": 8658 + }, + { + "epoch": 2.3415359653866954, + "grad_norm": 1.329912543296814, + "learning_rate": 1.2357672157560812e-05, + "loss": 1.9766, + "mean_token_accuracy": 0.5623307824134827, + "num_tokens": 4426584099.0, + "step": 8659 + }, + { + "epoch": 2.341806381828015, + "grad_norm": 1.4599025249481201, + "learning_rate": 1.2356113688423903e-05, + "loss": 1.6514, + "mean_token_accuracy": 0.6256796717643738, + "num_tokens": 4427072475.0, + "step": 8660 + }, + { + "epoch": 2.3420767982693347, + "grad_norm": 0.46686843037605286, + "learning_rate": 1.2354555177677431e-05, + "loss": 1.135, + "mean_token_accuracy": 0.6958792209625244, + "num_tokens": 4427596574.0, + "step": 8661 + }, + { + "epoch": 2.3423472147106543, + "grad_norm": 1.9148459434509277, + "learning_rate": 1.2352996625369232e-05, + "loss": 1.9837, + "mean_token_accuracy": 0.5506454706192017, + "num_tokens": 4428120767.0, + "step": 8662 + }, + { + "epoch": 2.342617631151974, + "grad_norm": 1.5329523086547852, + "learning_rate": 1.2351438031547111e-05, + "loss": 1.8166, + "mean_token_accuracy": 0.5681432485580444, + "num_tokens": 4428637683.0, + "step": 8663 + }, + { + "epoch": 2.3428880475932936, + "grad_norm": 1.283987045288086, + "learning_rate": 1.2349879396258902e-05, + "loss": 1.7936, + "mean_token_accuracy": 0.5917424559593201, + "num_tokens": 4429161809.0, + "step": 8664 + }, + { + "epoch": 2.3431584640346133, + "grad_norm": 1.5328314304351807, + "learning_rate": 1.2348320719552418e-05, + "loss": 2.0664, + "mean_token_accuracy": 0.5269311666488647, + "num_tokens": 4429685987.0, + "step": 8665 + }, + { + "epoch": 2.343428880475933, + "grad_norm": 1.6347756385803223, + "learning_rate": 1.2346762001475492e-05, + "loss": 1.9592, + "mean_token_accuracy": 0.5423270463943481, + "num_tokens": 4430210242.0, + "step": 8666 + }, + { + "epoch": 2.3436992969172525, + "grad_norm": 1.4609946012496948, + "learning_rate": 1.234520324207595e-05, + "loss": 2.0026, + "mean_token_accuracy": 0.5489234328269958, + "num_tokens": 4430734366.0, + "step": 8667 + }, + { + "epoch": 2.343969713358572, + "grad_norm": 1.253736972808838, + "learning_rate": 1.234364444140161e-05, + "loss": 1.863, + "mean_token_accuracy": 0.5638706088066101, + "num_tokens": 4431253270.0, + "step": 8668 + }, + { + "epoch": 2.344240129799892, + "grad_norm": 1.1071738004684448, + "learning_rate": 1.234208559950031e-05, + "loss": 1.8113, + "mean_token_accuracy": 0.5788819789886475, + "num_tokens": 4431741580.0, + "step": 8669 + }, + { + "epoch": 2.3445105462412115, + "grad_norm": 1.2053136825561523, + "learning_rate": 1.234052671641988e-05, + "loss": 2.0036, + "mean_token_accuracy": 0.5460487604141235, + "num_tokens": 4432265799.0, + "step": 8670 + }, + { + "epoch": 2.344780962682531, + "grad_norm": 1.3003720045089722, + "learning_rate": 1.2338967792208148e-05, + "loss": 1.9217, + "mean_token_accuracy": 0.5724990367889404, + "num_tokens": 4432738633.0, + "step": 8671 + }, + { + "epoch": 2.3450513791238508, + "grad_norm": 1.2493841648101807, + "learning_rate": 1.2337408826912942e-05, + "loss": 1.8442, + "mean_token_accuracy": 0.5736809968948364, + "num_tokens": 4433209166.0, + "step": 8672 + }, + { + "epoch": 2.3453217955651704, + "grad_norm": 1.090097188949585, + "learning_rate": 1.2335849820582104e-05, + "loss": 1.8837, + "mean_token_accuracy": 0.5734443664550781, + "num_tokens": 4433727782.0, + "step": 8673 + }, + { + "epoch": 2.34559221200649, + "grad_norm": 1.030584692955017, + "learning_rate": 1.2334290773263465e-05, + "loss": 1.9095, + "mean_token_accuracy": 0.5714395642280579, + "num_tokens": 4434251994.0, + "step": 8674 + }, + { + "epoch": 2.3458626284478097, + "grad_norm": 1.047127366065979, + "learning_rate": 1.233273168500486e-05, + "loss": 1.8245, + "mean_token_accuracy": 0.5782829523086548, + "num_tokens": 4434776274.0, + "step": 8675 + }, + { + "epoch": 2.3461330448891293, + "grad_norm": 1.2218550443649292, + "learning_rate": 1.2331172555854123e-05, + "loss": 1.951, + "mean_token_accuracy": 0.5697160959243774, + "num_tokens": 4435298599.0, + "step": 8676 + }, + { + "epoch": 2.346403461330449, + "grad_norm": 0.9525225758552551, + "learning_rate": 1.2329613385859103e-05, + "loss": 1.8692, + "mean_token_accuracy": 0.5732260942459106, + "num_tokens": 4435804947.0, + "step": 8677 + }, + { + "epoch": 2.3466738777717686, + "grad_norm": 1.259932041168213, + "learning_rate": 1.2328054175067631e-05, + "loss": 1.9898, + "mean_token_accuracy": 0.5435836911201477, + "num_tokens": 4436329036.0, + "step": 8678 + }, + { + "epoch": 2.3469442942130883, + "grad_norm": 1.5896035432815552, + "learning_rate": 1.2326494923527556e-05, + "loss": 2.0141, + "mean_token_accuracy": 0.5551573634147644, + "num_tokens": 4436853301.0, + "step": 8679 + }, + { + "epoch": 2.347214710654408, + "grad_norm": 1.137819528579712, + "learning_rate": 1.2324935631286712e-05, + "loss": 1.7749, + "mean_token_accuracy": 0.5800850987434387, + "num_tokens": 4437377583.0, + "step": 8680 + }, + { + "epoch": 2.3474851270957275, + "grad_norm": 0.5307381749153137, + "learning_rate": 1.2323376298392949e-05, + "loss": 1.1445, + "mean_token_accuracy": 0.6853810548782349, + "num_tokens": 4437901815.0, + "step": 8681 + }, + { + "epoch": 2.347755543537047, + "grad_norm": 2.340420722961426, + "learning_rate": 1.2321816924894106e-05, + "loss": 2.0251, + "mean_token_accuracy": 0.5536024570465088, + "num_tokens": 4438426080.0, + "step": 8682 + }, + { + "epoch": 2.348025959978367, + "grad_norm": 1.7965799570083618, + "learning_rate": 1.2320257510838034e-05, + "loss": 1.9298, + "mean_token_accuracy": 0.5565857291221619, + "num_tokens": 4438950354.0, + "step": 8683 + }, + { + "epoch": 2.3482963764196865, + "grad_norm": 1.3394771814346313, + "learning_rate": 1.231869805627258e-05, + "loss": 1.8914, + "mean_token_accuracy": 0.5448038578033447, + "num_tokens": 4439474594.0, + "step": 8684 + }, + { + "epoch": 2.348566792861006, + "grad_norm": 1.5473525524139404, + "learning_rate": 1.231713856124559e-05, + "loss": 1.9179, + "mean_token_accuracy": 0.5414350628852844, + "num_tokens": 4439998664.0, + "step": 8685 + }, + { + "epoch": 2.3488372093023258, + "grad_norm": 2.185478448867798, + "learning_rate": 1.2315579025804915e-05, + "loss": 1.8639, + "mean_token_accuracy": 0.5726944208145142, + "num_tokens": 4440522857.0, + "step": 8686 + }, + { + "epoch": 2.3491076257436454, + "grad_norm": 1.5628924369812012, + "learning_rate": 1.231401944999841e-05, + "loss": 1.9551, + "mean_token_accuracy": 0.5642369389533997, + "num_tokens": 4441047050.0, + "step": 8687 + }, + { + "epoch": 2.3493780421849646, + "grad_norm": 1.4747830629348755, + "learning_rate": 1.2312459833873922e-05, + "loss": 2.123, + "mean_token_accuracy": 0.5307480692863464, + "num_tokens": 4441542109.0, + "step": 8688 + }, + { + "epoch": 2.3496484586262847, + "grad_norm": 1.900516390800476, + "learning_rate": 1.2310900177479306e-05, + "loss": 1.8644, + "mean_token_accuracy": 0.5685447454452515, + "num_tokens": 4442066368.0, + "step": 8689 + }, + { + "epoch": 2.349918875067604, + "grad_norm": 1.3638871908187866, + "learning_rate": 1.2309340480862417e-05, + "loss": 1.9212, + "mean_token_accuracy": 0.5631197690963745, + "num_tokens": 4442590549.0, + "step": 8690 + }, + { + "epoch": 2.350189291508924, + "grad_norm": 1.334363579750061, + "learning_rate": 1.2307780744071112e-05, + "loss": 1.9338, + "mean_token_accuracy": 0.5420137643814087, + "num_tokens": 4443114830.0, + "step": 8691 + }, + { + "epoch": 2.350459707950243, + "grad_norm": 1.5782663822174072, + "learning_rate": 1.2306220967153248e-05, + "loss": 1.9422, + "mean_token_accuracy": 0.5598165988922119, + "num_tokens": 4443639103.0, + "step": 8692 + }, + { + "epoch": 2.350730124391563, + "grad_norm": 1.299810528755188, + "learning_rate": 1.2304661150156684e-05, + "loss": 2.0054, + "mean_token_accuracy": 0.5266515016555786, + "num_tokens": 4444163244.0, + "step": 8693 + }, + { + "epoch": 2.3510005408328825, + "grad_norm": 1.222141981124878, + "learning_rate": 1.230310129312928e-05, + "loss": 1.8807, + "mean_token_accuracy": 0.5696796774864197, + "num_tokens": 4444687496.0, + "step": 8694 + }, + { + "epoch": 2.351270957274202, + "grad_norm": 1.1699938774108887, + "learning_rate": 1.2301541396118894e-05, + "loss": 1.8859, + "mean_token_accuracy": 0.5739259123802185, + "num_tokens": 4445211676.0, + "step": 8695 + }, + { + "epoch": 2.3515413737155217, + "grad_norm": 1.1406569480895996, + "learning_rate": 1.2299981459173392e-05, + "loss": 1.9325, + "mean_token_accuracy": 0.5615633726119995, + "num_tokens": 4445735829.0, + "step": 8696 + }, + { + "epoch": 2.3518117901568414, + "grad_norm": 1.4564838409423828, + "learning_rate": 1.2298421482340635e-05, + "loss": 1.876, + "mean_token_accuracy": 0.5765018463134766, + "num_tokens": 4446248905.0, + "step": 8697 + }, + { + "epoch": 2.352082206598161, + "grad_norm": 1.3205676078796387, + "learning_rate": 1.2296861465668488e-05, + "loss": 1.8812, + "mean_token_accuracy": 0.5703123807907104, + "num_tokens": 4446773157.0, + "step": 8698 + }, + { + "epoch": 2.3523526230394807, + "grad_norm": 1.1904826164245605, + "learning_rate": 1.2295301409204819e-05, + "loss": 1.7529, + "mean_token_accuracy": 0.567638635635376, + "num_tokens": 4447274968.0, + "step": 8699 + }, + { + "epoch": 2.3526230394808003, + "grad_norm": 1.1680080890655518, + "learning_rate": 1.2293741312997494e-05, + "loss": 1.8039, + "mean_token_accuracy": 0.5735878944396973, + "num_tokens": 4447775838.0, + "step": 8700 + }, + { + "epoch": 2.35289345592212, + "grad_norm": 0.5973621010780334, + "learning_rate": 1.229218117709438e-05, + "loss": 1.102, + "mean_token_accuracy": 0.6889779567718506, + "num_tokens": 4448300068.0, + "step": 8701 + }, + { + "epoch": 2.3531638723634396, + "grad_norm": 1.7070574760437012, + "learning_rate": 1.2290621001543346e-05, + "loss": 1.9882, + "mean_token_accuracy": 0.5359759330749512, + "num_tokens": 4448824271.0, + "step": 8702 + }, + { + "epoch": 2.3534342888047592, + "grad_norm": 1.620370864868164, + "learning_rate": 1.2289060786392264e-05, + "loss": 1.7993, + "mean_token_accuracy": 0.5870238542556763, + "num_tokens": 4449314327.0, + "step": 8703 + }, + { + "epoch": 2.353704705246079, + "grad_norm": 1.214486837387085, + "learning_rate": 1.2287500531689007e-05, + "loss": 2.0485, + "mean_token_accuracy": 0.5500354766845703, + "num_tokens": 4449781473.0, + "step": 8704 + }, + { + "epoch": 2.3539751216873985, + "grad_norm": 1.2969638109207153, + "learning_rate": 1.2285940237481446e-05, + "loss": 1.8754, + "mean_token_accuracy": 0.5612642765045166, + "num_tokens": 4450305637.0, + "step": 8705 + }, + { + "epoch": 2.354245538128718, + "grad_norm": 1.1863807439804077, + "learning_rate": 1.2284379903817463e-05, + "loss": 1.8562, + "mean_token_accuracy": 0.5761764049530029, + "num_tokens": 4450791190.0, + "step": 8706 + }, + { + "epoch": 2.354515954570038, + "grad_norm": 1.276586651802063, + "learning_rate": 1.2282819530744921e-05, + "loss": 1.8912, + "mean_token_accuracy": 0.5576344728469849, + "num_tokens": 4451315473.0, + "step": 8707 + }, + { + "epoch": 2.3547863710113575, + "grad_norm": 1.24648916721344, + "learning_rate": 1.2281259118311705e-05, + "loss": 1.8605, + "mean_token_accuracy": 0.5744315385818481, + "num_tokens": 4451812543.0, + "step": 8708 + }, + { + "epoch": 2.355056787452677, + "grad_norm": 1.1640337705612183, + "learning_rate": 1.2279698666565691e-05, + "loss": 2.0157, + "mean_token_accuracy": 0.5342587828636169, + "num_tokens": 4452336650.0, + "step": 8709 + }, + { + "epoch": 2.3553272038939967, + "grad_norm": 1.2428841590881348, + "learning_rate": 1.2278138175554759e-05, + "loss": 1.9747, + "mean_token_accuracy": 0.5656951069831848, + "num_tokens": 4452860752.0, + "step": 8710 + }, + { + "epoch": 2.3555976203353164, + "grad_norm": 1.149669885635376, + "learning_rate": 1.2276577645326788e-05, + "loss": 1.867, + "mean_token_accuracy": 0.5678855180740356, + "num_tokens": 4453320385.0, + "step": 8711 + }, + { + "epoch": 2.355868036776636, + "grad_norm": 1.1528570652008057, + "learning_rate": 1.2275017075929663e-05, + "loss": 1.9512, + "mean_token_accuracy": 0.5569076538085938, + "num_tokens": 4453844628.0, + "step": 8712 + }, + { + "epoch": 2.3561384532179557, + "grad_norm": 1.3329033851623535, + "learning_rate": 1.2273456467411264e-05, + "loss": 1.8227, + "mean_token_accuracy": 0.5949870347976685, + "num_tokens": 4454335040.0, + "step": 8713 + }, + { + "epoch": 2.3564088696592753, + "grad_norm": 1.1432706117630005, + "learning_rate": 1.2271895819819474e-05, + "loss": 1.8589, + "mean_token_accuracy": 0.5799071788787842, + "num_tokens": 4454859317.0, + "step": 8714 + }, + { + "epoch": 2.356679286100595, + "grad_norm": 1.1127572059631348, + "learning_rate": 1.227033513320218e-05, + "loss": 1.8855, + "mean_token_accuracy": 0.5747398138046265, + "num_tokens": 4455383575.0, + "step": 8715 + }, + { + "epoch": 2.3569497025419146, + "grad_norm": 1.4614750146865845, + "learning_rate": 1.2268774407607273e-05, + "loss": 2.0482, + "mean_token_accuracy": 0.5476014614105225, + "num_tokens": 4455853698.0, + "step": 8716 + }, + { + "epoch": 2.3572201189832342, + "grad_norm": 1.2397555112838745, + "learning_rate": 1.2267213643082632e-05, + "loss": 1.8546, + "mean_token_accuracy": 0.5694156885147095, + "num_tokens": 4456377824.0, + "step": 8717 + }, + { + "epoch": 2.357490535424554, + "grad_norm": 1.2053207159042358, + "learning_rate": 1.2265652839676145e-05, + "loss": 1.7983, + "mean_token_accuracy": 0.5930744409561157, + "num_tokens": 4456802726.0, + "step": 8718 + }, + { + "epoch": 2.3577609518658735, + "grad_norm": 1.361289143562317, + "learning_rate": 1.2264091997435714e-05, + "loss": 1.9397, + "mean_token_accuracy": 0.5489047765731812, + "num_tokens": 4457326835.0, + "step": 8719 + }, + { + "epoch": 2.358031368307193, + "grad_norm": 1.0566296577453613, + "learning_rate": 1.2262531116409222e-05, + "loss": 1.863, + "mean_token_accuracy": 0.5533994436264038, + "num_tokens": 4457851044.0, + "step": 8720 + }, + { + "epoch": 2.358301784748513, + "grad_norm": 0.4662567675113678, + "learning_rate": 1.226097019664456e-05, + "loss": 1.1373, + "mean_token_accuracy": 0.6841217279434204, + "num_tokens": 4458375240.0, + "step": 8721 + }, + { + "epoch": 2.3585722011898325, + "grad_norm": 2.020573377609253, + "learning_rate": 1.2259409238189625e-05, + "loss": 1.8568, + "mean_token_accuracy": 0.5597079992294312, + "num_tokens": 4458899383.0, + "step": 8722 + }, + { + "epoch": 2.358842617631152, + "grad_norm": 2.080530881881714, + "learning_rate": 1.2257848241092312e-05, + "loss": 1.9529, + "mean_token_accuracy": 0.5356990098953247, + "num_tokens": 4459423578.0, + "step": 8723 + }, + { + "epoch": 2.3591130340724717, + "grad_norm": 1.206282377243042, + "learning_rate": 1.2256287205400517e-05, + "loss": 1.747, + "mean_token_accuracy": 0.5974916219711304, + "num_tokens": 4459947618.0, + "step": 8724 + }, + { + "epoch": 2.3593834505137914, + "grad_norm": 1.6254433393478394, + "learning_rate": 1.2254726131162133e-05, + "loss": 1.9313, + "mean_token_accuracy": 0.5682533383369446, + "num_tokens": 4460471896.0, + "step": 8725 + }, + { + "epoch": 2.359653866955111, + "grad_norm": 1.5176717042922974, + "learning_rate": 1.2253165018425061e-05, + "loss": 1.9174, + "mean_token_accuracy": 0.5658080577850342, + "num_tokens": 4460996175.0, + "step": 8726 + }, + { + "epoch": 2.3599242833964307, + "grad_norm": 1.445427417755127, + "learning_rate": 1.2251603867237203e-05, + "loss": 1.8086, + "mean_token_accuracy": 0.5624052286148071, + "num_tokens": 4461520366.0, + "step": 8727 + }, + { + "epoch": 2.3601946998377503, + "grad_norm": 1.7968882322311401, + "learning_rate": 1.2250042677646456e-05, + "loss": 1.8269, + "mean_token_accuracy": 0.5787273645401001, + "num_tokens": 4462044564.0, + "step": 8728 + }, + { + "epoch": 2.3604651162790695, + "grad_norm": 1.4555699825286865, + "learning_rate": 1.2248481449700726e-05, + "loss": 1.8538, + "mean_token_accuracy": 0.5544793605804443, + "num_tokens": 4462568436.0, + "step": 8729 + }, + { + "epoch": 2.3607355327203896, + "grad_norm": 1.3100764751434326, + "learning_rate": 1.2246920183447912e-05, + "loss": 1.9551, + "mean_token_accuracy": 0.5471279621124268, + "num_tokens": 4463092523.0, + "step": 8730 + }, + { + "epoch": 2.361005949161709, + "grad_norm": 1.344600796699524, + "learning_rate": 1.224535887893592e-05, + "loss": 1.7771, + "mean_token_accuracy": 0.5885348916053772, + "num_tokens": 4463616657.0, + "step": 8731 + }, + { + "epoch": 2.361276365603029, + "grad_norm": 1.29319429397583, + "learning_rate": 1.2243797536212653e-05, + "loss": 1.9309, + "mean_token_accuracy": 0.5627092719078064, + "num_tokens": 4464082032.0, + "step": 8732 + }, + { + "epoch": 2.361546782044348, + "grad_norm": 1.049959659576416, + "learning_rate": 1.2242236155326022e-05, + "loss": 1.7815, + "mean_token_accuracy": 0.5748044848442078, + "num_tokens": 4464547967.0, + "step": 8733 + }, + { + "epoch": 2.3618171984856677, + "grad_norm": 1.4057128429412842, + "learning_rate": 1.2240674736323934e-05, + "loss": 1.8514, + "mean_token_accuracy": 0.5682259798049927, + "num_tokens": 4465025830.0, + "step": 8734 + }, + { + "epoch": 2.3620876149269874, + "grad_norm": 0.9720470905303955, + "learning_rate": 1.2239113279254296e-05, + "loss": 1.7654, + "mean_token_accuracy": 0.6120347380638123, + "num_tokens": 4465549945.0, + "step": 8735 + }, + { + "epoch": 2.362358031368307, + "grad_norm": 1.0904828310012817, + "learning_rate": 1.2237551784165019e-05, + "loss": 1.8479, + "mean_token_accuracy": 0.5753575563430786, + "num_tokens": 4466074171.0, + "step": 8736 + }, + { + "epoch": 2.3626284478096267, + "grad_norm": 1.1256004571914673, + "learning_rate": 1.223599025110401e-05, + "loss": 1.8591, + "mean_token_accuracy": 0.5570254921913147, + "num_tokens": 4466598419.0, + "step": 8737 + }, + { + "epoch": 2.3628988642509463, + "grad_norm": 1.049864411354065, + "learning_rate": 1.2234428680119189e-05, + "loss": 1.8712, + "mean_token_accuracy": 0.5538744926452637, + "num_tokens": 4467122683.0, + "step": 8738 + }, + { + "epoch": 2.363169280692266, + "grad_norm": 0.9744722247123718, + "learning_rate": 1.2232867071258464e-05, + "loss": 1.9857, + "mean_token_accuracy": 0.5643523335456848, + "num_tokens": 4467646889.0, + "step": 8739 + }, + { + "epoch": 2.3634396971335856, + "grad_norm": 1.1646935939788818, + "learning_rate": 1.223130542456975e-05, + "loss": 1.8856, + "mean_token_accuracy": 0.578688383102417, + "num_tokens": 4468171153.0, + "step": 8740 + }, + { + "epoch": 2.3637101135749052, + "grad_norm": 0.5417584180831909, + "learning_rate": 1.2229743740100967e-05, + "loss": 1.0915, + "mean_token_accuracy": 0.7108855843544006, + "num_tokens": 4468695340.0, + "step": 8741 + }, + { + "epoch": 2.363980530016225, + "grad_norm": 1.383704423904419, + "learning_rate": 1.2228182017900032e-05, + "loss": 1.9181, + "mean_token_accuracy": 0.5563375949859619, + "num_tokens": 4469219529.0, + "step": 8742 + }, + { + "epoch": 2.3642509464575445, + "grad_norm": 1.3208301067352295, + "learning_rate": 1.2226620258014859e-05, + "loss": 1.9696, + "mean_token_accuracy": 0.5525997877120972, + "num_tokens": 4469743655.0, + "step": 8743 + }, + { + "epoch": 2.364521362898864, + "grad_norm": 1.0659279823303223, + "learning_rate": 1.2225058460493366e-05, + "loss": 1.9713, + "mean_token_accuracy": 0.5155107975006104, + "num_tokens": 4470267906.0, + "step": 8744 + }, + { + "epoch": 2.364791779340184, + "grad_norm": 1.2392174005508423, + "learning_rate": 1.222349662538348e-05, + "loss": 1.9752, + "mean_token_accuracy": 0.5568209886550903, + "num_tokens": 4470766998.0, + "step": 8745 + }, + { + "epoch": 2.3650621957815035, + "grad_norm": 1.130631446838379, + "learning_rate": 1.222193475273312e-05, + "loss": 1.9221, + "mean_token_accuracy": 0.5649664402008057, + "num_tokens": 4471291197.0, + "step": 8746 + }, + { + "epoch": 2.365332612222823, + "grad_norm": 1.1805323362350464, + "learning_rate": 1.2220372842590204e-05, + "loss": 1.8968, + "mean_token_accuracy": 0.5629481077194214, + "num_tokens": 4471773172.0, + "step": 8747 + }, + { + "epoch": 2.3656030286641427, + "grad_norm": 1.174851894378662, + "learning_rate": 1.2218810895002666e-05, + "loss": 1.8979, + "mean_token_accuracy": 0.5619596242904663, + "num_tokens": 4472289596.0, + "step": 8748 + }, + { + "epoch": 2.3658734451054624, + "grad_norm": 1.1476240158081055, + "learning_rate": 1.2217248910018424e-05, + "loss": 1.9179, + "mean_token_accuracy": 0.5530198812484741, + "num_tokens": 4472781312.0, + "step": 8749 + }, + { + "epoch": 2.366143861546782, + "grad_norm": 1.1463375091552734, + "learning_rate": 1.2215686887685403e-05, + "loss": 1.912, + "mean_token_accuracy": 0.5521767139434814, + "num_tokens": 4473305588.0, + "step": 8750 + }, + { + "epoch": 2.3664142779881017, + "grad_norm": 1.0045437812805176, + "learning_rate": 1.2214124828051536e-05, + "loss": 1.9457, + "mean_token_accuracy": 0.574164628982544, + "num_tokens": 4473829844.0, + "step": 8751 + }, + { + "epoch": 2.3666846944294213, + "grad_norm": 1.054781198501587, + "learning_rate": 1.221256273116475e-05, + "loss": 1.8178, + "mean_token_accuracy": 0.5768063068389893, + "num_tokens": 4474353915.0, + "step": 8752 + }, + { + "epoch": 2.366955110870741, + "grad_norm": 1.1177617311477661, + "learning_rate": 1.2211000597072968e-05, + "loss": 2.0155, + "mean_token_accuracy": 0.5466564893722534, + "num_tokens": 4474878094.0, + "step": 8753 + }, + { + "epoch": 2.3672255273120606, + "grad_norm": 1.7081472873687744, + "learning_rate": 1.220943842582413e-05, + "loss": 1.7475, + "mean_token_accuracy": 0.5725715160369873, + "num_tokens": 4475402215.0, + "step": 8754 + }, + { + "epoch": 2.3674959437533802, + "grad_norm": 1.3884789943695068, + "learning_rate": 1.2207876217466163e-05, + "loss": 1.8515, + "mean_token_accuracy": 0.5936120748519897, + "num_tokens": 4475926314.0, + "step": 8755 + }, + { + "epoch": 2.3677663601947, + "grad_norm": 1.4273324012756348, + "learning_rate": 1.2206313972047003e-05, + "loss": 1.9907, + "mean_token_accuracy": 0.5529664158821106, + "num_tokens": 4476428550.0, + "step": 8756 + }, + { + "epoch": 2.3680367766360195, + "grad_norm": 1.3136212825775146, + "learning_rate": 1.2204751689614583e-05, + "loss": 1.8749, + "mean_token_accuracy": 0.5759290456771851, + "num_tokens": 4476903303.0, + "step": 8757 + }, + { + "epoch": 2.368307193077339, + "grad_norm": 1.296749234199524, + "learning_rate": 1.2203189370216838e-05, + "loss": 1.934, + "mean_token_accuracy": 0.576060950756073, + "num_tokens": 4477410351.0, + "step": 8758 + }, + { + "epoch": 2.368577609518659, + "grad_norm": 1.2933664321899414, + "learning_rate": 1.2201627013901702e-05, + "loss": 1.9736, + "mean_token_accuracy": 0.5561342239379883, + "num_tokens": 4477934615.0, + "step": 8759 + }, + { + "epoch": 2.3688480259599785, + "grad_norm": 1.1993913650512695, + "learning_rate": 1.2200064620717118e-05, + "loss": 1.9088, + "mean_token_accuracy": 0.5737102031707764, + "num_tokens": 4478458886.0, + "step": 8760 + }, + { + "epoch": 2.369118442401298, + "grad_norm": 0.5008079409599304, + "learning_rate": 1.2198502190711019e-05, + "loss": 1.2123, + "mean_token_accuracy": 0.6734326481819153, + "num_tokens": 4478983151.0, + "step": 8761 + }, + { + "epoch": 2.3693888588426177, + "grad_norm": 1.4981085062026978, + "learning_rate": 1.2196939723931354e-05, + "loss": 1.947, + "mean_token_accuracy": 0.5574171543121338, + "num_tokens": 4479507275.0, + "step": 8762 + }, + { + "epoch": 2.3696592752839374, + "grad_norm": 1.3319106101989746, + "learning_rate": 1.2195377220426054e-05, + "loss": 1.9152, + "mean_token_accuracy": 0.5613834261894226, + "num_tokens": 4479973504.0, + "step": 8763 + }, + { + "epoch": 2.369929691725257, + "grad_norm": 1.2792855501174927, + "learning_rate": 1.2193814680243068e-05, + "loss": 1.9247, + "mean_token_accuracy": 0.5692522525787354, + "num_tokens": 4480478864.0, + "step": 8764 + }, + { + "epoch": 2.3702001081665767, + "grad_norm": 1.1774413585662842, + "learning_rate": 1.2192252103430335e-05, + "loss": 1.9152, + "mean_token_accuracy": 0.5484235286712646, + "num_tokens": 4481003043.0, + "step": 8765 + }, + { + "epoch": 2.3704705246078963, + "grad_norm": 1.2266104221343994, + "learning_rate": 1.2190689490035804e-05, + "loss": 1.936, + "mean_token_accuracy": 0.5409165620803833, + "num_tokens": 4481527269.0, + "step": 8766 + }, + { + "epoch": 2.370740941049216, + "grad_norm": 1.1907768249511719, + "learning_rate": 1.2189126840107415e-05, + "loss": 1.9869, + "mean_token_accuracy": 0.5505121946334839, + "num_tokens": 4482051447.0, + "step": 8767 + }, + { + "epoch": 2.3710113574905356, + "grad_norm": 1.139088749885559, + "learning_rate": 1.2187564153693118e-05, + "loss": 1.9099, + "mean_token_accuracy": 0.5557301044464111, + "num_tokens": 4482575604.0, + "step": 8768 + }, + { + "epoch": 2.3712817739318552, + "grad_norm": 1.1852593421936035, + "learning_rate": 1.2186001430840862e-05, + "loss": 1.8124, + "mean_token_accuracy": 0.5886411070823669, + "num_tokens": 4483099740.0, + "step": 8769 + }, + { + "epoch": 2.3715521903731744, + "grad_norm": 1.2161792516708374, + "learning_rate": 1.218443867159859e-05, + "loss": 1.8622, + "mean_token_accuracy": 0.5718064904212952, + "num_tokens": 4483623995.0, + "step": 8770 + }, + { + "epoch": 2.3718226068144945, + "grad_norm": 1.1488561630249023, + "learning_rate": 1.2182875876014259e-05, + "loss": 1.8522, + "mean_token_accuracy": 0.5676921606063843, + "num_tokens": 4484148205.0, + "step": 8771 + }, + { + "epoch": 2.3720930232558137, + "grad_norm": 1.266607403755188, + "learning_rate": 1.2181313044135821e-05, + "loss": 1.8726, + "mean_token_accuracy": 0.5683914422988892, + "num_tokens": 4484672441.0, + "step": 8772 + }, + { + "epoch": 2.372363439697134, + "grad_norm": 1.3106391429901123, + "learning_rate": 1.2179750176011222e-05, + "loss": 1.7814, + "mean_token_accuracy": 0.5817183256149292, + "num_tokens": 4485145173.0, + "step": 8773 + }, + { + "epoch": 2.372633856138453, + "grad_norm": 1.0075833797454834, + "learning_rate": 1.2178187271688417e-05, + "loss": 1.8988, + "mean_token_accuracy": 0.5492538809776306, + "num_tokens": 4485669421.0, + "step": 8774 + }, + { + "epoch": 2.372904272579773, + "grad_norm": 1.1481274366378784, + "learning_rate": 1.2176624331215361e-05, + "loss": 1.8036, + "mean_token_accuracy": 0.5738114714622498, + "num_tokens": 4486193661.0, + "step": 8775 + }, + { + "epoch": 2.3731746890210923, + "grad_norm": 1.1404603719711304, + "learning_rate": 1.2175061354640011e-05, + "loss": 1.7399, + "mean_token_accuracy": 0.5693782567977905, + "num_tokens": 4486717917.0, + "step": 8776 + }, + { + "epoch": 2.373445105462412, + "grad_norm": 1.1334736347198486, + "learning_rate": 1.2173498342010326e-05, + "loss": 1.8862, + "mean_token_accuracy": 0.579682469367981, + "num_tokens": 4487225918.0, + "step": 8777 + }, + { + "epoch": 2.3737155219037316, + "grad_norm": 1.1616313457489014, + "learning_rate": 1.217193529337426e-05, + "loss": 1.8286, + "mean_token_accuracy": 0.5702300071716309, + "num_tokens": 4487750179.0, + "step": 8778 + }, + { + "epoch": 2.3739859383450512, + "grad_norm": 1.1948477029800415, + "learning_rate": 1.217037220877977e-05, + "loss": 1.8599, + "mean_token_accuracy": 0.566036581993103, + "num_tokens": 4488230940.0, + "step": 8779 + }, + { + "epoch": 2.374256354786371, + "grad_norm": 0.9678730368614197, + "learning_rate": 1.216880908827482e-05, + "loss": 1.7947, + "mean_token_accuracy": 0.5736975073814392, + "num_tokens": 4488755191.0, + "step": 8780 + }, + { + "epoch": 2.3745267712276905, + "grad_norm": 0.4948660135269165, + "learning_rate": 1.216724593190737e-05, + "loss": 1.1468, + "mean_token_accuracy": 0.704649806022644, + "num_tokens": 4489279447.0, + "step": 8781 + }, + { + "epoch": 2.37479718766901, + "grad_norm": 1.531673789024353, + "learning_rate": 1.2165682739725383e-05, + "loss": 1.8881, + "mean_token_accuracy": 0.5653865337371826, + "num_tokens": 4489795855.0, + "step": 8782 + }, + { + "epoch": 2.37506760411033, + "grad_norm": 1.3792754411697388, + "learning_rate": 1.216411951177682e-05, + "loss": 1.7241, + "mean_token_accuracy": 0.6219543218612671, + "num_tokens": 4490297910.0, + "step": 8783 + }, + { + "epoch": 2.3753380205516494, + "grad_norm": 1.115523099899292, + "learning_rate": 1.2162556248109647e-05, + "loss": 1.9481, + "mean_token_accuracy": 0.5492358803749084, + "num_tokens": 4490822105.0, + "step": 8784 + }, + { + "epoch": 2.375608436992969, + "grad_norm": 1.4085417985916138, + "learning_rate": 1.2160992948771832e-05, + "loss": 1.8162, + "mean_token_accuracy": 0.5803791284561157, + "num_tokens": 4491324926.0, + "step": 8785 + }, + { + "epoch": 2.3758788534342887, + "grad_norm": 1.1200979948043823, + "learning_rate": 1.2159429613811338e-05, + "loss": 1.8504, + "mean_token_accuracy": 0.5661007761955261, + "num_tokens": 4491849065.0, + "step": 8786 + }, + { + "epoch": 2.3761492698756084, + "grad_norm": 1.112296462059021, + "learning_rate": 1.215786624327613e-05, + "loss": 1.8905, + "mean_token_accuracy": 0.5451720952987671, + "num_tokens": 4492373256.0, + "step": 8787 + }, + { + "epoch": 2.376419686316928, + "grad_norm": 1.4881621599197388, + "learning_rate": 1.2156302837214187e-05, + "loss": 1.9212, + "mean_token_accuracy": 0.5572094917297363, + "num_tokens": 4492897479.0, + "step": 8788 + }, + { + "epoch": 2.3766901027582477, + "grad_norm": 1.2580881118774414, + "learning_rate": 1.2154739395673468e-05, + "loss": 1.8504, + "mean_token_accuracy": 0.5651249289512634, + "num_tokens": 4493421654.0, + "step": 8789 + }, + { + "epoch": 2.3769605191995673, + "grad_norm": 1.378407597541809, + "learning_rate": 1.2153175918701947e-05, + "loss": 1.7917, + "mean_token_accuracy": 0.5687389373779297, + "num_tokens": 4493945899.0, + "step": 8790 + }, + { + "epoch": 2.377230935640887, + "grad_norm": 1.1751232147216797, + "learning_rate": 1.2151612406347602e-05, + "loss": 1.8394, + "mean_token_accuracy": 0.586974024772644, + "num_tokens": 4494421671.0, + "step": 8791 + }, + { + "epoch": 2.3775013520822066, + "grad_norm": 1.271798849105835, + "learning_rate": 1.21500488586584e-05, + "loss": 1.8091, + "mean_token_accuracy": 0.5687810778617859, + "num_tokens": 4494945756.0, + "step": 8792 + }, + { + "epoch": 2.3777717685235262, + "grad_norm": 1.1807392835617065, + "learning_rate": 1.2148485275682315e-05, + "loss": 1.886, + "mean_token_accuracy": 0.5682604312896729, + "num_tokens": 4495469984.0, + "step": 8793 + }, + { + "epoch": 2.378042184964846, + "grad_norm": 1.1239453554153442, + "learning_rate": 1.2146921657467327e-05, + "loss": 1.9828, + "mean_token_accuracy": 0.5537058711051941, + "num_tokens": 4495983376.0, + "step": 8794 + }, + { + "epoch": 2.3783126014061655, + "grad_norm": 1.3837890625, + "learning_rate": 1.214535800406141e-05, + "loss": 1.7846, + "mean_token_accuracy": 0.6046316027641296, + "num_tokens": 4496507626.0, + "step": 8795 + }, + { + "epoch": 2.378583017847485, + "grad_norm": 1.075600028038025, + "learning_rate": 1.2143794315512538e-05, + "loss": 1.8075, + "mean_token_accuracy": 0.5773515105247498, + "num_tokens": 4497031817.0, + "step": 8796 + }, + { + "epoch": 2.378853434288805, + "grad_norm": 1.002084493637085, + "learning_rate": 1.2142230591868693e-05, + "loss": 1.7377, + "mean_token_accuracy": 0.5652463436126709, + "num_tokens": 4497555959.0, + "step": 8797 + }, + { + "epoch": 2.3791238507301244, + "grad_norm": 1.095724105834961, + "learning_rate": 1.2140666833177859e-05, + "loss": 1.8297, + "mean_token_accuracy": 0.5718876123428345, + "num_tokens": 4498080080.0, + "step": 8798 + }, + { + "epoch": 2.379394267171444, + "grad_norm": 1.0476049184799194, + "learning_rate": 1.2139103039488009e-05, + "loss": 1.9439, + "mean_token_accuracy": 0.5627882480621338, + "num_tokens": 4498604268.0, + "step": 8799 + }, + { + "epoch": 2.3796646836127637, + "grad_norm": 1.4203391075134277, + "learning_rate": 1.213753921084713e-05, + "loss": 1.7957, + "mean_token_accuracy": 0.5808619856834412, + "num_tokens": 4499128448.0, + "step": 8800 + }, + { + "epoch": 2.3799351000540834, + "grad_norm": 0.5509973168373108, + "learning_rate": 1.2135975347303199e-05, + "loss": 1.0297, + "mean_token_accuracy": 0.7219375371932983, + "num_tokens": 4499652677.0, + "step": 8801 + }, + { + "epoch": 2.380205516495403, + "grad_norm": 1.4093060493469238, + "learning_rate": 1.2134411448904205e-05, + "loss": 1.7836, + "mean_token_accuracy": 0.5799286365509033, + "num_tokens": 4500176903.0, + "step": 8802 + }, + { + "epoch": 2.3804759329367227, + "grad_norm": 1.338491678237915, + "learning_rate": 1.2132847515698131e-05, + "loss": 2.009, + "mean_token_accuracy": 0.5523722171783447, + "num_tokens": 4500701185.0, + "step": 8803 + }, + { + "epoch": 2.3807463493780423, + "grad_norm": 1.209740400314331, + "learning_rate": 1.2131283547732963e-05, + "loss": 1.8867, + "mean_token_accuracy": 0.553113579750061, + "num_tokens": 4501225228.0, + "step": 8804 + }, + { + "epoch": 2.381016765819362, + "grad_norm": 1.5302088260650635, + "learning_rate": 1.2129719545056693e-05, + "loss": 2.0437, + "mean_token_accuracy": 0.5118664503097534, + "num_tokens": 4501749362.0, + "step": 8805 + }, + { + "epoch": 2.3812871822606816, + "grad_norm": 1.1025134325027466, + "learning_rate": 1.21281555077173e-05, + "loss": 1.8657, + "mean_token_accuracy": 0.5750527381896973, + "num_tokens": 4502244864.0, + "step": 8806 + }, + { + "epoch": 2.3815575987020012, + "grad_norm": 1.5228676795959473, + "learning_rate": 1.2126591435762785e-05, + "loss": 1.8926, + "mean_token_accuracy": 0.5897014141082764, + "num_tokens": 4502680994.0, + "step": 8807 + }, + { + "epoch": 2.381828015143321, + "grad_norm": 1.5359981060028076, + "learning_rate": 1.2125027329241128e-05, + "loss": 1.9873, + "mean_token_accuracy": 0.5387071371078491, + "num_tokens": 4503205278.0, + "step": 8808 + }, + { + "epoch": 2.3820984315846405, + "grad_norm": 1.1933242082595825, + "learning_rate": 1.2123463188200322e-05, + "loss": 1.8607, + "mean_token_accuracy": 0.5739158391952515, + "num_tokens": 4503729554.0, + "step": 8809 + }, + { + "epoch": 2.38236884802596, + "grad_norm": 1.0865596532821655, + "learning_rate": 1.2121899012688364e-05, + "loss": 1.9078, + "mean_token_accuracy": 0.5596619844436646, + "num_tokens": 4504253736.0, + "step": 8810 + }, + { + "epoch": 2.3826392644672794, + "grad_norm": 1.5065546035766602, + "learning_rate": 1.2120334802753244e-05, + "loss": 1.7931, + "mean_token_accuracy": 0.578078031539917, + "num_tokens": 4504778006.0, + "step": 8811 + }, + { + "epoch": 2.3829096809085994, + "grad_norm": 1.513203501701355, + "learning_rate": 1.2118770558442957e-05, + "loss": 2.0503, + "mean_token_accuracy": 0.5479610562324524, + "num_tokens": 4505302258.0, + "step": 8812 + }, + { + "epoch": 2.3831800973499186, + "grad_norm": 1.4109734296798706, + "learning_rate": 1.21172062798055e-05, + "loss": 1.9831, + "mean_token_accuracy": 0.5188983082771301, + "num_tokens": 4505826491.0, + "step": 8813 + }, + { + "epoch": 2.3834505137912387, + "grad_norm": 1.8339107036590576, + "learning_rate": 1.2115641966888867e-05, + "loss": 1.9743, + "mean_token_accuracy": 0.5379135012626648, + "num_tokens": 4506350682.0, + "step": 8814 + }, + { + "epoch": 2.383720930232558, + "grad_norm": 1.8838931322097778, + "learning_rate": 1.211407761974106e-05, + "loss": 1.9414, + "mean_token_accuracy": 0.5486724376678467, + "num_tokens": 4506874959.0, + "step": 8815 + }, + { + "epoch": 2.383991346673878, + "grad_norm": 1.2636488676071167, + "learning_rate": 1.2112513238410075e-05, + "loss": 2.022, + "mean_token_accuracy": 0.5415544509887695, + "num_tokens": 4507398967.0, + "step": 8816 + }, + { + "epoch": 2.384261763115197, + "grad_norm": 1.2150940895080566, + "learning_rate": 1.2110948822943914e-05, + "loss": 1.8659, + "mean_token_accuracy": 0.5804332494735718, + "num_tokens": 4507905517.0, + "step": 8817 + }, + { + "epoch": 2.384532179556517, + "grad_norm": 1.7667756080627441, + "learning_rate": 1.210938437339057e-05, + "loss": 1.9045, + "mean_token_accuracy": 0.567440390586853, + "num_tokens": 4508395619.0, + "step": 8818 + }, + { + "epoch": 2.3848025959978365, + "grad_norm": 1.8192341327667236, + "learning_rate": 1.2107819889798057e-05, + "loss": 1.8497, + "mean_token_accuracy": 0.5429340600967407, + "num_tokens": 4508919867.0, + "step": 8819 + }, + { + "epoch": 2.385073012439156, + "grad_norm": 1.19540536403656, + "learning_rate": 1.2106255372214368e-05, + "loss": 1.9621, + "mean_token_accuracy": 0.5513556003570557, + "num_tokens": 4509380359.0, + "step": 8820 + }, + { + "epoch": 2.385343428880476, + "grad_norm": 0.5269032716751099, + "learning_rate": 1.2104690820687514e-05, + "loss": 1.1969, + "mean_token_accuracy": 0.6823695302009583, + "num_tokens": 4509904581.0, + "step": 8821 + }, + { + "epoch": 2.3856138453217954, + "grad_norm": 1.91714346408844, + "learning_rate": 1.2103126235265497e-05, + "loss": 1.8649, + "mean_token_accuracy": 0.5682015419006348, + "num_tokens": 4510428684.0, + "step": 8822 + }, + { + "epoch": 2.385884261763115, + "grad_norm": 1.8803197145462036, + "learning_rate": 1.210156161599632e-05, + "loss": 1.8263, + "mean_token_accuracy": 0.570466160774231, + "num_tokens": 4510952955.0, + "step": 8823 + }, + { + "epoch": 2.3861546782044347, + "grad_norm": 1.363939642906189, + "learning_rate": 1.2099996962927995e-05, + "loss": 1.9602, + "mean_token_accuracy": 0.5491006970405579, + "num_tokens": 4511477227.0, + "step": 8824 + }, + { + "epoch": 2.3864250946457544, + "grad_norm": 1.7603405714035034, + "learning_rate": 1.209843227610853e-05, + "loss": 1.9884, + "mean_token_accuracy": 0.5351550579071045, + "num_tokens": 4512001415.0, + "step": 8825 + }, + { + "epoch": 2.386695511087074, + "grad_norm": 1.563430666923523, + "learning_rate": 1.2096867555585928e-05, + "loss": 1.8202, + "mean_token_accuracy": 0.5773757100105286, + "num_tokens": 4512525636.0, + "step": 8826 + }, + { + "epoch": 2.3869659275283936, + "grad_norm": 1.4875001907348633, + "learning_rate": 1.2095302801408207e-05, + "loss": 1.908, + "mean_token_accuracy": 0.5704632997512817, + "num_tokens": 4513049846.0, + "step": 8827 + }, + { + "epoch": 2.3872363439697133, + "grad_norm": 1.2249127626419067, + "learning_rate": 1.2093738013623374e-05, + "loss": 1.8518, + "mean_token_accuracy": 0.561579704284668, + "num_tokens": 4513574123.0, + "step": 8828 + }, + { + "epoch": 2.387506760411033, + "grad_norm": 1.1319226026535034, + "learning_rate": 1.2092173192279442e-05, + "loss": 1.936, + "mean_token_accuracy": 0.5527387857437134, + "num_tokens": 4514045742.0, + "step": 8829 + }, + { + "epoch": 2.3877771768523526, + "grad_norm": 1.0923762321472168, + "learning_rate": 1.2090608337424424e-05, + "loss": 1.9191, + "mean_token_accuracy": 0.5602922439575195, + "num_tokens": 4514518033.0, + "step": 8830 + }, + { + "epoch": 2.388047593293672, + "grad_norm": 1.1439111232757568, + "learning_rate": 1.2089043449106337e-05, + "loss": 1.9656, + "mean_token_accuracy": 0.5475113987922668, + "num_tokens": 4515042050.0, + "step": 8831 + }, + { + "epoch": 2.388318009734992, + "grad_norm": 1.2499327659606934, + "learning_rate": 1.2087478527373193e-05, + "loss": 1.9237, + "mean_token_accuracy": 0.5442458391189575, + "num_tokens": 4515566241.0, + "step": 8832 + }, + { + "epoch": 2.3885884261763115, + "grad_norm": 1.2123504877090454, + "learning_rate": 1.208591357227301e-05, + "loss": 1.8063, + "mean_token_accuracy": 0.5699707269668579, + "num_tokens": 4516090005.0, + "step": 8833 + }, + { + "epoch": 2.388858842617631, + "grad_norm": 1.241539478302002, + "learning_rate": 1.2084348583853807e-05, + "loss": 1.8857, + "mean_token_accuracy": 0.5740994215011597, + "num_tokens": 4516614132.0, + "step": 8834 + }, + { + "epoch": 2.389129259058951, + "grad_norm": 1.0775980949401855, + "learning_rate": 1.2082783562163598e-05, + "loss": 1.8522, + "mean_token_accuracy": 0.5726982355117798, + "num_tokens": 4517138243.0, + "step": 8835 + }, + { + "epoch": 2.3893996755002704, + "grad_norm": 1.1205679178237915, + "learning_rate": 1.2081218507250405e-05, + "loss": 1.9787, + "mean_token_accuracy": 0.5490925908088684, + "num_tokens": 4517618635.0, + "step": 8836 + }, + { + "epoch": 2.38967009194159, + "grad_norm": 1.1702797412872314, + "learning_rate": 1.2079653419162249e-05, + "loss": 1.9236, + "mean_token_accuracy": 0.559135913848877, + "num_tokens": 4518142883.0, + "step": 8837 + }, + { + "epoch": 2.3899405083829097, + "grad_norm": 1.0775336027145386, + "learning_rate": 1.207808829794715e-05, + "loss": 1.9025, + "mean_token_accuracy": 0.578325629234314, + "num_tokens": 4518666865.0, + "step": 8838 + }, + { + "epoch": 2.3902109248242294, + "grad_norm": 1.1010946035385132, + "learning_rate": 1.2076523143653134e-05, + "loss": 1.9369, + "mean_token_accuracy": 0.5463081002235413, + "num_tokens": 4519191026.0, + "step": 8839 + }, + { + "epoch": 2.390481341265549, + "grad_norm": 1.0878750085830688, + "learning_rate": 1.2074957956328222e-05, + "loss": 1.8517, + "mean_token_accuracy": 0.5870755910873413, + "num_tokens": 4519715273.0, + "step": 8840 + }, + { + "epoch": 2.3907517577068687, + "grad_norm": 0.5749319195747375, + "learning_rate": 1.2073392736020437e-05, + "loss": 1.0986, + "mean_token_accuracy": 0.7082072496414185, + "num_tokens": 4520195844.0, + "step": 8841 + }, + { + "epoch": 2.3910221741481883, + "grad_norm": 1.5052813291549683, + "learning_rate": 1.2071827482777806e-05, + "loss": 1.9869, + "mean_token_accuracy": 0.565163254737854, + "num_tokens": 4520689918.0, + "step": 8842 + }, + { + "epoch": 2.391292590589508, + "grad_norm": 1.1891063451766968, + "learning_rate": 1.2070262196648356e-05, + "loss": 1.8946, + "mean_token_accuracy": 0.5624228715896606, + "num_tokens": 4521194113.0, + "step": 8843 + }, + { + "epoch": 2.3915630070308276, + "grad_norm": 0.9404748678207397, + "learning_rate": 1.2068696877680116e-05, + "loss": 1.7635, + "mean_token_accuracy": 0.6029366254806519, + "num_tokens": 4521718281.0, + "step": 8844 + }, + { + "epoch": 2.3918334234721472, + "grad_norm": 1.0799394845962524, + "learning_rate": 1.2067131525921112e-05, + "loss": 1.9197, + "mean_token_accuracy": 0.562454104423523, + "num_tokens": 4522242541.0, + "step": 8845 + }, + { + "epoch": 2.392103839913467, + "grad_norm": 1.2245967388153076, + "learning_rate": 1.2065566141419373e-05, + "loss": 1.9414, + "mean_token_accuracy": 0.560504138469696, + "num_tokens": 4522738501.0, + "step": 8846 + }, + { + "epoch": 2.3923742563547865, + "grad_norm": 1.062296986579895, + "learning_rate": 1.2064000724222935e-05, + "loss": 1.854, + "mean_token_accuracy": 0.5621918439865112, + "num_tokens": 4523262571.0, + "step": 8847 + }, + { + "epoch": 2.392644672796106, + "grad_norm": 1.1793211698532104, + "learning_rate": 1.2062435274379824e-05, + "loss": 1.9326, + "mean_token_accuracy": 0.5811672210693359, + "num_tokens": 4523741785.0, + "step": 8848 + }, + { + "epoch": 2.392915089237426, + "grad_norm": 1.1129802465438843, + "learning_rate": 1.2060869791938072e-05, + "loss": 1.8747, + "mean_token_accuracy": 0.5927978754043579, + "num_tokens": 4524182124.0, + "step": 8849 + }, + { + "epoch": 2.3931855056787454, + "grad_norm": 1.3176885843276978, + "learning_rate": 1.2059304276945719e-05, + "loss": 1.8753, + "mean_token_accuracy": 0.5596237182617188, + "num_tokens": 4524706396.0, + "step": 8850 + }, + { + "epoch": 2.393455922120065, + "grad_norm": 1.1167362928390503, + "learning_rate": 1.2057738729450793e-05, + "loss": 1.9299, + "mean_token_accuracy": 0.5854796171188354, + "num_tokens": 4525144761.0, + "step": 8851 + }, + { + "epoch": 2.3937263385613847, + "grad_norm": 0.9755887985229492, + "learning_rate": 1.2056173149501333e-05, + "loss": 1.7303, + "mean_token_accuracy": 0.5971873998641968, + "num_tokens": 4525669003.0, + "step": 8852 + }, + { + "epoch": 2.3939967550027044, + "grad_norm": 1.1394484043121338, + "learning_rate": 1.2054607537145373e-05, + "loss": 1.7635, + "mean_token_accuracy": 0.5862050652503967, + "num_tokens": 4526193152.0, + "step": 8853 + }, + { + "epoch": 2.3942671714440236, + "grad_norm": 1.0865952968597412, + "learning_rate": 1.2053041892430954e-05, + "loss": 1.9206, + "mean_token_accuracy": 0.563169002532959, + "num_tokens": 4526717351.0, + "step": 8854 + }, + { + "epoch": 2.3945375878853437, + "grad_norm": 1.1684657335281372, + "learning_rate": 1.2051476215406112e-05, + "loss": 1.9654, + "mean_token_accuracy": 0.5464879274368286, + "num_tokens": 4527241549.0, + "step": 8855 + }, + { + "epoch": 2.394808004326663, + "grad_norm": 1.1364412307739258, + "learning_rate": 1.2049910506118886e-05, + "loss": 1.8329, + "mean_token_accuracy": 0.5632455945014954, + "num_tokens": 4527765656.0, + "step": 8856 + }, + { + "epoch": 2.395078420767983, + "grad_norm": 1.0190290212631226, + "learning_rate": 1.2048344764617326e-05, + "loss": 1.8746, + "mean_token_accuracy": 0.5752747058868408, + "num_tokens": 4528289758.0, + "step": 8857 + }, + { + "epoch": 2.395348837209302, + "grad_norm": 1.125032901763916, + "learning_rate": 1.2046778990949457e-05, + "loss": 1.9216, + "mean_token_accuracy": 0.5612286329269409, + "num_tokens": 4528768954.0, + "step": 8858 + }, + { + "epoch": 2.395619253650622, + "grad_norm": 1.160944938659668, + "learning_rate": 1.2045213185163334e-05, + "loss": 1.8854, + "mean_token_accuracy": 0.5718766450881958, + "num_tokens": 4529293143.0, + "step": 8859 + }, + { + "epoch": 2.3958896700919414, + "grad_norm": 1.2082939147949219, + "learning_rate": 1.2043647347306994e-05, + "loss": 1.9782, + "mean_token_accuracy": 0.546169638633728, + "num_tokens": 4529817397.0, + "step": 8860 + }, + { + "epoch": 2.396160086533261, + "grad_norm": 0.5810810923576355, + "learning_rate": 1.2042081477428487e-05, + "loss": 1.1692, + "mean_token_accuracy": 0.6866676211357117, + "num_tokens": 4530284368.0, + "step": 8861 + }, + { + "epoch": 2.3964305029745807, + "grad_norm": 1.5079256296157837, + "learning_rate": 1.2040515575575852e-05, + "loss": 1.8701, + "mean_token_accuracy": 0.5424777269363403, + "num_tokens": 4530808538.0, + "step": 8862 + }, + { + "epoch": 2.3967009194159004, + "grad_norm": 1.3265047073364258, + "learning_rate": 1.2038949641797144e-05, + "loss": 2.0259, + "mean_token_accuracy": 0.5373613834381104, + "num_tokens": 4531332802.0, + "step": 8863 + }, + { + "epoch": 2.39697133585722, + "grad_norm": 0.8608620762825012, + "learning_rate": 1.2037383676140404e-05, + "loss": 1.9336, + "mean_token_accuracy": 0.5661603212356567, + "num_tokens": 4531857080.0, + "step": 8864 + }, + { + "epoch": 2.3972417522985396, + "grad_norm": 1.2391375303268433, + "learning_rate": 1.203581767865368e-05, + "loss": 1.832, + "mean_token_accuracy": 0.5777108073234558, + "num_tokens": 4532381259.0, + "step": 8865 + }, + { + "epoch": 2.3975121687398593, + "grad_norm": 1.2016470432281494, + "learning_rate": 1.2034251649385028e-05, + "loss": 1.8566, + "mean_token_accuracy": 0.5678623914718628, + "num_tokens": 4532905493.0, + "step": 8866 + }, + { + "epoch": 2.397782585181179, + "grad_norm": 1.0785398483276367, + "learning_rate": 1.2032685588382489e-05, + "loss": 1.8061, + "mean_token_accuracy": 0.589907169342041, + "num_tokens": 4533429575.0, + "step": 8867 + }, + { + "epoch": 2.3980530016224986, + "grad_norm": 1.0358606576919556, + "learning_rate": 1.203111949569412e-05, + "loss": 1.8986, + "mean_token_accuracy": 0.5763367414474487, + "num_tokens": 4533913311.0, + "step": 8868 + }, + { + "epoch": 2.398323418063818, + "grad_norm": 1.3811925649642944, + "learning_rate": 1.2029553371367974e-05, + "loss": 1.8481, + "mean_token_accuracy": 0.5979373455047607, + "num_tokens": 4534406230.0, + "step": 8869 + }, + { + "epoch": 2.398593834505138, + "grad_norm": 1.2019749879837036, + "learning_rate": 1.2027987215452104e-05, + "loss": 1.9474, + "mean_token_accuracy": 0.5504918098449707, + "num_tokens": 4534930436.0, + "step": 8870 + }, + { + "epoch": 2.3988642509464575, + "grad_norm": 1.0879963636398315, + "learning_rate": 1.2026421027994562e-05, + "loss": 1.9084, + "mean_token_accuracy": 0.5712500810623169, + "num_tokens": 4535454617.0, + "step": 8871 + }, + { + "epoch": 2.399134667387777, + "grad_norm": 0.9868953227996826, + "learning_rate": 1.2024854809043404e-05, + "loss": 1.9029, + "mean_token_accuracy": 0.5623798370361328, + "num_tokens": 4535972787.0, + "step": 8872 + }, + { + "epoch": 2.399405083829097, + "grad_norm": 1.2244689464569092, + "learning_rate": 1.2023288558646687e-05, + "loss": 2.0018, + "mean_token_accuracy": 0.5543711185455322, + "num_tokens": 4536496977.0, + "step": 8873 + }, + { + "epoch": 2.3996755002704164, + "grad_norm": 1.3845423460006714, + "learning_rate": 1.2021722276852465e-05, + "loss": 1.8941, + "mean_token_accuracy": 0.5885506868362427, + "num_tokens": 4536960707.0, + "step": 8874 + }, + { + "epoch": 2.399945916711736, + "grad_norm": 1.2319462299346924, + "learning_rate": 1.2020155963708802e-05, + "loss": 1.8657, + "mean_token_accuracy": 0.5606167316436768, + "num_tokens": 4537477222.0, + "step": 8875 + }, + { + "epoch": 2.4002163331530557, + "grad_norm": 1.290503978729248, + "learning_rate": 1.2018589619263752e-05, + "loss": 1.8869, + "mean_token_accuracy": 0.5530780553817749, + "num_tokens": 4538001450.0, + "step": 8876 + }, + { + "epoch": 2.4004867495943754, + "grad_norm": 1.0672104358673096, + "learning_rate": 1.2017023243565383e-05, + "loss": 1.7914, + "mean_token_accuracy": 0.5692269802093506, + "num_tokens": 4538525628.0, + "step": 8877 + }, + { + "epoch": 2.400757166035695, + "grad_norm": 1.5782511234283447, + "learning_rate": 1.201545683666174e-05, + "loss": 2.0275, + "mean_token_accuracy": 0.5473774671554565, + "num_tokens": 4539018720.0, + "step": 8878 + }, + { + "epoch": 2.4010275824770146, + "grad_norm": 1.4460442066192627, + "learning_rate": 1.20138903986009e-05, + "loss": 1.97, + "mean_token_accuracy": 0.5544713735580444, + "num_tokens": 4539488169.0, + "step": 8879 + }, + { + "epoch": 2.4012979989183343, + "grad_norm": 1.298271656036377, + "learning_rate": 1.2012323929430925e-05, + "loss": 1.8575, + "mean_token_accuracy": 0.5722326636314392, + "num_tokens": 4539992685.0, + "step": 8880 + }, + { + "epoch": 2.401568415359654, + "grad_norm": 0.5952913165092468, + "learning_rate": 1.2010757429199866e-05, + "loss": 1.0891, + "mean_token_accuracy": 0.7094920873641968, + "num_tokens": 4540490689.0, + "step": 8881 + }, + { + "epoch": 2.4018388318009736, + "grad_norm": 1.8009880781173706, + "learning_rate": 1.2009190897955802e-05, + "loss": 1.9278, + "mean_token_accuracy": 0.574108362197876, + "num_tokens": 4540961092.0, + "step": 8882 + }, + { + "epoch": 2.402109248242293, + "grad_norm": 1.5685319900512695, + "learning_rate": 1.2007624335746793e-05, + "loss": 1.9624, + "mean_token_accuracy": 0.538845419883728, + "num_tokens": 4541485282.0, + "step": 8883 + }, + { + "epoch": 2.402379664683613, + "grad_norm": 1.2161945104599, + "learning_rate": 1.2006057742620906e-05, + "loss": 1.9961, + "mean_token_accuracy": 0.544329047203064, + "num_tokens": 4542009557.0, + "step": 8884 + }, + { + "epoch": 2.4026500811249325, + "grad_norm": 1.435107707977295, + "learning_rate": 1.200449111862621e-05, + "loss": 1.8502, + "mean_token_accuracy": 0.5637422800064087, + "num_tokens": 4542468946.0, + "step": 8885 + }, + { + "epoch": 2.402920497566252, + "grad_norm": 3.8276844024658203, + "learning_rate": 1.2002924463810772e-05, + "loss": 1.7879, + "mean_token_accuracy": 0.5965017080307007, + "num_tokens": 4542937566.0, + "step": 8886 + }, + { + "epoch": 2.403190914007572, + "grad_norm": 1.4510308504104614, + "learning_rate": 1.2001357778222661e-05, + "loss": 1.7192, + "mean_token_accuracy": 0.5917651653289795, + "num_tokens": 4543461821.0, + "step": 8887 + }, + { + "epoch": 2.4034613304488914, + "grad_norm": 1.4251207113265991, + "learning_rate": 1.1999791061909952e-05, + "loss": 1.9464, + "mean_token_accuracy": 0.5431364178657532, + "num_tokens": 4543947344.0, + "step": 8888 + }, + { + "epoch": 2.403731746890211, + "grad_norm": 1.2630819082260132, + "learning_rate": 1.1998224314920708e-05, + "loss": 1.7028, + "mean_token_accuracy": 0.6163640022277832, + "num_tokens": 4544471557.0, + "step": 8889 + }, + { + "epoch": 2.4040021633315307, + "grad_norm": 1.2804243564605713, + "learning_rate": 1.1996657537303008e-05, + "loss": 1.9829, + "mean_token_accuracy": 0.5605607628822327, + "num_tokens": 4544977554.0, + "step": 8890 + }, + { + "epoch": 2.4042725797728504, + "grad_norm": 1.2617415189743042, + "learning_rate": 1.1995090729104923e-05, + "loss": 1.7322, + "mean_token_accuracy": 0.5882933735847473, + "num_tokens": 4545440741.0, + "step": 8891 + }, + { + "epoch": 2.40454299621417, + "grad_norm": 1.4223201274871826, + "learning_rate": 1.199352389037453e-05, + "loss": 1.7835, + "mean_token_accuracy": 0.5444735884666443, + "num_tokens": 4545965023.0, + "step": 8892 + }, + { + "epoch": 2.4048134126554896, + "grad_norm": 1.7430592775344849, + "learning_rate": 1.1991957021159903e-05, + "loss": 1.9179, + "mean_token_accuracy": 0.5739153623580933, + "num_tokens": 4546405703.0, + "step": 8893 + }, + { + "epoch": 2.4050838290968093, + "grad_norm": 1.488323450088501, + "learning_rate": 1.1990390121509115e-05, + "loss": 2.0017, + "mean_token_accuracy": 0.5425846576690674, + "num_tokens": 4546929942.0, + "step": 8894 + }, + { + "epoch": 2.4053542455381285, + "grad_norm": 1.4090007543563843, + "learning_rate": 1.1988823191470247e-05, + "loss": 1.8729, + "mean_token_accuracy": 0.5914402604103088, + "num_tokens": 4547389551.0, + "step": 8895 + }, + { + "epoch": 2.4056246619794486, + "grad_norm": 1.4736489057540894, + "learning_rate": 1.1987256231091373e-05, + "loss": 1.83, + "mean_token_accuracy": 0.5749572515487671, + "num_tokens": 4547872498.0, + "step": 8896 + }, + { + "epoch": 2.4058950784207678, + "grad_norm": 1.5420743227005005, + "learning_rate": 1.198568924042058e-05, + "loss": 1.9616, + "mean_token_accuracy": 0.5491281747817993, + "num_tokens": 4548396689.0, + "step": 8897 + }, + { + "epoch": 2.406165494862088, + "grad_norm": 1.3137590885162354, + "learning_rate": 1.1984122219505937e-05, + "loss": 1.8546, + "mean_token_accuracy": 0.5826509594917297, + "num_tokens": 4548887176.0, + "step": 8898 + }, + { + "epoch": 2.406435911303407, + "grad_norm": 1.4690132141113281, + "learning_rate": 1.1982555168395532e-05, + "loss": 1.9192, + "mean_token_accuracy": 0.577013373374939, + "num_tokens": 4549411446.0, + "step": 8899 + }, + { + "epoch": 2.4067063277447267, + "grad_norm": 1.7117183208465576, + "learning_rate": 1.1980988087137444e-05, + "loss": 1.9025, + "mean_token_accuracy": 0.5498151779174805, + "num_tokens": 4549935730.0, + "step": 8900 + }, + { + "epoch": 2.4069767441860463, + "grad_norm": 0.5276784896850586, + "learning_rate": 1.197942097577976e-05, + "loss": 1.1015, + "mean_token_accuracy": 0.7036463022232056, + "num_tokens": 4550460006.0, + "step": 8901 + }, + { + "epoch": 2.407247160627366, + "grad_norm": 1.7074288129806519, + "learning_rate": 1.1977853834370556e-05, + "loss": 1.9511, + "mean_token_accuracy": 0.5612868070602417, + "num_tokens": 4550984140.0, + "step": 8902 + }, + { + "epoch": 2.4075175770686856, + "grad_norm": 1.525604486465454, + "learning_rate": 1.1976286662957922e-05, + "loss": 1.848, + "mean_token_accuracy": 0.5790227651596069, + "num_tokens": 4551508266.0, + "step": 8903 + }, + { + "epoch": 2.4077879935100053, + "grad_norm": 1.4200139045715332, + "learning_rate": 1.1974719461589943e-05, + "loss": 1.8574, + "mean_token_accuracy": 0.5861676335334778, + "num_tokens": 4552032486.0, + "step": 8904 + }, + { + "epoch": 2.408058409951325, + "grad_norm": 1.545750617980957, + "learning_rate": 1.1973152230314702e-05, + "loss": 1.863, + "mean_token_accuracy": 0.5596557855606079, + "num_tokens": 4552556684.0, + "step": 8905 + }, + { + "epoch": 2.4083288263926446, + "grad_norm": 2.006399631500244, + "learning_rate": 1.1971584969180289e-05, + "loss": 1.8243, + "mean_token_accuracy": 0.5811876058578491, + "num_tokens": 4553080956.0, + "step": 8906 + }, + { + "epoch": 2.408599242833964, + "grad_norm": 1.2057336568832397, + "learning_rate": 1.1970017678234797e-05, + "loss": 1.8727, + "mean_token_accuracy": 0.572053074836731, + "num_tokens": 4553558954.0, + "step": 8907 + }, + { + "epoch": 2.408869659275284, + "grad_norm": 1.3522696495056152, + "learning_rate": 1.1968450357526307e-05, + "loss": 1.6905, + "mean_token_accuracy": 0.6187427043914795, + "num_tokens": 4554019124.0, + "step": 8908 + }, + { + "epoch": 2.4091400757166035, + "grad_norm": 1.8425668478012085, + "learning_rate": 1.1966883007102912e-05, + "loss": 1.9198, + "mean_token_accuracy": 0.5504728555679321, + "num_tokens": 4554543275.0, + "step": 8909 + }, + { + "epoch": 2.409410492157923, + "grad_norm": 1.5887548923492432, + "learning_rate": 1.1965315627012702e-05, + "loss": 1.96, + "mean_token_accuracy": 0.5414485335350037, + "num_tokens": 4555067558.0, + "step": 8910 + }, + { + "epoch": 2.4096809085992428, + "grad_norm": 1.187316656112671, + "learning_rate": 1.196374821730377e-05, + "loss": 1.8867, + "mean_token_accuracy": 0.5741173028945923, + "num_tokens": 4555591795.0, + "step": 8911 + }, + { + "epoch": 2.4099513250405624, + "grad_norm": 1.6144509315490723, + "learning_rate": 1.1962180778024208e-05, + "loss": 1.9, + "mean_token_accuracy": 0.5528863668441772, + "num_tokens": 4556116041.0, + "step": 8912 + }, + { + "epoch": 2.410221741481882, + "grad_norm": 1.5385124683380127, + "learning_rate": 1.1960613309222111e-05, + "loss": 1.9004, + "mean_token_accuracy": 0.5653353929519653, + "num_tokens": 4556640151.0, + "step": 8913 + }, + { + "epoch": 2.4104921579232017, + "grad_norm": 1.2133618593215942, + "learning_rate": 1.1959045810945573e-05, + "loss": 1.9132, + "mean_token_accuracy": 0.5569239854812622, + "num_tokens": 4557164219.0, + "step": 8914 + }, + { + "epoch": 2.4107625743645213, + "grad_norm": 1.6046154499053955, + "learning_rate": 1.195747828324269e-05, + "loss": 1.9485, + "mean_token_accuracy": 0.5696812868118286, + "num_tokens": 4557688353.0, + "step": 8915 + }, + { + "epoch": 2.411032990805841, + "grad_norm": 1.7199926376342773, + "learning_rate": 1.1955910726161555e-05, + "loss": 1.8464, + "mean_token_accuracy": 0.5618501901626587, + "num_tokens": 4558212570.0, + "step": 8916 + }, + { + "epoch": 2.4113034072471606, + "grad_norm": 1.1888712644577026, + "learning_rate": 1.1954343139750268e-05, + "loss": 1.7962, + "mean_token_accuracy": 0.5939140319824219, + "num_tokens": 4558667403.0, + "step": 8917 + }, + { + "epoch": 2.4115738236884803, + "grad_norm": 1.4879506826400757, + "learning_rate": 1.1952775524056928e-05, + "loss": 1.8685, + "mean_token_accuracy": 0.5706149339675903, + "num_tokens": 4559191613.0, + "step": 8918 + }, + { + "epoch": 2.4118442401298, + "grad_norm": 1.3516656160354614, + "learning_rate": 1.1951207879129632e-05, + "loss": 1.7702, + "mean_token_accuracy": 0.5931386947631836, + "num_tokens": 4559657792.0, + "step": 8919 + }, + { + "epoch": 2.4121146565711196, + "grad_norm": 1.2003039121627808, + "learning_rate": 1.1949640205016486e-05, + "loss": 1.9061, + "mean_token_accuracy": 0.5654587149620056, + "num_tokens": 4560181991.0, + "step": 8920 + }, + { + "epoch": 2.412385073012439, + "grad_norm": 0.487751305103302, + "learning_rate": 1.1948072501765579e-05, + "loss": 1.0627, + "mean_token_accuracy": 0.726388692855835, + "num_tokens": 4560641834.0, + "step": 8921 + }, + { + "epoch": 2.412655489453759, + "grad_norm": 2.0197556018829346, + "learning_rate": 1.1946504769425025e-05, + "loss": 2.0284, + "mean_token_accuracy": 0.5377427935600281, + "num_tokens": 4561166072.0, + "step": 8922 + }, + { + "epoch": 2.4129259058950785, + "grad_norm": 1.6772669553756714, + "learning_rate": 1.1944937008042918e-05, + "loss": 2.0307, + "mean_token_accuracy": 0.5471372604370117, + "num_tokens": 4561690206.0, + "step": 8923 + }, + { + "epoch": 2.413196322336398, + "grad_norm": 1.1700515747070312, + "learning_rate": 1.1943369217667364e-05, + "loss": 1.907, + "mean_token_accuracy": 0.5724653005599976, + "num_tokens": 4562199553.0, + "step": 8924 + }, + { + "epoch": 2.413466738777718, + "grad_norm": 1.2551579475402832, + "learning_rate": 1.1941801398346469e-05, + "loss": 1.915, + "mean_token_accuracy": 0.5786488056182861, + "num_tokens": 4562723729.0, + "step": 8925 + }, + { + "epoch": 2.4137371552190374, + "grad_norm": 1.2938737869262695, + "learning_rate": 1.194023355012834e-05, + "loss": 1.8978, + "mean_token_accuracy": 0.5762332677841187, + "num_tokens": 4563221577.0, + "step": 8926 + }, + { + "epoch": 2.414007571660357, + "grad_norm": 1.1207585334777832, + "learning_rate": 1.1938665673061076e-05, + "loss": 1.811, + "mean_token_accuracy": 0.5854174494743347, + "num_tokens": 4563684661.0, + "step": 8927 + }, + { + "epoch": 2.4142779881016767, + "grad_norm": 1.1868473291397095, + "learning_rate": 1.1937097767192792e-05, + "loss": 1.8087, + "mean_token_accuracy": 0.572659432888031, + "num_tokens": 4564182368.0, + "step": 8928 + }, + { + "epoch": 2.4145484045429964, + "grad_norm": 1.158604621887207, + "learning_rate": 1.1935529832571593e-05, + "loss": 1.9089, + "mean_token_accuracy": 0.5441974401473999, + "num_tokens": 4564706593.0, + "step": 8929 + }, + { + "epoch": 2.414818820984316, + "grad_norm": 1.4895983934402466, + "learning_rate": 1.1933961869245585e-05, + "loss": 1.9671, + "mean_token_accuracy": 0.5612094402313232, + "num_tokens": 4565182891.0, + "step": 8930 + }, + { + "epoch": 2.4150892374256356, + "grad_norm": 1.1646502017974854, + "learning_rate": 1.193239387726288e-05, + "loss": 1.7933, + "mean_token_accuracy": 0.5818055272102356, + "num_tokens": 4565706964.0, + "step": 8931 + }, + { + "epoch": 2.4153596538669553, + "grad_norm": 1.2493975162506104, + "learning_rate": 1.193082585667159e-05, + "loss": 1.8886, + "mean_token_accuracy": 0.5646411776542664, + "num_tokens": 4566231084.0, + "step": 8932 + }, + { + "epoch": 2.415630070308275, + "grad_norm": 1.2830809354782104, + "learning_rate": 1.192925780751983e-05, + "loss": 1.9755, + "mean_token_accuracy": 0.5503582954406738, + "num_tokens": 4566755254.0, + "step": 8933 + }, + { + "epoch": 2.4159004867495946, + "grad_norm": 1.1921687126159668, + "learning_rate": 1.1927689729855701e-05, + "loss": 1.9094, + "mean_token_accuracy": 0.5767344236373901, + "num_tokens": 4567221447.0, + "step": 8934 + }, + { + "epoch": 2.416170903190914, + "grad_norm": 1.2034227848052979, + "learning_rate": 1.1926121623727325e-05, + "loss": 1.9348, + "mean_token_accuracy": 0.5636041760444641, + "num_tokens": 4567707561.0, + "step": 8935 + }, + { + "epoch": 2.4164413196322334, + "grad_norm": 1.3546953201293945, + "learning_rate": 1.1924553489182815e-05, + "loss": 1.861, + "mean_token_accuracy": 0.5829629302024841, + "num_tokens": 4568231788.0, + "step": 8936 + }, + { + "epoch": 2.4167117360735535, + "grad_norm": 1.1885631084442139, + "learning_rate": 1.1922985326270286e-05, + "loss": 1.9478, + "mean_token_accuracy": 0.5657697916030884, + "num_tokens": 4568756037.0, + "step": 8937 + }, + { + "epoch": 2.4169821525148727, + "grad_norm": 1.1662927865982056, + "learning_rate": 1.1921417135037849e-05, + "loss": 1.8074, + "mean_token_accuracy": 0.5883947014808655, + "num_tokens": 4569229709.0, + "step": 8938 + }, + { + "epoch": 2.417252568956193, + "grad_norm": 1.5316264629364014, + "learning_rate": 1.1919848915533627e-05, + "loss": 1.8677, + "mean_token_accuracy": 0.5830698013305664, + "num_tokens": 4569753957.0, + "step": 8939 + }, + { + "epoch": 2.417522985397512, + "grad_norm": 1.148959994316101, + "learning_rate": 1.1918280667805735e-05, + "loss": 1.9087, + "mean_token_accuracy": 0.5731912851333618, + "num_tokens": 4570218943.0, + "step": 8940 + }, + { + "epoch": 2.4177934018388316, + "grad_norm": 0.44394710659980774, + "learning_rate": 1.1916712391902292e-05, + "loss": 1.1277, + "mean_token_accuracy": 0.7028602361679077, + "num_tokens": 4570743120.0, + "step": 8941 + }, + { + "epoch": 2.4180638182801513, + "grad_norm": 1.7836962938308716, + "learning_rate": 1.1915144087871416e-05, + "loss": 1.8254, + "mean_token_accuracy": 0.5796342492103577, + "num_tokens": 4571267309.0, + "step": 8942 + }, + { + "epoch": 2.418334234721471, + "grad_norm": 1.6732375621795654, + "learning_rate": 1.1913575755761231e-05, + "loss": 1.889, + "mean_token_accuracy": 0.5545944571495056, + "num_tokens": 4571791584.0, + "step": 8943 + }, + { + "epoch": 2.4186046511627906, + "grad_norm": 1.2591383457183838, + "learning_rate": 1.1912007395619851e-05, + "loss": 1.9583, + "mean_token_accuracy": 0.5580888390541077, + "num_tokens": 4572315831.0, + "step": 8944 + }, + { + "epoch": 2.41887506760411, + "grad_norm": 1.2853302955627441, + "learning_rate": 1.1910439007495405e-05, + "loss": 1.7954, + "mean_token_accuracy": 0.6081644892692566, + "num_tokens": 4572775166.0, + "step": 8945 + }, + { + "epoch": 2.41914548404543, + "grad_norm": 1.7902852296829224, + "learning_rate": 1.190887059143601e-05, + "loss": 1.9129, + "mean_token_accuracy": 0.5774146318435669, + "num_tokens": 4573299440.0, + "step": 8946 + }, + { + "epoch": 2.4194159004867495, + "grad_norm": 1.319292664527893, + "learning_rate": 1.1907302147489794e-05, + "loss": 1.9464, + "mean_token_accuracy": 0.5537097454071045, + "num_tokens": 4573802225.0, + "step": 8947 + }, + { + "epoch": 2.419686316928069, + "grad_norm": 1.111337661743164, + "learning_rate": 1.1905733675704881e-05, + "loss": 1.8857, + "mean_token_accuracy": 0.5755321383476257, + "num_tokens": 4574326338.0, + "step": 8948 + }, + { + "epoch": 2.4199567333693888, + "grad_norm": 1.1594103574752808, + "learning_rate": 1.1904165176129395e-05, + "loss": 1.8874, + "mean_token_accuracy": 0.5758249163627625, + "num_tokens": 4574802281.0, + "step": 8949 + }, + { + "epoch": 2.4202271498107084, + "grad_norm": 1.2720898389816284, + "learning_rate": 1.1902596648811462e-05, + "loss": 1.9551, + "mean_token_accuracy": 0.5422565937042236, + "num_tokens": 4575326329.0, + "step": 8950 + }, + { + "epoch": 2.420497566252028, + "grad_norm": 1.1663093566894531, + "learning_rate": 1.1901028093799209e-05, + "loss": 1.8878, + "mean_token_accuracy": 0.5530622005462646, + "num_tokens": 4575801971.0, + "step": 8951 + }, + { + "epoch": 2.4207679826933477, + "grad_norm": 1.251861810684204, + "learning_rate": 1.1899459511140763e-05, + "loss": 1.9718, + "mean_token_accuracy": 0.5511837005615234, + "num_tokens": 4576326242.0, + "step": 8952 + }, + { + "epoch": 2.4210383991346673, + "grad_norm": 1.1866868734359741, + "learning_rate": 1.1897890900884255e-05, + "loss": 1.9776, + "mean_token_accuracy": 0.547038197517395, + "num_tokens": 4576850419.0, + "step": 8953 + }, + { + "epoch": 2.421308815575987, + "grad_norm": 0.9813746809959412, + "learning_rate": 1.1896322263077815e-05, + "loss": 1.8475, + "mean_token_accuracy": 0.5725296139717102, + "num_tokens": 4577374697.0, + "step": 8954 + }, + { + "epoch": 2.4215792320173066, + "grad_norm": 1.2528293132781982, + "learning_rate": 1.1894753597769572e-05, + "loss": 1.9943, + "mean_token_accuracy": 0.5705960988998413, + "num_tokens": 4577842472.0, + "step": 8955 + }, + { + "epoch": 2.4218496484586263, + "grad_norm": 0.9595189094543457, + "learning_rate": 1.1893184905007658e-05, + "loss": 1.9385, + "mean_token_accuracy": 0.5627388954162598, + "num_tokens": 4578366702.0, + "step": 8956 + }, + { + "epoch": 2.422120064899946, + "grad_norm": 1.0849025249481201, + "learning_rate": 1.1891616184840203e-05, + "loss": 1.8647, + "mean_token_accuracy": 0.5624957084655762, + "num_tokens": 4578890987.0, + "step": 8957 + }, + { + "epoch": 2.4223904813412656, + "grad_norm": 1.2531700134277344, + "learning_rate": 1.189004743731534e-05, + "loss": 2.0096, + "mean_token_accuracy": 0.5550930500030518, + "num_tokens": 4579415213.0, + "step": 8958 + }, + { + "epoch": 2.422660897782585, + "grad_norm": 1.1469637155532837, + "learning_rate": 1.1888478662481205e-05, + "loss": 1.9367, + "mean_token_accuracy": 0.5719770789146423, + "num_tokens": 4579915215.0, + "step": 8959 + }, + { + "epoch": 2.422931314223905, + "grad_norm": 1.0697088241577148, + "learning_rate": 1.1886909860385931e-05, + "loss": 1.8933, + "mean_token_accuracy": 0.5504131317138672, + "num_tokens": 4580439462.0, + "step": 8960 + }, + { + "epoch": 2.4232017306652245, + "grad_norm": 0.5631533265113831, + "learning_rate": 1.1885341031077657e-05, + "loss": 1.1256, + "mean_token_accuracy": 0.7017562389373779, + "num_tokens": 4580963743.0, + "step": 8961 + }, + { + "epoch": 2.423472147106544, + "grad_norm": 2.009711742401123, + "learning_rate": 1.1883772174604515e-05, + "loss": 1.9782, + "mean_token_accuracy": 0.5555338263511658, + "num_tokens": 4581426951.0, + "step": 8962 + }, + { + "epoch": 2.4237425635478638, + "grad_norm": 1.3673439025878906, + "learning_rate": 1.1882203291014647e-05, + "loss": 1.8906, + "mean_token_accuracy": 0.5737922191619873, + "num_tokens": 4581951182.0, + "step": 8963 + }, + { + "epoch": 2.4240129799891834, + "grad_norm": 1.1700780391693115, + "learning_rate": 1.1880634380356182e-05, + "loss": 1.9804, + "mean_token_accuracy": 0.5461671352386475, + "num_tokens": 4582475353.0, + "step": 8964 + }, + { + "epoch": 2.424283396430503, + "grad_norm": 1.2987170219421387, + "learning_rate": 1.1879065442677267e-05, + "loss": 1.8385, + "mean_token_accuracy": 0.5666000843048096, + "num_tokens": 4582999639.0, + "step": 8965 + }, + { + "epoch": 2.4245538128718227, + "grad_norm": 1.0903325080871582, + "learning_rate": 1.1877496478026039e-05, + "loss": 1.8341, + "mean_token_accuracy": 0.5536403656005859, + "num_tokens": 4583523852.0, + "step": 8966 + }, + { + "epoch": 2.4248242293131423, + "grad_norm": 1.1678909063339233, + "learning_rate": 1.1875927486450639e-05, + "loss": 1.9059, + "mean_token_accuracy": 0.5587043762207031, + "num_tokens": 4583999019.0, + "step": 8967 + }, + { + "epoch": 2.425094645754462, + "grad_norm": 1.1769194602966309, + "learning_rate": 1.1874358467999205e-05, + "loss": 1.8425, + "mean_token_accuracy": 0.5667948722839355, + "num_tokens": 4584523218.0, + "step": 8968 + }, + { + "epoch": 2.4253650621957816, + "grad_norm": 1.0854008197784424, + "learning_rate": 1.1872789422719887e-05, + "loss": 1.8494, + "mean_token_accuracy": 0.5784378051757812, + "num_tokens": 4585047489.0, + "step": 8969 + }, + { + "epoch": 2.4256354786371013, + "grad_norm": 1.087246060371399, + "learning_rate": 1.1871220350660819e-05, + "loss": 1.9307, + "mean_token_accuracy": 0.5797994136810303, + "num_tokens": 4585514849.0, + "step": 8970 + }, + { + "epoch": 2.425905895078421, + "grad_norm": 1.2603245973587036, + "learning_rate": 1.1869651251870151e-05, + "loss": 1.8611, + "mean_token_accuracy": 0.5680537223815918, + "num_tokens": 4586039070.0, + "step": 8971 + }, + { + "epoch": 2.4261763115197406, + "grad_norm": 1.0955145359039307, + "learning_rate": 1.1868082126396023e-05, + "loss": 1.8109, + "mean_token_accuracy": 0.5735286474227905, + "num_tokens": 4586515470.0, + "step": 8972 + }, + { + "epoch": 2.42644672796106, + "grad_norm": 1.373931646347046, + "learning_rate": 1.1866512974286583e-05, + "loss": 1.9171, + "mean_token_accuracy": 0.5372462272644043, + "num_tokens": 4587039531.0, + "step": 8973 + }, + { + "epoch": 2.42671714440238, + "grad_norm": 1.31002938747406, + "learning_rate": 1.1864943795589973e-05, + "loss": 1.9308, + "mean_token_accuracy": 0.5691062808036804, + "num_tokens": 4587543334.0, + "step": 8974 + }, + { + "epoch": 2.4269875608436995, + "grad_norm": 1.2332918643951416, + "learning_rate": 1.186337459035435e-05, + "loss": 1.9307, + "mean_token_accuracy": 0.5691307783126831, + "num_tokens": 4588036930.0, + "step": 8975 + }, + { + "epoch": 2.427257977285019, + "grad_norm": 1.5438957214355469, + "learning_rate": 1.1861805358627855e-05, + "loss": 1.9148, + "mean_token_accuracy": 0.5696943402290344, + "num_tokens": 4588561088.0, + "step": 8976 + }, + { + "epoch": 2.4275283937263383, + "grad_norm": 1.1089171171188354, + "learning_rate": 1.1860236100458633e-05, + "loss": 1.8896, + "mean_token_accuracy": 0.5666732788085938, + "num_tokens": 4589085325.0, + "step": 8977 + }, + { + "epoch": 2.4277988101676584, + "grad_norm": 1.215173363685608, + "learning_rate": 1.1858666815894841e-05, + "loss": 1.9349, + "mean_token_accuracy": 0.5495741367340088, + "num_tokens": 4589607524.0, + "step": 8978 + }, + { + "epoch": 2.4280692266089776, + "grad_norm": 1.5606919527053833, + "learning_rate": 1.1857097504984626e-05, + "loss": 1.8541, + "mean_token_accuracy": 0.5739429593086243, + "num_tokens": 4590099786.0, + "step": 8979 + }, + { + "epoch": 2.4283396430502977, + "grad_norm": 1.2157528400421143, + "learning_rate": 1.1855528167776138e-05, + "loss": 1.8215, + "mean_token_accuracy": 0.5937049388885498, + "num_tokens": 4590581276.0, + "step": 8980 + }, + { + "epoch": 2.428610059491617, + "grad_norm": 0.6575132012367249, + "learning_rate": 1.1853958804317527e-05, + "loss": 1.0403, + "mean_token_accuracy": 0.7276147603988647, + "num_tokens": 4591096719.0, + "step": 8981 + }, + { + "epoch": 2.4288804759329365, + "grad_norm": 2.3815760612487793, + "learning_rate": 1.1852389414656953e-05, + "loss": 1.8248, + "mean_token_accuracy": 0.5662786960601807, + "num_tokens": 4591577049.0, + "step": 8982 + }, + { + "epoch": 2.429150892374256, + "grad_norm": 2.3794963359832764, + "learning_rate": 1.1850819998842565e-05, + "loss": 1.8236, + "mean_token_accuracy": 0.5715136528015137, + "num_tokens": 4592101305.0, + "step": 8983 + }, + { + "epoch": 2.429421308815576, + "grad_norm": 1.9119865894317627, + "learning_rate": 1.1849250556922513e-05, + "loss": 1.9432, + "mean_token_accuracy": 0.5640009045600891, + "num_tokens": 4592589510.0, + "step": 8984 + }, + { + "epoch": 2.4296917252568955, + "grad_norm": 1.2127739191055298, + "learning_rate": 1.1847681088944963e-05, + "loss": 1.9405, + "mean_token_accuracy": 0.559353768825531, + "num_tokens": 4593077875.0, + "step": 8985 + }, + { + "epoch": 2.429962141698215, + "grad_norm": 2.0584909915924072, + "learning_rate": 1.1846111594958058e-05, + "loss": 1.8372, + "mean_token_accuracy": 0.5904059410095215, + "num_tokens": 4593562200.0, + "step": 8986 + }, + { + "epoch": 2.4302325581395348, + "grad_norm": 1.7554248571395874, + "learning_rate": 1.1844542075009965e-05, + "loss": 1.9251, + "mean_token_accuracy": 0.5596786141395569, + "num_tokens": 4594049983.0, + "step": 8987 + }, + { + "epoch": 2.4305029745808544, + "grad_norm": 1.421716332435608, + "learning_rate": 1.1842972529148834e-05, + "loss": 1.9376, + "mean_token_accuracy": 0.5479075312614441, + "num_tokens": 4594574246.0, + "step": 8988 + }, + { + "epoch": 2.430773391022174, + "grad_norm": 1.8711321353912354, + "learning_rate": 1.184140295742283e-05, + "loss": 1.945, + "mean_token_accuracy": 0.564987301826477, + "num_tokens": 4595070975.0, + "step": 8989 + }, + { + "epoch": 2.4310438074634937, + "grad_norm": 1.5604908466339111, + "learning_rate": 1.1839833359880106e-05, + "loss": 1.9077, + "mean_token_accuracy": 0.5571027994155884, + "num_tokens": 4595595119.0, + "step": 8990 + }, + { + "epoch": 2.4313142239048133, + "grad_norm": 1.4576765298843384, + "learning_rate": 1.1838263736568825e-05, + "loss": 1.8764, + "mean_token_accuracy": 0.5639885663986206, + "num_tokens": 4596119304.0, + "step": 8991 + }, + { + "epoch": 2.431584640346133, + "grad_norm": 1.4662326574325562, + "learning_rate": 1.1836694087537154e-05, + "loss": 1.8906, + "mean_token_accuracy": 0.5531408786773682, + "num_tokens": 4596643527.0, + "step": 8992 + }, + { + "epoch": 2.4318550567874526, + "grad_norm": 1.1008973121643066, + "learning_rate": 1.1835124412833239e-05, + "loss": 1.8732, + "mean_token_accuracy": 0.5634850263595581, + "num_tokens": 4597167773.0, + "step": 8993 + }, + { + "epoch": 2.4321254732287723, + "grad_norm": 1.4195345640182495, + "learning_rate": 1.1833554712505256e-05, + "loss": 1.9388, + "mean_token_accuracy": 0.5742349624633789, + "num_tokens": 4597665442.0, + "step": 8994 + }, + { + "epoch": 2.432395889670092, + "grad_norm": 1.3550949096679688, + "learning_rate": 1.1831984986601359e-05, + "loss": 1.7965, + "mean_token_accuracy": 0.5866942405700684, + "num_tokens": 4598189662.0, + "step": 8995 + }, + { + "epoch": 2.4326663061114115, + "grad_norm": 1.3648234605789185, + "learning_rate": 1.1830415235169718e-05, + "loss": 1.8994, + "mean_token_accuracy": 0.5829099416732788, + "num_tokens": 4598650674.0, + "step": 8996 + }, + { + "epoch": 2.432936722552731, + "grad_norm": 1.4425199031829834, + "learning_rate": 1.1828845458258498e-05, + "loss": 1.6317, + "mean_token_accuracy": 0.6243615746498108, + "num_tokens": 4599132239.0, + "step": 8997 + }, + { + "epoch": 2.433207138994051, + "grad_norm": 1.4452170133590698, + "learning_rate": 1.182727565591586e-05, + "loss": 1.9158, + "mean_token_accuracy": 0.5574350953102112, + "num_tokens": 4599656510.0, + "step": 8998 + }, + { + "epoch": 2.4334775554353705, + "grad_norm": 1.1513864994049072, + "learning_rate": 1.182570582818997e-05, + "loss": 1.8888, + "mean_token_accuracy": 0.5654515624046326, + "num_tokens": 4600180737.0, + "step": 8999 + }, + { + "epoch": 2.43374797187669, + "grad_norm": 1.221189022064209, + "learning_rate": 1.1824135975129001e-05, + "loss": 1.8638, + "mean_token_accuracy": 0.5661834478378296, + "num_tokens": 4600705017.0, + "step": 9000 + }, + { + "epoch": 2.4340183883180098, + "grad_norm": 0.47394028306007385, + "learning_rate": 1.1822566096781113e-05, + "loss": 1.1601, + "mean_token_accuracy": 0.6920697689056396, + "num_tokens": 4601229168.0, + "step": 9001 + }, + { + "epoch": 2.4342888047593294, + "grad_norm": 1.649195671081543, + "learning_rate": 1.1820996193194481e-05, + "loss": 1.9643, + "mean_token_accuracy": 0.5456517934799194, + "num_tokens": 4601753363.0, + "step": 9002 + }, + { + "epoch": 2.434559221200649, + "grad_norm": 1.3384110927581787, + "learning_rate": 1.1819426264417269e-05, + "loss": 1.8984, + "mean_token_accuracy": 0.5683744549751282, + "num_tokens": 4602277541.0, + "step": 9003 + }, + { + "epoch": 2.4348296376419687, + "grad_norm": 1.0382659435272217, + "learning_rate": 1.181785631049765e-05, + "loss": 1.8746, + "mean_token_accuracy": 0.5674443244934082, + "num_tokens": 4602801740.0, + "step": 9004 + }, + { + "epoch": 2.4351000540832883, + "grad_norm": 1.332210659980774, + "learning_rate": 1.1816286331483797e-05, + "loss": 1.8545, + "mean_token_accuracy": 0.5931648015975952, + "num_tokens": 4603325988.0, + "step": 9005 + }, + { + "epoch": 2.435370470524608, + "grad_norm": 1.4312900304794312, + "learning_rate": 1.1814716327423877e-05, + "loss": 1.9059, + "mean_token_accuracy": 0.5764898061752319, + "num_tokens": 4603850167.0, + "step": 9006 + }, + { + "epoch": 2.4356408869659276, + "grad_norm": 1.5206947326660156, + "learning_rate": 1.1813146298366063e-05, + "loss": 2.0246, + "mean_token_accuracy": 0.5555872917175293, + "num_tokens": 4604329217.0, + "step": 9007 + }, + { + "epoch": 2.4359113034072473, + "grad_norm": 1.357749104499817, + "learning_rate": 1.1811576244358534e-05, + "loss": 1.8366, + "mean_token_accuracy": 0.5857913494110107, + "num_tokens": 4604853412.0, + "step": 9008 + }, + { + "epoch": 2.436181719848567, + "grad_norm": 1.3683995008468628, + "learning_rate": 1.181000616544945e-05, + "loss": 1.9188, + "mean_token_accuracy": 0.5666894316673279, + "num_tokens": 4605377668.0, + "step": 9009 + }, + { + "epoch": 2.4364521362898865, + "grad_norm": 1.317604422569275, + "learning_rate": 1.1808436061687001e-05, + "loss": 1.8851, + "mean_token_accuracy": 0.575526237487793, + "num_tokens": 4605838842.0, + "step": 9010 + }, + { + "epoch": 2.436722552731206, + "grad_norm": 1.0559309720993042, + "learning_rate": 1.1806865933119353e-05, + "loss": 1.8796, + "mean_token_accuracy": 0.5660734176635742, + "num_tokens": 4606362962.0, + "step": 9011 + }, + { + "epoch": 2.436992969172526, + "grad_norm": 1.0625866651535034, + "learning_rate": 1.180529577979469e-05, + "loss": 1.8711, + "mean_token_accuracy": 0.5687131881713867, + "num_tokens": 4606885611.0, + "step": 9012 + }, + { + "epoch": 2.4372633856138455, + "grad_norm": 1.2941913604736328, + "learning_rate": 1.1803725601761183e-05, + "loss": 1.8395, + "mean_token_accuracy": 0.5815761089324951, + "num_tokens": 4607354140.0, + "step": 9013 + }, + { + "epoch": 2.437533802055165, + "grad_norm": 1.2029423713684082, + "learning_rate": 1.1802155399067009e-05, + "loss": 2.0022, + "mean_token_accuracy": 0.5369402170181274, + "num_tokens": 4607878303.0, + "step": 9014 + }, + { + "epoch": 2.4378042184964848, + "grad_norm": 1.1680128574371338, + "learning_rate": 1.1800585171760347e-05, + "loss": 1.8807, + "mean_token_accuracy": 0.5692589282989502, + "num_tokens": 4608402491.0, + "step": 9015 + }, + { + "epoch": 2.4380746349378044, + "grad_norm": 1.149923324584961, + "learning_rate": 1.179901491988938e-05, + "loss": 1.8137, + "mean_token_accuracy": 0.5746596455574036, + "num_tokens": 4608926757.0, + "step": 9016 + }, + { + "epoch": 2.438345051379124, + "grad_norm": 1.2540870904922485, + "learning_rate": 1.1797444643502286e-05, + "loss": 1.7995, + "mean_token_accuracy": 0.5837768316268921, + "num_tokens": 4609390668.0, + "step": 9017 + }, + { + "epoch": 2.4386154678204432, + "grad_norm": 1.2948274612426758, + "learning_rate": 1.1795874342647242e-05, + "loss": 1.9039, + "mean_token_accuracy": 0.5573089122772217, + "num_tokens": 4609862369.0, + "step": 9018 + }, + { + "epoch": 2.4388858842617633, + "grad_norm": 1.1931978464126587, + "learning_rate": 1.1794304017372435e-05, + "loss": 1.7171, + "mean_token_accuracy": 0.5872471928596497, + "num_tokens": 4610346774.0, + "step": 9019 + }, + { + "epoch": 2.4391563007030825, + "grad_norm": 1.1293586492538452, + "learning_rate": 1.1792733667726047e-05, + "loss": 1.9004, + "mean_token_accuracy": 0.5575060844421387, + "num_tokens": 4610870991.0, + "step": 9020 + }, + { + "epoch": 2.4394267171444026, + "grad_norm": 0.6786199808120728, + "learning_rate": 1.1791163293756256e-05, + "loss": 1.1475, + "mean_token_accuracy": 0.7057565450668335, + "num_tokens": 4611395207.0, + "step": 9021 + }, + { + "epoch": 2.439697133585722, + "grad_norm": 1.175367832183838, + "learning_rate": 1.1789592895511248e-05, + "loss": 1.8431, + "mean_token_accuracy": 0.5620086789131165, + "num_tokens": 4611919195.0, + "step": 9022 + }, + { + "epoch": 2.4399675500270415, + "grad_norm": 0.9995549321174622, + "learning_rate": 1.1788022473039212e-05, + "loss": 2.0155, + "mean_token_accuracy": 0.5377626419067383, + "num_tokens": 4612425930.0, + "step": 9023 + }, + { + "epoch": 2.440237966468361, + "grad_norm": 0.9659914374351501, + "learning_rate": 1.1786452026388322e-05, + "loss": 1.8236, + "mean_token_accuracy": 0.5863139629364014, + "num_tokens": 4612894527.0, + "step": 9024 + }, + { + "epoch": 2.4405083829096808, + "grad_norm": 1.0355156660079956, + "learning_rate": 1.1784881555606777e-05, + "loss": 1.8882, + "mean_token_accuracy": 0.5761117339134216, + "num_tokens": 4613418688.0, + "step": 9025 + }, + { + "epoch": 2.4407787993510004, + "grad_norm": 0.9769672155380249, + "learning_rate": 1.1783311060742755e-05, + "loss": 1.9778, + "mean_token_accuracy": 0.5483985543251038, + "num_tokens": 4613942859.0, + "step": 9026 + }, + { + "epoch": 2.44104921579232, + "grad_norm": 0.9668053388595581, + "learning_rate": 1.178174054184445e-05, + "loss": 1.8809, + "mean_token_accuracy": 0.5662237405776978, + "num_tokens": 4614467133.0, + "step": 9027 + }, + { + "epoch": 2.4413196322336397, + "grad_norm": 1.0508829355239868, + "learning_rate": 1.1780169998960042e-05, + "loss": 1.8255, + "mean_token_accuracy": 0.5807707905769348, + "num_tokens": 4614991402.0, + "step": 9028 + }, + { + "epoch": 2.4415900486749593, + "grad_norm": 1.1898034811019897, + "learning_rate": 1.1778599432137726e-05, + "loss": 1.9356, + "mean_token_accuracy": 0.5578495264053345, + "num_tokens": 4615515689.0, + "step": 9029 + }, + { + "epoch": 2.441860465116279, + "grad_norm": 0.9983845353126526, + "learning_rate": 1.1777028841425688e-05, + "loss": 1.8334, + "mean_token_accuracy": 0.5791149139404297, + "num_tokens": 4616039936.0, + "step": 9030 + }, + { + "epoch": 2.4421308815575986, + "grad_norm": 1.196478009223938, + "learning_rate": 1.1775458226872118e-05, + "loss": 1.7863, + "mean_token_accuracy": 0.5960248708724976, + "num_tokens": 4616505766.0, + "step": 9031 + }, + { + "epoch": 2.4424012979989183, + "grad_norm": 1.1594336032867432, + "learning_rate": 1.1773887588525212e-05, + "loss": 1.9005, + "mean_token_accuracy": 0.5664968490600586, + "num_tokens": 4616995655.0, + "step": 9032 + }, + { + "epoch": 2.442671714440238, + "grad_norm": 1.104894757270813, + "learning_rate": 1.177231692643316e-05, + "loss": 1.898, + "mean_token_accuracy": 0.5574828386306763, + "num_tokens": 4617519736.0, + "step": 9033 + }, + { + "epoch": 2.4429421308815575, + "grad_norm": 1.1617218255996704, + "learning_rate": 1.1770746240644152e-05, + "loss": 1.9088, + "mean_token_accuracy": 0.5559927225112915, + "num_tokens": 4618044011.0, + "step": 9034 + }, + { + "epoch": 2.443212547322877, + "grad_norm": 1.024999737739563, + "learning_rate": 1.1769175531206385e-05, + "loss": 1.8202, + "mean_token_accuracy": 0.5647266507148743, + "num_tokens": 4618568136.0, + "step": 9035 + }, + { + "epoch": 2.443482963764197, + "grad_norm": 1.1466397047042847, + "learning_rate": 1.1767604798168048e-05, + "loss": 1.8171, + "mean_token_accuracy": 0.5897599458694458, + "num_tokens": 4619092417.0, + "step": 9036 + }, + { + "epoch": 2.4437533802055165, + "grad_norm": 1.2704499959945679, + "learning_rate": 1.176603404157734e-05, + "loss": 1.8501, + "mean_token_accuracy": 0.5748291611671448, + "num_tokens": 4619610495.0, + "step": 9037 + }, + { + "epoch": 2.444023796646836, + "grad_norm": 1.1437809467315674, + "learning_rate": 1.1764463261482453e-05, + "loss": 1.885, + "mean_token_accuracy": 0.5601736903190613, + "num_tokens": 4620134729.0, + "step": 9038 + }, + { + "epoch": 2.4442942130881558, + "grad_norm": 1.1247618198394775, + "learning_rate": 1.1762892457931587e-05, + "loss": 1.8009, + "mean_token_accuracy": 0.5751346945762634, + "num_tokens": 4620658959.0, + "step": 9039 + }, + { + "epoch": 2.4445646295294754, + "grad_norm": 1.1027699708938599, + "learning_rate": 1.1761321630972935e-05, + "loss": 1.9137, + "mean_token_accuracy": 0.5613884925842285, + "num_tokens": 4621183217.0, + "step": 9040 + }, + { + "epoch": 2.444835045970795, + "grad_norm": 0.5134360790252686, + "learning_rate": 1.1759750780654699e-05, + "loss": 1.2154, + "mean_token_accuracy": 0.6755983829498291, + "num_tokens": 4621707496.0, + "step": 9041 + }, + { + "epoch": 2.4451054624121147, + "grad_norm": 1.4839764833450317, + "learning_rate": 1.1758179907025073e-05, + "loss": 1.8414, + "mean_token_accuracy": 0.5804831981658936, + "num_tokens": 4622180598.0, + "step": 9042 + }, + { + "epoch": 2.4453758788534343, + "grad_norm": 1.226006031036377, + "learning_rate": 1.1756609010132259e-05, + "loss": 1.8912, + "mean_token_accuracy": 0.5616939067840576, + "num_tokens": 4622699965.0, + "step": 9043 + }, + { + "epoch": 2.445646295294754, + "grad_norm": 1.0948314666748047, + "learning_rate": 1.1755038090024453e-05, + "loss": 1.8261, + "mean_token_accuracy": 0.5813207626342773, + "num_tokens": 4623224234.0, + "step": 9044 + }, + { + "epoch": 2.4459167117360736, + "grad_norm": 0.9941508769989014, + "learning_rate": 1.1753467146749859e-05, + "loss": 1.94, + "mean_token_accuracy": 0.5534132122993469, + "num_tokens": 4623739606.0, + "step": 9045 + }, + { + "epoch": 2.4461871281773933, + "grad_norm": 1.0988667011260986, + "learning_rate": 1.1751896180356676e-05, + "loss": 1.8469, + "mean_token_accuracy": 0.5492848753929138, + "num_tokens": 4624263675.0, + "step": 9046 + }, + { + "epoch": 2.446457544618713, + "grad_norm": 1.4693361520767212, + "learning_rate": 1.175032519089311e-05, + "loss": 1.9381, + "mean_token_accuracy": 0.5456020832061768, + "num_tokens": 4624787954.0, + "step": 9047 + }, + { + "epoch": 2.4467279610600325, + "grad_norm": 1.1894065141677856, + "learning_rate": 1.174875417840736e-05, + "loss": 1.8082, + "mean_token_accuracy": 0.573641836643219, + "num_tokens": 4625253022.0, + "step": 9048 + }, + { + "epoch": 2.446998377501352, + "grad_norm": 0.9676339030265808, + "learning_rate": 1.1747183142947632e-05, + "loss": 1.8834, + "mean_token_accuracy": 0.5585136413574219, + "num_tokens": 4625777240.0, + "step": 9049 + }, + { + "epoch": 2.447268793942672, + "grad_norm": 1.2048815488815308, + "learning_rate": 1.1745612084562124e-05, + "loss": 1.9028, + "mean_token_accuracy": 0.5855493545532227, + "num_tokens": 4626286954.0, + "step": 9050 + }, + { + "epoch": 2.4475392103839915, + "grad_norm": 1.0291064977645874, + "learning_rate": 1.1744041003299043e-05, + "loss": 1.7689, + "mean_token_accuracy": 0.5944260358810425, + "num_tokens": 4626811127.0, + "step": 9051 + }, + { + "epoch": 2.447809626825311, + "grad_norm": 1.0484066009521484, + "learning_rate": 1.1742469899206599e-05, + "loss": 1.7904, + "mean_token_accuracy": 0.6037065982818604, + "num_tokens": 4627332511.0, + "step": 9052 + }, + { + "epoch": 2.4480800432666308, + "grad_norm": 1.076680302619934, + "learning_rate": 1.1740898772332996e-05, + "loss": 1.8124, + "mean_token_accuracy": 0.571782648563385, + "num_tokens": 4627833915.0, + "step": 9053 + }, + { + "epoch": 2.4483504597079504, + "grad_norm": 1.1350922584533691, + "learning_rate": 1.173932762272644e-05, + "loss": 1.8567, + "mean_token_accuracy": 0.5784996151924133, + "num_tokens": 4628358084.0, + "step": 9054 + }, + { + "epoch": 2.44862087614927, + "grad_norm": 1.1080963611602783, + "learning_rate": 1.1737756450435134e-05, + "loss": 1.8444, + "mean_token_accuracy": 0.5857925415039062, + "num_tokens": 4628851636.0, + "step": 9055 + }, + { + "epoch": 2.4488912925905897, + "grad_norm": 0.9176394939422607, + "learning_rate": 1.1736185255507292e-05, + "loss": 1.8595, + "mean_token_accuracy": 0.5713961124420166, + "num_tokens": 4629375869.0, + "step": 9056 + }, + { + "epoch": 2.4491617090319093, + "grad_norm": 1.1086993217468262, + "learning_rate": 1.1734614037991123e-05, + "loss": 2.0042, + "mean_token_accuracy": 0.5390623211860657, + "num_tokens": 4629900119.0, + "step": 9057 + }, + { + "epoch": 2.449432125473229, + "grad_norm": 1.153521180152893, + "learning_rate": 1.1733042797934838e-05, + "loss": 1.9565, + "mean_token_accuracy": 0.5527938008308411, + "num_tokens": 4630421483.0, + "step": 9058 + }, + { + "epoch": 2.449702541914548, + "grad_norm": 1.0802265405654907, + "learning_rate": 1.1731471535386636e-05, + "loss": 1.7133, + "mean_token_accuracy": 0.5820429921150208, + "num_tokens": 4630905902.0, + "step": 9059 + }, + { + "epoch": 2.4499729583558683, + "grad_norm": 0.9968311190605164, + "learning_rate": 1.1729900250394741e-05, + "loss": 1.8332, + "mean_token_accuracy": 0.56816565990448, + "num_tokens": 4631430028.0, + "step": 9060 + }, + { + "epoch": 2.4502433747971875, + "grad_norm": 0.44728219509124756, + "learning_rate": 1.1728328943007363e-05, + "loss": 1.138, + "mean_token_accuracy": 0.7035660147666931, + "num_tokens": 4631954206.0, + "step": 9061 + }, + { + "epoch": 2.4505137912385075, + "grad_norm": 1.2152531147003174, + "learning_rate": 1.1726757613272706e-05, + "loss": 1.9954, + "mean_token_accuracy": 0.5414978265762329, + "num_tokens": 4632448387.0, + "step": 9062 + }, + { + "epoch": 2.4507842076798267, + "grad_norm": 1.40015709400177, + "learning_rate": 1.1725186261238992e-05, + "loss": 1.9465, + "mean_token_accuracy": 0.564182698726654, + "num_tokens": 4632946081.0, + "step": 9063 + }, + { + "epoch": 2.4510546241211464, + "grad_norm": 1.2788816690444946, + "learning_rate": 1.1723614886954429e-05, + "loss": 1.9571, + "mean_token_accuracy": 0.5542263984680176, + "num_tokens": 4633430225.0, + "step": 9064 + }, + { + "epoch": 2.451325040562466, + "grad_norm": 1.1427520513534546, + "learning_rate": 1.1722043490467232e-05, + "loss": 1.9629, + "mean_token_accuracy": 0.565043568611145, + "num_tokens": 4633954358.0, + "step": 9065 + }, + { + "epoch": 2.4515954570037857, + "grad_norm": 1.272537350654602, + "learning_rate": 1.1720472071825616e-05, + "loss": 1.9815, + "mean_token_accuracy": 0.5651088953018188, + "num_tokens": 4634460549.0, + "step": 9066 + }, + { + "epoch": 2.4518658734451053, + "grad_norm": 1.2728626728057861, + "learning_rate": 1.1718900631077803e-05, + "loss": 1.8895, + "mean_token_accuracy": 0.5665244460105896, + "num_tokens": 4634934784.0, + "step": 9067 + }, + { + "epoch": 2.452136289886425, + "grad_norm": 1.1147167682647705, + "learning_rate": 1.1717329168272002e-05, + "loss": 1.8505, + "mean_token_accuracy": 0.580875813961029, + "num_tokens": 4635459053.0, + "step": 9068 + }, + { + "epoch": 2.4524067063277446, + "grad_norm": 1.2152998447418213, + "learning_rate": 1.1715757683456431e-05, + "loss": 1.8267, + "mean_token_accuracy": 0.5693051218986511, + "num_tokens": 4635983298.0, + "step": 9069 + }, + { + "epoch": 2.4526771227690642, + "grad_norm": 1.2519104480743408, + "learning_rate": 1.1714186176679311e-05, + "loss": 1.9082, + "mean_token_accuracy": 0.5886799693107605, + "num_tokens": 4636458975.0, + "step": 9070 + }, + { + "epoch": 2.452947539210384, + "grad_norm": 1.2161654233932495, + "learning_rate": 1.171261464798886e-05, + "loss": 1.8047, + "mean_token_accuracy": 0.5967807769775391, + "num_tokens": 4636925706.0, + "step": 9071 + }, + { + "epoch": 2.4532179556517035, + "grad_norm": 1.373257040977478, + "learning_rate": 1.1711043097433295e-05, + "loss": 1.8129, + "mean_token_accuracy": 0.5918930172920227, + "num_tokens": 4637427791.0, + "step": 9072 + }, + { + "epoch": 2.453488372093023, + "grad_norm": 1.3681596517562866, + "learning_rate": 1.1709471525060836e-05, + "loss": 1.8008, + "mean_token_accuracy": 0.5888799428939819, + "num_tokens": 4637951944.0, + "step": 9073 + }, + { + "epoch": 2.453758788534343, + "grad_norm": 0.9515214562416077, + "learning_rate": 1.1707899930919706e-05, + "loss": 1.81, + "mean_token_accuracy": 0.5890705585479736, + "num_tokens": 4638443549.0, + "step": 9074 + }, + { + "epoch": 2.4540292049756625, + "grad_norm": 1.3921366930007935, + "learning_rate": 1.1706328315058121e-05, + "loss": 1.7884, + "mean_token_accuracy": 0.5902683138847351, + "num_tokens": 4638967773.0, + "step": 9075 + }, + { + "epoch": 2.454299621416982, + "grad_norm": 1.2766780853271484, + "learning_rate": 1.1704756677524307e-05, + "loss": 1.923, + "mean_token_accuracy": 0.5549455285072327, + "num_tokens": 4639491913.0, + "step": 9076 + }, + { + "epoch": 2.4545700378583017, + "grad_norm": 1.158860445022583, + "learning_rate": 1.170318501836649e-05, + "loss": 1.9565, + "mean_token_accuracy": 0.5638884902000427, + "num_tokens": 4640016181.0, + "step": 9077 + }, + { + "epoch": 2.4548404542996214, + "grad_norm": 1.124650478363037, + "learning_rate": 1.1701613337632885e-05, + "loss": 1.8428, + "mean_token_accuracy": 0.5689266920089722, + "num_tokens": 4640540326.0, + "step": 9078 + }, + { + "epoch": 2.455110870740941, + "grad_norm": 1.2647168636322021, + "learning_rate": 1.170004163537172e-05, + "loss": 1.8913, + "mean_token_accuracy": 0.5577917098999023, + "num_tokens": 4641064597.0, + "step": 9079 + }, + { + "epoch": 2.4553812871822607, + "grad_norm": 1.2403488159179688, + "learning_rate": 1.1698469911631219e-05, + "loss": 1.9265, + "mean_token_accuracy": 0.5563262701034546, + "num_tokens": 4641588880.0, + "step": 9080 + }, + { + "epoch": 2.4556517036235803, + "grad_norm": 0.5317742824554443, + "learning_rate": 1.1696898166459605e-05, + "loss": 1.1818, + "mean_token_accuracy": 0.6913775205612183, + "num_tokens": 4642104342.0, + "step": 9081 + }, + { + "epoch": 2.4559221200649, + "grad_norm": 1.342872977256775, + "learning_rate": 1.1695326399905107e-05, + "loss": 1.8574, + "mean_token_accuracy": 0.5591967105865479, + "num_tokens": 4642628582.0, + "step": 9082 + }, + { + "epoch": 2.4561925365062196, + "grad_norm": 1.4234986305236816, + "learning_rate": 1.169375461201595e-05, + "loss": 1.8495, + "mean_token_accuracy": 0.5655680894851685, + "num_tokens": 4643152829.0, + "step": 9083 + }, + { + "epoch": 2.4564629529475392, + "grad_norm": 1.101485252380371, + "learning_rate": 1.1692182802840366e-05, + "loss": 2.0072, + "mean_token_accuracy": 0.5513874292373657, + "num_tokens": 4643677016.0, + "step": 9084 + }, + { + "epoch": 2.456733369388859, + "grad_norm": 1.1726305484771729, + "learning_rate": 1.1690610972426573e-05, + "loss": 1.9353, + "mean_token_accuracy": 0.567520022392273, + "num_tokens": 4644201299.0, + "step": 9085 + }, + { + "epoch": 2.4570037858301785, + "grad_norm": 1.3364660739898682, + "learning_rate": 1.1689039120822805e-05, + "loss": 1.8768, + "mean_token_accuracy": 0.5741877555847168, + "num_tokens": 4644689381.0, + "step": 9086 + }, + { + "epoch": 2.457274202271498, + "grad_norm": 1.3516710996627808, + "learning_rate": 1.1687467248077291e-05, + "loss": 2.0104, + "mean_token_accuracy": 0.5439584255218506, + "num_tokens": 4645191425.0, + "step": 9087 + }, + { + "epoch": 2.457544618712818, + "grad_norm": 1.2716470956802368, + "learning_rate": 1.168589535423826e-05, + "loss": 1.891, + "mean_token_accuracy": 0.5592449903488159, + "num_tokens": 4645715625.0, + "step": 9088 + }, + { + "epoch": 2.4578150351541375, + "grad_norm": 1.1711113452911377, + "learning_rate": 1.1684323439353942e-05, + "loss": 1.8634, + "mean_token_accuracy": 0.5719213485717773, + "num_tokens": 4646239885.0, + "step": 9089 + }, + { + "epoch": 2.458085451595457, + "grad_norm": 1.0606231689453125, + "learning_rate": 1.1682751503472569e-05, + "loss": 1.8459, + "mean_token_accuracy": 0.5656499266624451, + "num_tokens": 4646764003.0, + "step": 9090 + }, + { + "epoch": 2.4583558680367767, + "grad_norm": 1.1770614385604858, + "learning_rate": 1.1681179546642374e-05, + "loss": 1.8454, + "mean_token_accuracy": 0.5709362626075745, + "num_tokens": 4647288117.0, + "step": 9091 + }, + { + "epoch": 2.4586262844780964, + "grad_norm": 1.164795994758606, + "learning_rate": 1.1679607568911585e-05, + "loss": 1.988, + "mean_token_accuracy": 0.5428032875061035, + "num_tokens": 4647812326.0, + "step": 9092 + }, + { + "epoch": 2.458896700919416, + "grad_norm": 1.07801353931427, + "learning_rate": 1.1678035570328437e-05, + "loss": 1.8881, + "mean_token_accuracy": 0.574376106262207, + "num_tokens": 4648285281.0, + "step": 9093 + }, + { + "epoch": 2.4591671173607357, + "grad_norm": 3.251574754714966, + "learning_rate": 1.1676463550941164e-05, + "loss": 1.7114, + "mean_token_accuracy": 0.6202459335327148, + "num_tokens": 4648809540.0, + "step": 9094 + }, + { + "epoch": 2.4594375338020553, + "grad_norm": 1.5818711519241333, + "learning_rate": 1.1674891510797999e-05, + "loss": 1.9633, + "mean_token_accuracy": 0.5482747554779053, + "num_tokens": 4649333691.0, + "step": 9095 + }, + { + "epoch": 2.459707950243375, + "grad_norm": 1.293413758277893, + "learning_rate": 1.1673319449947179e-05, + "loss": 1.9125, + "mean_token_accuracy": 0.5595257878303528, + "num_tokens": 4649857960.0, + "step": 9096 + }, + { + "epoch": 2.4599783666846946, + "grad_norm": 1.346670150756836, + "learning_rate": 1.167174736843694e-05, + "loss": 1.9909, + "mean_token_accuracy": 0.5473019480705261, + "num_tokens": 4650382242.0, + "step": 9097 + }, + { + "epoch": 2.4602487831260142, + "grad_norm": 1.8153868913650513, + "learning_rate": 1.1670175266315516e-05, + "loss": 1.9729, + "mean_token_accuracy": 0.5692336559295654, + "num_tokens": 4650877032.0, + "step": 9098 + }, + { + "epoch": 2.460519199567334, + "grad_norm": 1.282260775566101, + "learning_rate": 1.1668603143631142e-05, + "loss": 1.9382, + "mean_token_accuracy": 0.5492578744888306, + "num_tokens": 4651401311.0, + "step": 9099 + }, + { + "epoch": 2.460789616008653, + "grad_norm": 1.5319504737854004, + "learning_rate": 1.1667031000432058e-05, + "loss": 1.9013, + "mean_token_accuracy": 0.5796724557876587, + "num_tokens": 4651830925.0, + "step": 9100 + }, + { + "epoch": 2.461060032449973, + "grad_norm": 0.5226890444755554, + "learning_rate": 1.1665458836766498e-05, + "loss": 1.1838, + "mean_token_accuracy": 0.6925269365310669, + "num_tokens": 4652355023.0, + "step": 9101 + }, + { + "epoch": 2.4613304488912924, + "grad_norm": 1.585693120956421, + "learning_rate": 1.1663886652682708e-05, + "loss": 1.9166, + "mean_token_accuracy": 0.5708813667297363, + "num_tokens": 4652879177.0, + "step": 9102 + }, + { + "epoch": 2.4616008653326125, + "grad_norm": 1.1849814653396606, + "learning_rate": 1.1662314448228923e-05, + "loss": 1.8826, + "mean_token_accuracy": 0.5611029863357544, + "num_tokens": 4653403422.0, + "step": 9103 + }, + { + "epoch": 2.4618712817739317, + "grad_norm": 1.0911681652069092, + "learning_rate": 1.166074222345338e-05, + "loss": 1.8234, + "mean_token_accuracy": 0.572799026966095, + "num_tokens": 4653927697.0, + "step": 9104 + }, + { + "epoch": 2.4621416982152513, + "grad_norm": 1.1449143886566162, + "learning_rate": 1.1659169978404326e-05, + "loss": 1.9326, + "mean_token_accuracy": 0.5643835067749023, + "num_tokens": 4654451968.0, + "step": 9105 + }, + { + "epoch": 2.462412114656571, + "grad_norm": 1.0322329998016357, + "learning_rate": 1.1657597713129996e-05, + "loss": 1.7933, + "mean_token_accuracy": 0.5795814394950867, + "num_tokens": 4654976170.0, + "step": 9106 + }, + { + "epoch": 2.4626825310978906, + "grad_norm": 1.239033579826355, + "learning_rate": 1.1656025427678636e-05, + "loss": 1.9246, + "mean_token_accuracy": 0.5657694339752197, + "num_tokens": 4655500246.0, + "step": 9107 + }, + { + "epoch": 2.4629529475392102, + "grad_norm": 1.3698123693466187, + "learning_rate": 1.1654453122098486e-05, + "loss": 1.912, + "mean_token_accuracy": 0.551560640335083, + "num_tokens": 4656024414.0, + "step": 9108 + }, + { + "epoch": 2.46322336398053, + "grad_norm": 1.3522226810455322, + "learning_rate": 1.1652880796437787e-05, + "loss": 2.0426, + "mean_token_accuracy": 0.550638735294342, + "num_tokens": 4656548665.0, + "step": 9109 + }, + { + "epoch": 2.4634937804218495, + "grad_norm": 1.1384327411651611, + "learning_rate": 1.165130845074479e-05, + "loss": 1.8815, + "mean_token_accuracy": 0.5803968906402588, + "num_tokens": 4657047525.0, + "step": 9110 + }, + { + "epoch": 2.463764196863169, + "grad_norm": 1.3809396028518677, + "learning_rate": 1.164973608506773e-05, + "loss": 1.8271, + "mean_token_accuracy": 0.5761756300926208, + "num_tokens": 4657571777.0, + "step": 9111 + }, + { + "epoch": 2.464034613304489, + "grad_norm": 1.784421682357788, + "learning_rate": 1.164816369945486e-05, + "loss": 2.0375, + "mean_token_accuracy": 0.5324006676673889, + "num_tokens": 4658095875.0, + "step": 9112 + }, + { + "epoch": 2.4643050297458085, + "grad_norm": 1.179911494255066, + "learning_rate": 1.164659129395442e-05, + "loss": 1.8674, + "mean_token_accuracy": 0.5795764327049255, + "num_tokens": 4658553277.0, + "step": 9113 + }, + { + "epoch": 2.464575446187128, + "grad_norm": 1.1657475233078003, + "learning_rate": 1.1645018868614658e-05, + "loss": 1.8205, + "mean_token_accuracy": 0.5825138092041016, + "num_tokens": 4659077396.0, + "step": 9114 + }, + { + "epoch": 2.4648458626284477, + "grad_norm": 1.6518237590789795, + "learning_rate": 1.1643446423483819e-05, + "loss": 2.0275, + "mean_token_accuracy": 0.5560753345489502, + "num_tokens": 4659601581.0, + "step": 9115 + }, + { + "epoch": 2.4651162790697674, + "grad_norm": 1.0451008081436157, + "learning_rate": 1.164187395861015e-05, + "loss": 1.9261, + "mean_token_accuracy": 0.5651177763938904, + "num_tokens": 4660101791.0, + "step": 9116 + }, + { + "epoch": 2.465386695511087, + "grad_norm": 1.402828574180603, + "learning_rate": 1.1640301474041905e-05, + "loss": 2.0232, + "mean_token_accuracy": 0.5554556846618652, + "num_tokens": 4660625926.0, + "step": 9117 + }, + { + "epoch": 2.4656571119524067, + "grad_norm": 1.1116220951080322, + "learning_rate": 1.1638728969827325e-05, + "loss": 1.8415, + "mean_token_accuracy": 0.5756773948669434, + "num_tokens": 4661150147.0, + "step": 9118 + }, + { + "epoch": 2.4659275283937263, + "grad_norm": 1.015343189239502, + "learning_rate": 1.1637156446014662e-05, + "loss": 1.8798, + "mean_token_accuracy": 0.5692571401596069, + "num_tokens": 4661618071.0, + "step": 9119 + }, + { + "epoch": 2.466197944835046, + "grad_norm": 1.1396217346191406, + "learning_rate": 1.1635583902652166e-05, + "loss": 1.8562, + "mean_token_accuracy": 0.5843114852905273, + "num_tokens": 4662130927.0, + "step": 9120 + }, + { + "epoch": 2.4664683612763656, + "grad_norm": 0.48275256156921387, + "learning_rate": 1.1634011339788084e-05, + "loss": 1.0781, + "mean_token_accuracy": 0.7172501087188721, + "num_tokens": 4662655150.0, + "step": 9121 + }, + { + "epoch": 2.4667387777176852, + "grad_norm": 1.5990641117095947, + "learning_rate": 1.1632438757470671e-05, + "loss": 1.9017, + "mean_token_accuracy": 0.5662940740585327, + "num_tokens": 4663179410.0, + "step": 9122 + }, + { + "epoch": 2.467009194159005, + "grad_norm": 1.288456678390503, + "learning_rate": 1.163086615574818e-05, + "loss": 1.959, + "mean_token_accuracy": 0.5532549023628235, + "num_tokens": 4663703668.0, + "step": 9123 + }, + { + "epoch": 2.4672796106003245, + "grad_norm": 1.4636586904525757, + "learning_rate": 1.1629293534668858e-05, + "loss": 1.6267, + "mean_token_accuracy": 0.6000598669052124, + "num_tokens": 4664227848.0, + "step": 9124 + }, + { + "epoch": 2.467550027041644, + "grad_norm": 1.399357795715332, + "learning_rate": 1.1627720894280959e-05, + "loss": 1.8368, + "mean_token_accuracy": 0.5676179528236389, + "num_tokens": 4664720580.0, + "step": 9125 + }, + { + "epoch": 2.467820443482964, + "grad_norm": 1.4367343187332153, + "learning_rate": 1.1626148234632738e-05, + "loss": 1.8551, + "mean_token_accuracy": 0.5703138113021851, + "num_tokens": 4665244779.0, + "step": 9126 + }, + { + "epoch": 2.4680908599242835, + "grad_norm": 1.1574618816375732, + "learning_rate": 1.1624575555772447e-05, + "loss": 1.893, + "mean_token_accuracy": 0.5818145275115967, + "num_tokens": 4665700885.0, + "step": 9127 + }, + { + "epoch": 2.468361276365603, + "grad_norm": 1.1719720363616943, + "learning_rate": 1.1623002857748342e-05, + "loss": 1.9201, + "mean_token_accuracy": 0.5626841187477112, + "num_tokens": 4666191717.0, + "step": 9128 + }, + { + "epoch": 2.4686316928069227, + "grad_norm": 1.5870229005813599, + "learning_rate": 1.1621430140608676e-05, + "loss": 1.9303, + "mean_token_accuracy": 0.5558470487594604, + "num_tokens": 4666715923.0, + "step": 9129 + }, + { + "epoch": 2.4689021092482424, + "grad_norm": 1.4035422801971436, + "learning_rate": 1.1619857404401706e-05, + "loss": 1.9311, + "mean_token_accuracy": 0.5619474649429321, + "num_tokens": 4667240050.0, + "step": 9130 + }, + { + "epoch": 2.469172525689562, + "grad_norm": 1.1690332889556885, + "learning_rate": 1.1618284649175688e-05, + "loss": 1.9533, + "mean_token_accuracy": 0.5577956438064575, + "num_tokens": 4667764207.0, + "step": 9131 + }, + { + "epoch": 2.4694429421308817, + "grad_norm": 1.2085046768188477, + "learning_rate": 1.161671187497888e-05, + "loss": 1.8757, + "mean_token_accuracy": 0.548561692237854, + "num_tokens": 4668288378.0, + "step": 9132 + }, + { + "epoch": 2.4697133585722013, + "grad_norm": 1.3061670064926147, + "learning_rate": 1.1615139081859539e-05, + "loss": 1.8897, + "mean_token_accuracy": 0.5731275677680969, + "num_tokens": 4668812536.0, + "step": 9133 + }, + { + "epoch": 2.469983775013521, + "grad_norm": 1.2490333318710327, + "learning_rate": 1.161356626986592e-05, + "loss": 1.8915, + "mean_token_accuracy": 0.5617523789405823, + "num_tokens": 4669336788.0, + "step": 9134 + }, + { + "epoch": 2.4702541914548406, + "grad_norm": 1.093442440032959, + "learning_rate": 1.1611993439046283e-05, + "loss": 1.8601, + "mean_token_accuracy": 0.5821949243545532, + "num_tokens": 4669814252.0, + "step": 9135 + }, + { + "epoch": 2.4705246078961602, + "grad_norm": 1.2432054281234741, + "learning_rate": 1.1610420589448888e-05, + "loss": 1.9284, + "mean_token_accuracy": 0.5608620047569275, + "num_tokens": 4670301641.0, + "step": 9136 + }, + { + "epoch": 2.47079502433748, + "grad_norm": 2.618746519088745, + "learning_rate": 1.1608847721121994e-05, + "loss": 1.7332, + "mean_token_accuracy": 0.6016944646835327, + "num_tokens": 4670825831.0, + "step": 9137 + }, + { + "epoch": 2.4710654407787995, + "grad_norm": 1.3289990425109863, + "learning_rate": 1.1607274834113862e-05, + "loss": 1.8708, + "mean_token_accuracy": 0.5710955858230591, + "num_tokens": 4671349998.0, + "step": 9138 + }, + { + "epoch": 2.471335857220119, + "grad_norm": 1.3669147491455078, + "learning_rate": 1.1605701928472752e-05, + "loss": 1.8886, + "mean_token_accuracy": 0.5733934640884399, + "num_tokens": 4671825304.0, + "step": 9139 + }, + { + "epoch": 2.471606273661439, + "grad_norm": 1.011584758758545, + "learning_rate": 1.1604129004246927e-05, + "loss": 1.8951, + "mean_token_accuracy": 0.5601680278778076, + "num_tokens": 4672349469.0, + "step": 9140 + }, + { + "epoch": 2.471876690102758, + "grad_norm": 0.4413298964500427, + "learning_rate": 1.1602556061484647e-05, + "loss": 1.0456, + "mean_token_accuracy": 0.7109935283660889, + "num_tokens": 4672873691.0, + "step": 9141 + }, + { + "epoch": 2.472147106544078, + "grad_norm": 1.7010644674301147, + "learning_rate": 1.1600983100234174e-05, + "loss": 1.8684, + "mean_token_accuracy": 0.562599778175354, + "num_tokens": 4673397885.0, + "step": 9142 + }, + { + "epoch": 2.4724175229853973, + "grad_norm": 1.5034865140914917, + "learning_rate": 1.1599410120543773e-05, + "loss": 1.8482, + "mean_token_accuracy": 0.5513577461242676, + "num_tokens": 4673921958.0, + "step": 9143 + }, + { + "epoch": 2.4726879394267174, + "grad_norm": 1.2360318899154663, + "learning_rate": 1.1597837122461703e-05, + "loss": 1.8844, + "mean_token_accuracy": 0.5665169358253479, + "num_tokens": 4674354023.0, + "step": 9144 + }, + { + "epoch": 2.4729583558680366, + "grad_norm": 1.4829611778259277, + "learning_rate": 1.1596264106036234e-05, + "loss": 1.9136, + "mean_token_accuracy": 0.5695138573646545, + "num_tokens": 4674875203.0, + "step": 9145 + }, + { + "epoch": 2.4732287723093562, + "grad_norm": 1.658660888671875, + "learning_rate": 1.1594691071315634e-05, + "loss": 1.7831, + "mean_token_accuracy": 0.5992798209190369, + "num_tokens": 4675399277.0, + "step": 9146 + }, + { + "epoch": 2.473499188750676, + "grad_norm": 1.472556710243225, + "learning_rate": 1.1593118018348155e-05, + "loss": 1.8326, + "mean_token_accuracy": 0.5917311906814575, + "num_tokens": 4675923546.0, + "step": 9147 + }, + { + "epoch": 2.4737696051919955, + "grad_norm": 1.3468321561813354, + "learning_rate": 1.1591544947182077e-05, + "loss": 1.8931, + "mean_token_accuracy": 0.5603879690170288, + "num_tokens": 4676373315.0, + "step": 9148 + }, + { + "epoch": 2.474040021633315, + "grad_norm": 1.1287952661514282, + "learning_rate": 1.1589971857865657e-05, + "loss": 1.7172, + "mean_token_accuracy": 0.5762403011322021, + "num_tokens": 4676897597.0, + "step": 9149 + }, + { + "epoch": 2.474310438074635, + "grad_norm": 1.3869106769561768, + "learning_rate": 1.1588398750447168e-05, + "loss": 1.7412, + "mean_token_accuracy": 0.5888732671737671, + "num_tokens": 4677421871.0, + "step": 9150 + }, + { + "epoch": 2.4745808545159544, + "grad_norm": 1.409320592880249, + "learning_rate": 1.1586825624974872e-05, + "loss": 1.8336, + "mean_token_accuracy": 0.570483386516571, + "num_tokens": 4677946123.0, + "step": 9151 + }, + { + "epoch": 2.474851270957274, + "grad_norm": 1.3949992656707764, + "learning_rate": 1.158525248149704e-05, + "loss": 1.8703, + "mean_token_accuracy": 0.5789724588394165, + "num_tokens": 4678470405.0, + "step": 9152 + }, + { + "epoch": 2.4751216873985937, + "grad_norm": 1.3584434986114502, + "learning_rate": 1.1583679320061944e-05, + "loss": 1.9888, + "mean_token_accuracy": 0.5495513677597046, + "num_tokens": 4678994643.0, + "step": 9153 + }, + { + "epoch": 2.4753921038399134, + "grad_norm": 1.143574595451355, + "learning_rate": 1.1582106140717848e-05, + "loss": 1.9719, + "mean_token_accuracy": 0.5659953951835632, + "num_tokens": 4679462610.0, + "step": 9154 + }, + { + "epoch": 2.475662520281233, + "grad_norm": 1.4446690082550049, + "learning_rate": 1.1580532943513025e-05, + "loss": 1.961, + "mean_token_accuracy": 0.5666028261184692, + "num_tokens": 4679986881.0, + "step": 9155 + }, + { + "epoch": 2.4759329367225527, + "grad_norm": 1.0706963539123535, + "learning_rate": 1.1578959728495745e-05, + "loss": 1.8581, + "mean_token_accuracy": 0.5448594093322754, + "num_tokens": 4680511138.0, + "step": 9156 + }, + { + "epoch": 2.4762033531638723, + "grad_norm": 1.1604554653167725, + "learning_rate": 1.1577386495714274e-05, + "loss": 1.8173, + "mean_token_accuracy": 0.5636183023452759, + "num_tokens": 4681035249.0, + "step": 9157 + }, + { + "epoch": 2.476473769605192, + "grad_norm": 1.242952585220337, + "learning_rate": 1.157581324521689e-05, + "loss": 1.9569, + "mean_token_accuracy": 0.5562207698822021, + "num_tokens": 4681559401.0, + "step": 9158 + }, + { + "epoch": 2.4767441860465116, + "grad_norm": 1.2526962757110596, + "learning_rate": 1.157423997705186e-05, + "loss": 1.8619, + "mean_token_accuracy": 0.5596983432769775, + "num_tokens": 4682083616.0, + "step": 9159 + }, + { + "epoch": 2.4770146024878312, + "grad_norm": 0.9842329621315002, + "learning_rate": 1.157266669126746e-05, + "loss": 1.4831, + "mean_token_accuracy": 0.6753566265106201, + "num_tokens": 4682607898.0, + "step": 9160 + }, + { + "epoch": 2.477285018929151, + "grad_norm": 0.4880791902542114, + "learning_rate": 1.1571093387911963e-05, + "loss": 1.1445, + "mean_token_accuracy": 0.7026314735412598, + "num_tokens": 4683115088.0, + "step": 9161 + }, + { + "epoch": 2.4775554353704705, + "grad_norm": 1.715762972831726, + "learning_rate": 1.156952006703364e-05, + "loss": 1.8725, + "mean_token_accuracy": 0.5665915012359619, + "num_tokens": 4683639308.0, + "step": 9162 + }, + { + "epoch": 2.47782585181179, + "grad_norm": 1.778278112411499, + "learning_rate": 1.156794672868077e-05, + "loss": 2.0598, + "mean_token_accuracy": 0.5518344640731812, + "num_tokens": 4684101441.0, + "step": 9163 + }, + { + "epoch": 2.47809626825311, + "grad_norm": 1.108786702156067, + "learning_rate": 1.1566373372901618e-05, + "loss": 1.9803, + "mean_token_accuracy": 0.5618321895599365, + "num_tokens": 4684625523.0, + "step": 9164 + }, + { + "epoch": 2.4783666846944294, + "grad_norm": 1.1275482177734375, + "learning_rate": 1.1564799999744469e-05, + "loss": 1.8274, + "mean_token_accuracy": 0.5458425283432007, + "num_tokens": 4685149773.0, + "step": 9165 + }, + { + "epoch": 2.478637101135749, + "grad_norm": 1.2627110481262207, + "learning_rate": 1.1563226609257596e-05, + "loss": 1.8984, + "mean_token_accuracy": 0.5721038579940796, + "num_tokens": 4685625397.0, + "step": 9166 + }, + { + "epoch": 2.4789075175770687, + "grad_norm": 1.0993989706039429, + "learning_rate": 1.1561653201489271e-05, + "loss": 1.704, + "mean_token_accuracy": 0.5950917601585388, + "num_tokens": 4686149636.0, + "step": 9167 + }, + { + "epoch": 2.4791779340183884, + "grad_norm": 1.0520273447036743, + "learning_rate": 1.1560079776487777e-05, + "loss": 1.8504, + "mean_token_accuracy": 0.5777605772018433, + "num_tokens": 4686673725.0, + "step": 9168 + }, + { + "epoch": 2.479448350459708, + "grad_norm": 1.1368430852890015, + "learning_rate": 1.155850633430139e-05, + "loss": 2.0606, + "mean_token_accuracy": 0.5292001962661743, + "num_tokens": 4687197944.0, + "step": 9169 + }, + { + "epoch": 2.4797187669010277, + "grad_norm": 1.2158087491989136, + "learning_rate": 1.1556932874978382e-05, + "loss": 1.9077, + "mean_token_accuracy": 0.5442764163017273, + "num_tokens": 4687722114.0, + "step": 9170 + }, + { + "epoch": 2.4799891833423473, + "grad_norm": 1.209713101387024, + "learning_rate": 1.1555359398567036e-05, + "loss": 1.8909, + "mean_token_accuracy": 0.5483927130699158, + "num_tokens": 4688246089.0, + "step": 9171 + }, + { + "epoch": 2.480259599783667, + "grad_norm": 1.172377586364746, + "learning_rate": 1.1553785905115631e-05, + "loss": 1.8112, + "mean_token_accuracy": 0.566709578037262, + "num_tokens": 4688770367.0, + "step": 9172 + }, + { + "epoch": 2.4805300162249866, + "grad_norm": 1.1292140483856201, + "learning_rate": 1.1552212394672446e-05, + "loss": 1.9437, + "mean_token_accuracy": 0.5683426856994629, + "num_tokens": 4689294453.0, + "step": 9173 + }, + { + "epoch": 2.4808004326663062, + "grad_norm": 1.2970775365829468, + "learning_rate": 1.1550638867285762e-05, + "loss": 1.8279, + "mean_token_accuracy": 0.568507969379425, + "num_tokens": 4689818546.0, + "step": 9174 + }, + { + "epoch": 2.481070849107626, + "grad_norm": 1.223854899406433, + "learning_rate": 1.1549065323003856e-05, + "loss": 1.8952, + "mean_token_accuracy": 0.566699743270874, + "num_tokens": 4690342670.0, + "step": 9175 + }, + { + "epoch": 2.4813412655489455, + "grad_norm": 1.1392147541046143, + "learning_rate": 1.1547491761875014e-05, + "loss": 1.8952, + "mean_token_accuracy": 0.5567024946212769, + "num_tokens": 4690866908.0, + "step": 9176 + }, + { + "epoch": 2.481611681990265, + "grad_norm": 1.4323937892913818, + "learning_rate": 1.1545918183947513e-05, + "loss": 1.9355, + "mean_token_accuracy": 0.5683547258377075, + "num_tokens": 4691391159.0, + "step": 9177 + }, + { + "epoch": 2.481882098431585, + "grad_norm": 1.1825037002563477, + "learning_rate": 1.1544344589269636e-05, + "loss": 1.8768, + "mean_token_accuracy": 0.5624681711196899, + "num_tokens": 4691915332.0, + "step": 9178 + }, + { + "epoch": 2.4821525148729044, + "grad_norm": 1.0094702243804932, + "learning_rate": 1.1542770977889668e-05, + "loss": 1.8441, + "mean_token_accuracy": 0.5679678916931152, + "num_tokens": 4692439451.0, + "step": 9179 + }, + { + "epoch": 2.482422931314224, + "grad_norm": 1.3571803569793701, + "learning_rate": 1.154119734985589e-05, + "loss": 1.913, + "mean_token_accuracy": 0.5648798942565918, + "num_tokens": 4692922894.0, + "step": 9180 + }, + { + "epoch": 2.4826933477555437, + "grad_norm": 0.6886782050132751, + "learning_rate": 1.1539623705216582e-05, + "loss": 1.1292, + "mean_token_accuracy": 0.7068402767181396, + "num_tokens": 4693447127.0, + "step": 9181 + }, + { + "epoch": 2.482963764196863, + "grad_norm": 1.7856673002243042, + "learning_rate": 1.153805004402004e-05, + "loss": 2.0023, + "mean_token_accuracy": 0.5521853566169739, + "num_tokens": 4693971297.0, + "step": 9182 + }, + { + "epoch": 2.483234180638183, + "grad_norm": 1.3075408935546875, + "learning_rate": 1.1536476366314536e-05, + "loss": 2.0033, + "mean_token_accuracy": 0.5479853749275208, + "num_tokens": 4694495475.0, + "step": 9183 + }, + { + "epoch": 2.483504597079502, + "grad_norm": 1.1333742141723633, + "learning_rate": 1.1534902672148356e-05, + "loss": 1.995, + "mean_token_accuracy": 0.5585565567016602, + "num_tokens": 4695019740.0, + "step": 9184 + }, + { + "epoch": 2.4837750135208223, + "grad_norm": 1.2249393463134766, + "learning_rate": 1.153332896156979e-05, + "loss": 1.9906, + "mean_token_accuracy": 0.5759596228599548, + "num_tokens": 4695543923.0, + "step": 9185 + }, + { + "epoch": 2.4840454299621415, + "grad_norm": 1.352603793144226, + "learning_rate": 1.1531755234627124e-05, + "loss": 1.9568, + "mean_token_accuracy": 0.5502512454986572, + "num_tokens": 4696068072.0, + "step": 9186 + }, + { + "epoch": 2.484315846403461, + "grad_norm": 1.2788389921188354, + "learning_rate": 1.1530181491368646e-05, + "loss": 1.8975, + "mean_token_accuracy": 0.5793191194534302, + "num_tokens": 4696592245.0, + "step": 9187 + }, + { + "epoch": 2.484586262844781, + "grad_norm": 1.1660736799240112, + "learning_rate": 1.1528607731842639e-05, + "loss": 1.8574, + "mean_token_accuracy": 0.5819332003593445, + "num_tokens": 4697116504.0, + "step": 9188 + }, + { + "epoch": 2.4848566792861004, + "grad_norm": 1.2366493940353394, + "learning_rate": 1.1527033956097393e-05, + "loss": 1.8427, + "mean_token_accuracy": 0.5749398469924927, + "num_tokens": 4697640581.0, + "step": 9189 + }, + { + "epoch": 2.48512709572742, + "grad_norm": 1.2224348783493042, + "learning_rate": 1.1525460164181193e-05, + "loss": 1.8057, + "mean_token_accuracy": 0.593185544013977, + "num_tokens": 4698106657.0, + "step": 9190 + }, + { + "epoch": 2.4853975121687397, + "grad_norm": 1.1518256664276123, + "learning_rate": 1.1523886356142333e-05, + "loss": 2.0157, + "mean_token_accuracy": 0.5511785745620728, + "num_tokens": 4698602583.0, + "step": 9191 + }, + { + "epoch": 2.4856679286100594, + "grad_norm": 1.0050128698349, + "learning_rate": 1.1522312532029098e-05, + "loss": 1.8833, + "mean_token_accuracy": 0.5728328227996826, + "num_tokens": 4699126824.0, + "step": 9192 + }, + { + "epoch": 2.485938345051379, + "grad_norm": 1.1968967914581299, + "learning_rate": 1.1520738691889776e-05, + "loss": 1.9133, + "mean_token_accuracy": 0.5714439153671265, + "num_tokens": 4699651093.0, + "step": 9193 + }, + { + "epoch": 2.4862087614926986, + "grad_norm": 1.3966224193572998, + "learning_rate": 1.151916483577266e-05, + "loss": 1.8781, + "mean_token_accuracy": 0.5752866268157959, + "num_tokens": 4700142075.0, + "step": 9194 + }, + { + "epoch": 2.4864791779340183, + "grad_norm": 1.1803022623062134, + "learning_rate": 1.1517590963726041e-05, + "loss": 1.9156, + "mean_token_accuracy": 0.5671306252479553, + "num_tokens": 4700655098.0, + "step": 9195 + }, + { + "epoch": 2.486749594375338, + "grad_norm": 1.4193323850631714, + "learning_rate": 1.1516017075798211e-05, + "loss": 1.7882, + "mean_token_accuracy": 0.5945056676864624, + "num_tokens": 4701179378.0, + "step": 9196 + }, + { + "epoch": 2.4870200108166576, + "grad_norm": 1.4972306489944458, + "learning_rate": 1.1514443172037459e-05, + "loss": 1.8741, + "mean_token_accuracy": 0.5776148438453674, + "num_tokens": 4701653739.0, + "step": 9197 + }, + { + "epoch": 2.487290427257977, + "grad_norm": 1.0526468753814697, + "learning_rate": 1.1512869252492076e-05, + "loss": 1.9039, + "mean_token_accuracy": 0.5637317895889282, + "num_tokens": 4702142673.0, + "step": 9198 + }, + { + "epoch": 2.487560843699297, + "grad_norm": 0.9796486496925354, + "learning_rate": 1.1511295317210358e-05, + "loss": 1.6898, + "mean_token_accuracy": 0.6114177703857422, + "num_tokens": 4702666894.0, + "step": 9199 + }, + { + "epoch": 2.4878312601406165, + "grad_norm": 1.034832239151001, + "learning_rate": 1.1509721366240595e-05, + "loss": 1.7611, + "mean_token_accuracy": 0.5828307271003723, + "num_tokens": 4703191057.0, + "step": 9200 + }, + { + "epoch": 2.488101676581936, + "grad_norm": 0.48270735144615173, + "learning_rate": 1.1508147399631077e-05, + "loss": 1.0783, + "mean_token_accuracy": 0.7000621557235718, + "num_tokens": 4703715268.0, + "step": 9201 + }, + { + "epoch": 2.488372093023256, + "grad_norm": 1.472808837890625, + "learning_rate": 1.1506573417430112e-05, + "loss": 1.9267, + "mean_token_accuracy": 0.588629961013794, + "num_tokens": 4704096878.0, + "step": 9202 + }, + { + "epoch": 2.4886425094645754, + "grad_norm": 1.2138842344284058, + "learning_rate": 1.1504999419685979e-05, + "loss": 2.002, + "mean_token_accuracy": 0.5421420335769653, + "num_tokens": 4704620990.0, + "step": 9203 + }, + { + "epoch": 2.488912925905895, + "grad_norm": 1.0835143327713013, + "learning_rate": 1.1503425406446977e-05, + "loss": 1.9415, + "mean_token_accuracy": 0.5708630084991455, + "num_tokens": 4705133161.0, + "step": 9204 + }, + { + "epoch": 2.4891833423472147, + "grad_norm": 1.1134706735610962, + "learning_rate": 1.1501851377761409e-05, + "loss": 1.8905, + "mean_token_accuracy": 0.5715986490249634, + "num_tokens": 4705657389.0, + "step": 9205 + }, + { + "epoch": 2.4894537587885344, + "grad_norm": 0.9939489364624023, + "learning_rate": 1.1500277333677562e-05, + "loss": 1.798, + "mean_token_accuracy": 0.5715700387954712, + "num_tokens": 4706181660.0, + "step": 9206 + }, + { + "epoch": 2.489724175229854, + "grad_norm": 1.1108193397521973, + "learning_rate": 1.1498703274243732e-05, + "loss": 1.7057, + "mean_token_accuracy": 0.6010159254074097, + "num_tokens": 4706705815.0, + "step": 9207 + }, + { + "epoch": 2.4899945916711737, + "grad_norm": 1.614444613456726, + "learning_rate": 1.149712919950822e-05, + "loss": 1.9989, + "mean_token_accuracy": 0.5533497929573059, + "num_tokens": 4707213961.0, + "step": 9208 + }, + { + "epoch": 2.4902650081124933, + "grad_norm": 1.2640107870101929, + "learning_rate": 1.1495555109519323e-05, + "loss": 1.918, + "mean_token_accuracy": 0.5578933954238892, + "num_tokens": 4707737923.0, + "step": 9209 + }, + { + "epoch": 2.490535424553813, + "grad_norm": 1.1844654083251953, + "learning_rate": 1.1493981004325338e-05, + "loss": 1.976, + "mean_token_accuracy": 0.5533285140991211, + "num_tokens": 4708262105.0, + "step": 9210 + }, + { + "epoch": 2.4908058409951326, + "grad_norm": 1.2392823696136475, + "learning_rate": 1.1492406883974565e-05, + "loss": 2.0157, + "mean_token_accuracy": 0.5476480722427368, + "num_tokens": 4708786305.0, + "step": 9211 + }, + { + "epoch": 2.4910762574364522, + "grad_norm": 1.1699193716049194, + "learning_rate": 1.1490832748515302e-05, + "loss": 1.9735, + "mean_token_accuracy": 0.546307384967804, + "num_tokens": 4709310522.0, + "step": 9212 + }, + { + "epoch": 2.491346673877772, + "grad_norm": 1.0754379034042358, + "learning_rate": 1.1489258597995842e-05, + "loss": 1.945, + "mean_token_accuracy": 0.5480212569236755, + "num_tokens": 4709834729.0, + "step": 9213 + }, + { + "epoch": 2.4916170903190915, + "grad_norm": 1.1551378965377808, + "learning_rate": 1.1487684432464492e-05, + "loss": 1.9223, + "mean_token_accuracy": 0.5509417057037354, + "num_tokens": 4710358967.0, + "step": 9214 + }, + { + "epoch": 2.491887506760411, + "grad_norm": 1.1122277975082397, + "learning_rate": 1.1486110251969546e-05, + "loss": 1.8646, + "mean_token_accuracy": 0.5675047039985657, + "num_tokens": 4710883179.0, + "step": 9215 + }, + { + "epoch": 2.492157923201731, + "grad_norm": 1.1041041612625122, + "learning_rate": 1.1484536056559312e-05, + "loss": 1.9272, + "mean_token_accuracy": 0.5560268759727478, + "num_tokens": 4711381645.0, + "step": 9216 + }, + { + "epoch": 2.4924283396430504, + "grad_norm": 1.2174543142318726, + "learning_rate": 1.1482961846282088e-05, + "loss": 1.9256, + "mean_token_accuracy": 0.5586096048355103, + "num_tokens": 4711905806.0, + "step": 9217 + }, + { + "epoch": 2.49269875608437, + "grad_norm": 1.0635972023010254, + "learning_rate": 1.1481387621186168e-05, + "loss": 1.988, + "mean_token_accuracy": 0.5456838011741638, + "num_tokens": 4712430000.0, + "step": 9218 + }, + { + "epoch": 2.4929691725256897, + "grad_norm": 1.0815967321395874, + "learning_rate": 1.1479813381319866e-05, + "loss": 1.6644, + "mean_token_accuracy": 0.633599579334259, + "num_tokens": 4712927807.0, + "step": 9219 + }, + { + "epoch": 2.4932395889670094, + "grad_norm": 1.025539517402649, + "learning_rate": 1.1478239126731477e-05, + "loss": 1.9041, + "mean_token_accuracy": 0.5688619017601013, + "num_tokens": 4713451931.0, + "step": 9220 + }, + { + "epoch": 2.493510005408329, + "grad_norm": 0.5214962959289551, + "learning_rate": 1.1476664857469306e-05, + "loss": 1.1802, + "mean_token_accuracy": 0.6815565824508667, + "num_tokens": 4713976055.0, + "step": 9221 + }, + { + "epoch": 2.4937804218496487, + "grad_norm": 1.611085057258606, + "learning_rate": 1.1475090573581652e-05, + "loss": 1.7975, + "mean_token_accuracy": 0.5758612155914307, + "num_tokens": 4714500324.0, + "step": 9222 + }, + { + "epoch": 2.494050838290968, + "grad_norm": 1.4252488613128662, + "learning_rate": 1.1473516275116825e-05, + "loss": 1.9662, + "mean_token_accuracy": 0.5395232439041138, + "num_tokens": 4715024600.0, + "step": 9223 + }, + { + "epoch": 2.494321254732288, + "grad_norm": 1.2018858194351196, + "learning_rate": 1.1471941962123125e-05, + "loss": 1.8479, + "mean_token_accuracy": 0.5588420629501343, + "num_tokens": 4715548739.0, + "step": 9224 + }, + { + "epoch": 2.494591671173607, + "grad_norm": 1.4531450271606445, + "learning_rate": 1.147036763464886e-05, + "loss": 1.8798, + "mean_token_accuracy": 0.556071937084198, + "num_tokens": 4716072927.0, + "step": 9225 + }, + { + "epoch": 2.4948620876149272, + "grad_norm": 1.121414065361023, + "learning_rate": 1.146879329274233e-05, + "loss": 1.8238, + "mean_token_accuracy": 0.5683582425117493, + "num_tokens": 4716597178.0, + "step": 9226 + }, + { + "epoch": 2.4951325040562464, + "grad_norm": 1.1009513139724731, + "learning_rate": 1.1467218936451842e-05, + "loss": 1.8874, + "mean_token_accuracy": 0.5637375116348267, + "num_tokens": 4717121420.0, + "step": 9227 + }, + { + "epoch": 2.495402920497566, + "grad_norm": 1.1087254285812378, + "learning_rate": 1.146564456582571e-05, + "loss": 1.9471, + "mean_token_accuracy": 0.555557906627655, + "num_tokens": 4717613165.0, + "step": 9228 + }, + { + "epoch": 2.4956733369388857, + "grad_norm": 1.3328946828842163, + "learning_rate": 1.1464070180912223e-05, + "loss": 1.9098, + "mean_token_accuracy": 0.571831464767456, + "num_tokens": 4718137418.0, + "step": 9229 + }, + { + "epoch": 2.4959437533802054, + "grad_norm": 1.101939082145691, + "learning_rate": 1.1462495781759704e-05, + "loss": 1.96, + "mean_token_accuracy": 0.5608422160148621, + "num_tokens": 4718661669.0, + "step": 9230 + }, + { + "epoch": 2.496214169821525, + "grad_norm": 0.9942423701286316, + "learning_rate": 1.1460921368416453e-05, + "loss": 1.7993, + "mean_token_accuracy": 0.5677392482757568, + "num_tokens": 4719185824.0, + "step": 9231 + }, + { + "epoch": 2.4964845862628446, + "grad_norm": 1.0928293466567993, + "learning_rate": 1.145934694093078e-05, + "loss": 1.8066, + "mean_token_accuracy": 0.5901584625244141, + "num_tokens": 4719652673.0, + "step": 9232 + }, + { + "epoch": 2.4967550027041643, + "grad_norm": 1.0516326427459717, + "learning_rate": 1.1457772499350992e-05, + "loss": 1.8594, + "mean_token_accuracy": 0.5834478139877319, + "num_tokens": 4720116480.0, + "step": 9233 + }, + { + "epoch": 2.497025419145484, + "grad_norm": 1.2670366764068604, + "learning_rate": 1.1456198043725398e-05, + "loss": 1.8647, + "mean_token_accuracy": 0.563700795173645, + "num_tokens": 4720640627.0, + "step": 9234 + }, + { + "epoch": 2.4972958355868036, + "grad_norm": 1.2383164167404175, + "learning_rate": 1.1454623574102306e-05, + "loss": 1.8939, + "mean_token_accuracy": 0.5494192838668823, + "num_tokens": 4721164905.0, + "step": 9235 + }, + { + "epoch": 2.497566252028123, + "grad_norm": 1.470125675201416, + "learning_rate": 1.1453049090530024e-05, + "loss": 1.9086, + "mean_token_accuracy": 0.5707482099533081, + "num_tokens": 4721689061.0, + "step": 9236 + }, + { + "epoch": 2.497836668469443, + "grad_norm": 1.1970205307006836, + "learning_rate": 1.1451474593056863e-05, + "loss": 1.8855, + "mean_token_accuracy": 0.554007351398468, + "num_tokens": 4722213201.0, + "step": 9237 + }, + { + "epoch": 2.4981070849107625, + "grad_norm": 1.404576063156128, + "learning_rate": 1.1449900081731138e-05, + "loss": 1.9222, + "mean_token_accuracy": 0.5653765201568604, + "num_tokens": 4722737345.0, + "step": 9238 + }, + { + "epoch": 2.498377501352082, + "grad_norm": 1.335107445716858, + "learning_rate": 1.1448325556601154e-05, + "loss": 1.8842, + "mean_token_accuracy": 0.5545773506164551, + "num_tokens": 4723261552.0, + "step": 9239 + }, + { + "epoch": 2.498647917793402, + "grad_norm": 1.1961383819580078, + "learning_rate": 1.1446751017715223e-05, + "loss": 1.8694, + "mean_token_accuracy": 0.5506695508956909, + "num_tokens": 4723785748.0, + "step": 9240 + }, + { + "epoch": 2.4989183342347214, + "grad_norm": 0.4495481252670288, + "learning_rate": 1.1445176465121654e-05, + "loss": 1.0395, + "mean_token_accuracy": 0.7176722288131714, + "num_tokens": 4724310023.0, + "step": 9241 + }, + { + "epoch": 2.499188750676041, + "grad_norm": 1.641504168510437, + "learning_rate": 1.1443601898868768e-05, + "loss": 1.876, + "mean_token_accuracy": 0.567817747592926, + "num_tokens": 4724749364.0, + "step": 9242 + }, + { + "epoch": 2.4994591671173607, + "grad_norm": 1.4776285886764526, + "learning_rate": 1.1442027319004865e-05, + "loss": 1.7741, + "mean_token_accuracy": 0.5692111253738403, + "num_tokens": 4725273646.0, + "step": 9243 + }, + { + "epoch": 2.4997295835586804, + "grad_norm": 1.1159162521362305, + "learning_rate": 1.1440452725578265e-05, + "loss": 1.7587, + "mean_token_accuracy": 0.5693255066871643, + "num_tokens": 4725766468.0, + "step": 9244 + }, + { + "epoch": 2.5, + "grad_norm": 1.428576946258545, + "learning_rate": 1.1438878118637283e-05, + "loss": 1.92, + "mean_token_accuracy": 0.5658882260322571, + "num_tokens": 4726238359.0, + "step": 9245 + }, + { + "epoch": 2.5002704164413196, + "grad_norm": 1.388208031654358, + "learning_rate": 1.1437303498230227e-05, + "loss": 2.0113, + "mean_token_accuracy": 0.5414552688598633, + "num_tokens": 4726762637.0, + "step": 9246 + }, + { + "epoch": 2.5005408328826393, + "grad_norm": 1.243204951286316, + "learning_rate": 1.1435728864405415e-05, + "loss": 1.8766, + "mean_token_accuracy": 0.5782797336578369, + "num_tokens": 4727189678.0, + "step": 9247 + }, + { + "epoch": 2.500811249323959, + "grad_norm": 1.1553200483322144, + "learning_rate": 1.1434154217211164e-05, + "loss": 1.9388, + "mean_token_accuracy": 0.5462860465049744, + "num_tokens": 4727713803.0, + "step": 9248 + }, + { + "epoch": 2.5010816657652786, + "grad_norm": 1.3302457332611084, + "learning_rate": 1.143257955669578e-05, + "loss": 1.9481, + "mean_token_accuracy": 0.5386509299278259, + "num_tokens": 4728237980.0, + "step": 9249 + }, + { + "epoch": 2.501352082206598, + "grad_norm": 1.2618913650512695, + "learning_rate": 1.1431004882907583e-05, + "loss": 1.7603, + "mean_token_accuracy": 0.5937872529029846, + "num_tokens": 4728730821.0, + "step": 9250 + }, + { + "epoch": 2.501622498647918, + "grad_norm": 1.601582407951355, + "learning_rate": 1.1429430195894888e-05, + "loss": 2.0167, + "mean_token_accuracy": 0.5360531806945801, + "num_tokens": 4729254985.0, + "step": 9251 + }, + { + "epoch": 2.5018929150892375, + "grad_norm": 1.4210612773895264, + "learning_rate": 1.1427855495706015e-05, + "loss": 1.9824, + "mean_token_accuracy": 0.5479574799537659, + "num_tokens": 4729729367.0, + "step": 9252 + }, + { + "epoch": 2.502163331530557, + "grad_norm": 1.1698517799377441, + "learning_rate": 1.1426280782389274e-05, + "loss": 1.8777, + "mean_token_accuracy": 0.5561801195144653, + "num_tokens": 4730226037.0, + "step": 9253 + }, + { + "epoch": 2.502433747971877, + "grad_norm": 1.4540890455245972, + "learning_rate": 1.1424706055992987e-05, + "loss": 1.8838, + "mean_token_accuracy": 0.5800584554672241, + "num_tokens": 4730723485.0, + "step": 9254 + }, + { + "epoch": 2.5027041644131964, + "grad_norm": 1.5526601076126099, + "learning_rate": 1.142313131656547e-05, + "loss": 1.9146, + "mean_token_accuracy": 0.5637243986129761, + "num_tokens": 4731224551.0, + "step": 9255 + }, + { + "epoch": 2.502974580854516, + "grad_norm": 1.3190116882324219, + "learning_rate": 1.1421556564155038e-05, + "loss": 1.8892, + "mean_token_accuracy": 0.5735989212989807, + "num_tokens": 4731748737.0, + "step": 9256 + }, + { + "epoch": 2.5032449972958357, + "grad_norm": 1.3390755653381348, + "learning_rate": 1.1419981798810012e-05, + "loss": 1.7571, + "mean_token_accuracy": 0.5596926212310791, + "num_tokens": 4732272881.0, + "step": 9257 + }, + { + "epoch": 2.5035154137371554, + "grad_norm": 1.512819766998291, + "learning_rate": 1.1418407020578708e-05, + "loss": 1.9816, + "mean_token_accuracy": 0.5560945868492126, + "num_tokens": 4732797122.0, + "step": 9258 + }, + { + "epoch": 2.503785830178475, + "grad_norm": 1.6316676139831543, + "learning_rate": 1.1416832229509447e-05, + "loss": 1.9841, + "mean_token_accuracy": 0.5533527731895447, + "num_tokens": 4733321359.0, + "step": 9259 + }, + { + "epoch": 2.5040562466197946, + "grad_norm": 1.3208050727844238, + "learning_rate": 1.1415257425650546e-05, + "loss": 1.8579, + "mean_token_accuracy": 0.5707699060440063, + "num_tokens": 4733845559.0, + "step": 9260 + }, + { + "epoch": 2.5043266630611143, + "grad_norm": 0.5263083577156067, + "learning_rate": 1.141368260905033e-05, + "loss": 1.0619, + "mean_token_accuracy": 0.7076284289360046, + "num_tokens": 4734369827.0, + "step": 9261 + }, + { + "epoch": 2.5045970795024335, + "grad_norm": 2.0052096843719482, + "learning_rate": 1.1412107779757111e-05, + "loss": 1.8402, + "mean_token_accuracy": 0.5815292596817017, + "num_tokens": 4734893984.0, + "step": 9262 + }, + { + "epoch": 2.5048674959437536, + "grad_norm": 1.9624156951904297, + "learning_rate": 1.1410532937819214e-05, + "loss": 1.9195, + "mean_token_accuracy": 0.564042329788208, + "num_tokens": 4735396700.0, + "step": 9263 + }, + { + "epoch": 2.5051379123850728, + "grad_norm": 1.4809190034866333, + "learning_rate": 1.140895808328496e-05, + "loss": 1.9094, + "mean_token_accuracy": 0.5562708973884583, + "num_tokens": 4735920889.0, + "step": 9264 + }, + { + "epoch": 2.505408328826393, + "grad_norm": 1.207426905632019, + "learning_rate": 1.1407383216202668e-05, + "loss": 1.7983, + "mean_token_accuracy": 0.58168625831604, + "num_tokens": 4736444986.0, + "step": 9265 + }, + { + "epoch": 2.505678745267712, + "grad_norm": 1.8097294569015503, + "learning_rate": 1.140580833662066e-05, + "loss": 1.9869, + "mean_token_accuracy": 0.546832799911499, + "num_tokens": 4736969063.0, + "step": 9266 + }, + { + "epoch": 2.505949161709032, + "grad_norm": 1.297081708908081, + "learning_rate": 1.1404233444587262e-05, + "loss": 1.9457, + "mean_token_accuracy": 0.5570518374443054, + "num_tokens": 4737493337.0, + "step": 9267 + }, + { + "epoch": 2.5062195781503513, + "grad_norm": 1.0573933124542236, + "learning_rate": 1.1402658540150795e-05, + "loss": 1.9162, + "mean_token_accuracy": 0.5457106232643127, + "num_tokens": 4738010889.0, + "step": 9268 + }, + { + "epoch": 2.5064899945916714, + "grad_norm": 1.328458309173584, + "learning_rate": 1.1401083623359576e-05, + "loss": 1.7744, + "mean_token_accuracy": 0.580805778503418, + "num_tokens": 4738535017.0, + "step": 9269 + }, + { + "epoch": 2.5067604110329906, + "grad_norm": 1.1612820625305176, + "learning_rate": 1.1399508694261934e-05, + "loss": 1.8939, + "mean_token_accuracy": 0.5698184370994568, + "num_tokens": 4739050763.0, + "step": 9270 + }, + { + "epoch": 2.5070308274743103, + "grad_norm": 1.069224238395691, + "learning_rate": 1.1397933752906187e-05, + "loss": 1.8891, + "mean_token_accuracy": 0.5732810497283936, + "num_tokens": 4739502772.0, + "step": 9271 + }, + { + "epoch": 2.50730124391563, + "grad_norm": 1.3203665018081665, + "learning_rate": 1.1396358799340664e-05, + "loss": 1.7255, + "mean_token_accuracy": 0.6024186611175537, + "num_tokens": 4739997450.0, + "step": 9272 + }, + { + "epoch": 2.5075716603569496, + "grad_norm": 1.145308017730713, + "learning_rate": 1.1394783833613687e-05, + "loss": 1.8633, + "mean_token_accuracy": 0.5696269273757935, + "num_tokens": 4740521729.0, + "step": 9273 + }, + { + "epoch": 2.507842076798269, + "grad_norm": 1.1797677278518677, + "learning_rate": 1.1393208855773585e-05, + "loss": 1.7836, + "mean_token_accuracy": 0.5799506902694702, + "num_tokens": 4741023034.0, + "step": 9274 + }, + { + "epoch": 2.508112493239589, + "grad_norm": 1.218640685081482, + "learning_rate": 1.1391633865868674e-05, + "loss": 1.877, + "mean_token_accuracy": 0.5778146982192993, + "num_tokens": 4741547265.0, + "step": 9275 + }, + { + "epoch": 2.5083829096809085, + "grad_norm": 1.0850462913513184, + "learning_rate": 1.1390058863947287e-05, + "loss": 1.8135, + "mean_token_accuracy": 0.5733243823051453, + "num_tokens": 4742071467.0, + "step": 9276 + }, + { + "epoch": 2.508653326122228, + "grad_norm": 1.0611687898635864, + "learning_rate": 1.1388483850057746e-05, + "loss": 1.8361, + "mean_token_accuracy": 0.576750636100769, + "num_tokens": 4742575549.0, + "step": 9277 + }, + { + "epoch": 2.5089237425635478, + "grad_norm": 1.2431108951568604, + "learning_rate": 1.1386908824248379e-05, + "loss": 1.8606, + "mean_token_accuracy": 0.565627932548523, + "num_tokens": 4743099806.0, + "step": 9278 + }, + { + "epoch": 2.5091941590048674, + "grad_norm": 1.1411216259002686, + "learning_rate": 1.138533378656751e-05, + "loss": 2.0108, + "mean_token_accuracy": 0.5478113889694214, + "num_tokens": 4743617453.0, + "step": 9279 + }, + { + "epoch": 2.509464575446187, + "grad_norm": 1.1586495637893677, + "learning_rate": 1.1383758737063466e-05, + "loss": 1.8901, + "mean_token_accuracy": 0.5624303817749023, + "num_tokens": 4744141468.0, + "step": 9280 + }, + { + "epoch": 2.5097349918875067, + "grad_norm": 0.4862288534641266, + "learning_rate": 1.1382183675784577e-05, + "loss": 1.0969, + "mean_token_accuracy": 0.7155752182006836, + "num_tokens": 4744611487.0, + "step": 9281 + }, + { + "epoch": 2.5100054083288263, + "grad_norm": 1.5916814804077148, + "learning_rate": 1.138060860277917e-05, + "loss": 1.9738, + "mean_token_accuracy": 0.5488513708114624, + "num_tokens": 4745094394.0, + "step": 9282 + }, + { + "epoch": 2.510275824770146, + "grad_norm": 1.0802360773086548, + "learning_rate": 1.137903351809557e-05, + "loss": 1.8801, + "mean_token_accuracy": 0.5742653608322144, + "num_tokens": 4745585836.0, + "step": 9283 + }, + { + "epoch": 2.5105462412114656, + "grad_norm": 0.9215701222419739, + "learning_rate": 1.137745842178211e-05, + "loss": 1.9661, + "mean_token_accuracy": 0.5612262487411499, + "num_tokens": 4746110051.0, + "step": 9284 + }, + { + "epoch": 2.5108166576527853, + "grad_norm": 1.1627285480499268, + "learning_rate": 1.1375883313887113e-05, + "loss": 1.8058, + "mean_token_accuracy": 0.5727255344390869, + "num_tokens": 4746634255.0, + "step": 9285 + }, + { + "epoch": 2.511087074094105, + "grad_norm": 1.162107229232788, + "learning_rate": 1.137430819445891e-05, + "loss": 1.8328, + "mean_token_accuracy": 0.5780404806137085, + "num_tokens": 4747158407.0, + "step": 9286 + }, + { + "epoch": 2.5113574905354246, + "grad_norm": 1.1297417879104614, + "learning_rate": 1.137273306354583e-05, + "loss": 1.8559, + "mean_token_accuracy": 0.5745680928230286, + "num_tokens": 4747641052.0, + "step": 9287 + }, + { + "epoch": 2.511627906976744, + "grad_norm": 1.1653739213943481, + "learning_rate": 1.1371157921196207e-05, + "loss": 1.7637, + "mean_token_accuracy": 0.5930806398391724, + "num_tokens": 4748145595.0, + "step": 9288 + }, + { + "epoch": 2.511898323418064, + "grad_norm": 1.408103585243225, + "learning_rate": 1.1369582767458365e-05, + "loss": 1.9061, + "mean_token_accuracy": 0.564012348651886, + "num_tokens": 4748669828.0, + "step": 9289 + }, + { + "epoch": 2.5121687398593835, + "grad_norm": 1.2578470706939697, + "learning_rate": 1.1368007602380638e-05, + "loss": 1.9388, + "mean_token_accuracy": 0.5374468564987183, + "num_tokens": 4749194048.0, + "step": 9290 + }, + { + "epoch": 2.512439156300703, + "grad_norm": 1.4088010787963867, + "learning_rate": 1.1366432426011356e-05, + "loss": 1.8405, + "mean_token_accuracy": 0.5761423110961914, + "num_tokens": 4749628025.0, + "step": 9291 + }, + { + "epoch": 2.512709572742023, + "grad_norm": 1.3006510734558105, + "learning_rate": 1.1364857238398847e-05, + "loss": 1.8572, + "mean_token_accuracy": 0.5845493674278259, + "num_tokens": 4750152304.0, + "step": 9292 + }, + { + "epoch": 2.5129799891833424, + "grad_norm": 1.047145962715149, + "learning_rate": 1.1363282039591447e-05, + "loss": 1.9772, + "mean_token_accuracy": 0.5481109619140625, + "num_tokens": 4750668962.0, + "step": 9293 + }, + { + "epoch": 2.513250405624662, + "grad_norm": 1.5326764583587646, + "learning_rate": 1.1361706829637487e-05, + "loss": 2.0054, + "mean_token_accuracy": 0.5587459206581116, + "num_tokens": 4751122836.0, + "step": 9294 + }, + { + "epoch": 2.5135208220659817, + "grad_norm": 1.477755069732666, + "learning_rate": 1.1360131608585295e-05, + "loss": 1.8455, + "mean_token_accuracy": 0.5688447952270508, + "num_tokens": 4751646995.0, + "step": 9295 + }, + { + "epoch": 2.5137912385073014, + "grad_norm": 1.0799336433410645, + "learning_rate": 1.1358556376483205e-05, + "loss": 1.7905, + "mean_token_accuracy": 0.6075970530509949, + "num_tokens": 4752171263.0, + "step": 9296 + }, + { + "epoch": 2.514061654948621, + "grad_norm": 1.2772352695465088, + "learning_rate": 1.1356981133379557e-05, + "loss": 2.0554, + "mean_token_accuracy": 0.5776946544647217, + "num_tokens": 4752589411.0, + "step": 9297 + }, + { + "epoch": 2.5143320713899406, + "grad_norm": 1.281266689300537, + "learning_rate": 1.1355405879322672e-05, + "loss": 1.8776, + "mean_token_accuracy": 0.5790002346038818, + "num_tokens": 4753068734.0, + "step": 9298 + }, + { + "epoch": 2.5146024878312603, + "grad_norm": 1.0477267503738403, + "learning_rate": 1.1353830614360893e-05, + "loss": 1.9627, + "mean_token_accuracy": 0.5706050992012024, + "num_tokens": 4753592936.0, + "step": 9299 + }, + { + "epoch": 2.51487290427258, + "grad_norm": 1.2015446424484253, + "learning_rate": 1.1352255338542547e-05, + "loss": 1.9586, + "mean_token_accuracy": 0.5551308393478394, + "num_tokens": 4754062934.0, + "step": 9300 + }, + { + "epoch": 2.5151433207138996, + "grad_norm": 0.44008105993270874, + "learning_rate": 1.1350680051915972e-05, + "loss": 1.0018, + "mean_token_accuracy": 0.7289210557937622, + "num_tokens": 4754587108.0, + "step": 9301 + }, + { + "epoch": 2.515413737155219, + "grad_norm": 1.8515832424163818, + "learning_rate": 1.1349104754529503e-05, + "loss": 1.9722, + "mean_token_accuracy": 0.5462777614593506, + "num_tokens": 4755111350.0, + "step": 9302 + }, + { + "epoch": 2.5156841535965384, + "grad_norm": 1.1940914392471313, + "learning_rate": 1.1347529446431473e-05, + "loss": 1.8942, + "mean_token_accuracy": 0.5528521537780762, + "num_tokens": 4755635494.0, + "step": 9303 + }, + { + "epoch": 2.5159545700378585, + "grad_norm": 1.0796493291854858, + "learning_rate": 1.134595412767022e-05, + "loss": 1.9115, + "mean_token_accuracy": 0.5493407845497131, + "num_tokens": 4756150813.0, + "step": 9304 + }, + { + "epoch": 2.5162249864791777, + "grad_norm": 1.2001352310180664, + "learning_rate": 1.1344378798294073e-05, + "loss": 1.8858, + "mean_token_accuracy": 0.5945752263069153, + "num_tokens": 4756610562.0, + "step": 9305 + }, + { + "epoch": 2.516495402920498, + "grad_norm": 1.4557079076766968, + "learning_rate": 1.1342803458351375e-05, + "loss": 1.9618, + "mean_token_accuracy": 0.5598871111869812, + "num_tokens": 4757134811.0, + "step": 9306 + }, + { + "epoch": 2.516765819361817, + "grad_norm": 1.20845627784729, + "learning_rate": 1.1341228107890456e-05, + "loss": 1.8717, + "mean_token_accuracy": 0.5749416351318359, + "num_tokens": 4757659069.0, + "step": 9307 + }, + { + "epoch": 2.517036235803137, + "grad_norm": 1.2085814476013184, + "learning_rate": 1.1339652746959655e-05, + "loss": 2.024, + "mean_token_accuracy": 0.5462145209312439, + "num_tokens": 4758183355.0, + "step": 9308 + }, + { + "epoch": 2.5173066522444563, + "grad_norm": 1.270471453666687, + "learning_rate": 1.133807737560731e-05, + "loss": 1.8687, + "mean_token_accuracy": 0.562137246131897, + "num_tokens": 4758707582.0, + "step": 9309 + }, + { + "epoch": 2.5175770686857764, + "grad_norm": 1.1910645961761475, + "learning_rate": 1.1336501993881757e-05, + "loss": 1.8941, + "mean_token_accuracy": 0.566781222820282, + "num_tokens": 4759231814.0, + "step": 9310 + }, + { + "epoch": 2.5178474851270956, + "grad_norm": 1.2801960706710815, + "learning_rate": 1.1334926601831336e-05, + "loss": 1.7956, + "mean_token_accuracy": 0.5938827991485596, + "num_tokens": 4759756055.0, + "step": 9311 + }, + { + "epoch": 2.518117901568415, + "grad_norm": 0.9723873138427734, + "learning_rate": 1.1333351199504379e-05, + "loss": 1.8485, + "mean_token_accuracy": 0.5765194892883301, + "num_tokens": 4760280331.0, + "step": 9312 + }, + { + "epoch": 2.518388318009735, + "grad_norm": 1.2810603380203247, + "learning_rate": 1.1331775786949226e-05, + "loss": 1.745, + "mean_token_accuracy": 0.6117091774940491, + "num_tokens": 4760767986.0, + "step": 9313 + }, + { + "epoch": 2.5186587344510545, + "grad_norm": 1.275606632232666, + "learning_rate": 1.1330200364214217e-05, + "loss": 1.9598, + "mean_token_accuracy": 0.5421150922775269, + "num_tokens": 4761292268.0, + "step": 9314 + }, + { + "epoch": 2.518929150892374, + "grad_norm": 1.0451323986053467, + "learning_rate": 1.1328624931347689e-05, + "loss": 1.8447, + "mean_token_accuracy": 0.5813011527061462, + "num_tokens": 4761816504.0, + "step": 9315 + }, + { + "epoch": 2.5191995673336938, + "grad_norm": 1.2306514978408813, + "learning_rate": 1.1327049488397985e-05, + "loss": 1.8692, + "mean_token_accuracy": 0.5735572576522827, + "num_tokens": 4762340678.0, + "step": 9316 + }, + { + "epoch": 2.5194699837750134, + "grad_norm": 1.0937366485595703, + "learning_rate": 1.132547403541344e-05, + "loss": 1.9426, + "mean_token_accuracy": 0.5603146553039551, + "num_tokens": 4762856410.0, + "step": 9317 + }, + { + "epoch": 2.519740400216333, + "grad_norm": 1.0148440599441528, + "learning_rate": 1.1323898572442394e-05, + "loss": 1.9285, + "mean_token_accuracy": 0.5646722316741943, + "num_tokens": 4763380684.0, + "step": 9318 + }, + { + "epoch": 2.5200108166576527, + "grad_norm": 1.5055474042892456, + "learning_rate": 1.1322323099533186e-05, + "loss": 1.8859, + "mean_token_accuracy": 0.5761871337890625, + "num_tokens": 4763871297.0, + "step": 9319 + }, + { + "epoch": 2.5202812330989723, + "grad_norm": 1.1734914779663086, + "learning_rate": 1.1320747616734157e-05, + "loss": 1.7486, + "mean_token_accuracy": 0.610429048538208, + "num_tokens": 4764395445.0, + "step": 9320 + }, + { + "epoch": 2.520551649540292, + "grad_norm": 0.41295552253723145, + "learning_rate": 1.1319172124093647e-05, + "loss": 1.0855, + "mean_token_accuracy": 0.7165169715881348, + "num_tokens": 4764919617.0, + "step": 9321 + }, + { + "epoch": 2.5208220659816116, + "grad_norm": 1.6257163286209106, + "learning_rate": 1.1317596621660001e-05, + "loss": 1.8661, + "mean_token_accuracy": 0.5687457919120789, + "num_tokens": 4765443665.0, + "step": 9322 + }, + { + "epoch": 2.5210924824229313, + "grad_norm": 1.5113879442214966, + "learning_rate": 1.1316021109481552e-05, + "loss": 1.9207, + "mean_token_accuracy": 0.5764285326004028, + "num_tokens": 4765967868.0, + "step": 9323 + }, + { + "epoch": 2.521362898864251, + "grad_norm": 1.1396043300628662, + "learning_rate": 1.1314445587606653e-05, + "loss": 1.9116, + "mean_token_accuracy": 0.5597830414772034, + "num_tokens": 4766492125.0, + "step": 9324 + }, + { + "epoch": 2.5216333153055706, + "grad_norm": 1.370273232460022, + "learning_rate": 1.1312870056083631e-05, + "loss": 1.8023, + "mean_token_accuracy": 0.5964168310165405, + "num_tokens": 4767016336.0, + "step": 9325 + }, + { + "epoch": 2.52190373174689, + "grad_norm": 1.4970920085906982, + "learning_rate": 1.1311294514960843e-05, + "loss": 1.9487, + "mean_token_accuracy": 0.5628026723861694, + "num_tokens": 4767512051.0, + "step": 9326 + }, + { + "epoch": 2.52217414818821, + "grad_norm": 1.197220802307129, + "learning_rate": 1.1309718964286617e-05, + "loss": 1.8681, + "mean_token_accuracy": 0.5861825942993164, + "num_tokens": 4768010349.0, + "step": 9327 + }, + { + "epoch": 2.5224445646295295, + "grad_norm": 1.301340103149414, + "learning_rate": 1.130814340410931e-05, + "loss": 1.888, + "mean_token_accuracy": 0.5693992376327515, + "num_tokens": 4768534498.0, + "step": 9328 + }, + { + "epoch": 2.522714981070849, + "grad_norm": 0.9991385340690613, + "learning_rate": 1.1306567834477248e-05, + "loss": 1.8605, + "mean_token_accuracy": 0.5712624788284302, + "num_tokens": 4769058715.0, + "step": 9329 + }, + { + "epoch": 2.5229853975121688, + "grad_norm": 1.1927967071533203, + "learning_rate": 1.130499225543879e-05, + "loss": 1.9028, + "mean_token_accuracy": 0.5626071691513062, + "num_tokens": 4769580660.0, + "step": 9330 + }, + { + "epoch": 2.5232558139534884, + "grad_norm": 1.1243712902069092, + "learning_rate": 1.1303416667042269e-05, + "loss": 1.8984, + "mean_token_accuracy": 0.5689729452133179, + "num_tokens": 4770104926.0, + "step": 9331 + }, + { + "epoch": 2.523526230394808, + "grad_norm": 1.387205958366394, + "learning_rate": 1.1301841069336035e-05, + "loss": 2.0446, + "mean_token_accuracy": 0.5446515083312988, + "num_tokens": 4770568307.0, + "step": 9332 + }, + { + "epoch": 2.5237966468361277, + "grad_norm": 0.9687219262123108, + "learning_rate": 1.1300265462368427e-05, + "loss": 1.8616, + "mean_token_accuracy": 0.5706839561462402, + "num_tokens": 4771092590.0, + "step": 9333 + }, + { + "epoch": 2.5240670632774473, + "grad_norm": 0.9891135096549988, + "learning_rate": 1.1298689846187794e-05, + "loss": 1.8584, + "mean_token_accuracy": 0.5804711580276489, + "num_tokens": 4771616834.0, + "step": 9334 + }, + { + "epoch": 2.524337479718767, + "grad_norm": 1.055310606956482, + "learning_rate": 1.1297114220842478e-05, + "loss": 1.8763, + "mean_token_accuracy": 0.5881765484809875, + "num_tokens": 4772141028.0, + "step": 9335 + }, + { + "epoch": 2.5246078961600866, + "grad_norm": 1.1305451393127441, + "learning_rate": 1.1295538586380819e-05, + "loss": 1.9114, + "mean_token_accuracy": 0.5466346740722656, + "num_tokens": 4772655842.0, + "step": 9336 + }, + { + "epoch": 2.5248783126014063, + "grad_norm": 0.8857030868530273, + "learning_rate": 1.1293962942851173e-05, + "loss": 1.8486, + "mean_token_accuracy": 0.5604581832885742, + "num_tokens": 4773180038.0, + "step": 9337 + }, + { + "epoch": 2.525148729042726, + "grad_norm": 1.132614016532898, + "learning_rate": 1.1292387290301876e-05, + "loss": 1.9914, + "mean_token_accuracy": 0.5399301648139954, + "num_tokens": 4773704282.0, + "step": 9338 + }, + { + "epoch": 2.5254191454840456, + "grad_norm": 1.2590851783752441, + "learning_rate": 1.129081162878128e-05, + "loss": 1.8788, + "mean_token_accuracy": 0.577301025390625, + "num_tokens": 4774173218.0, + "step": 9339 + }, + { + "epoch": 2.525689561925365, + "grad_norm": 0.9727911949157715, + "learning_rate": 1.1289235958337728e-05, + "loss": 1.8423, + "mean_token_accuracy": 0.581229567527771, + "num_tokens": 4774697476.0, + "step": 9340 + }, + { + "epoch": 2.525959978366685, + "grad_norm": 0.47755369544029236, + "learning_rate": 1.1287660279019564e-05, + "loss": 1.1181, + "mean_token_accuracy": 0.6902182102203369, + "num_tokens": 4775221561.0, + "step": 9341 + }, + { + "epoch": 2.5262303948080045, + "grad_norm": 1.193168044090271, + "learning_rate": 1.1286084590875137e-05, + "loss": 1.8918, + "mean_token_accuracy": 0.5436803698539734, + "num_tokens": 4775745812.0, + "step": 9342 + }, + { + "epoch": 2.526500811249324, + "grad_norm": 1.2722243070602417, + "learning_rate": 1.1284508893952793e-05, + "loss": 1.9327, + "mean_token_accuracy": 0.5521903038024902, + "num_tokens": 4776270015.0, + "step": 9343 + }, + { + "epoch": 2.5267712276906433, + "grad_norm": 1.0888410806655884, + "learning_rate": 1.128293318830088e-05, + "loss": 1.9057, + "mean_token_accuracy": 0.5622044801712036, + "num_tokens": 4776794294.0, + "step": 9344 + }, + { + "epoch": 2.5270416441319634, + "grad_norm": 1.5141916275024414, + "learning_rate": 1.1281357473967744e-05, + "loss": 1.9784, + "mean_token_accuracy": 0.5259653925895691, + "num_tokens": 4777318477.0, + "step": 9345 + }, + { + "epoch": 2.5273120605732826, + "grad_norm": 1.0512185096740723, + "learning_rate": 1.1279781751001735e-05, + "loss": 1.8703, + "mean_token_accuracy": 0.5805879831314087, + "num_tokens": 4777842755.0, + "step": 9346 + }, + { + "epoch": 2.5275824770146027, + "grad_norm": 1.2380956411361694, + "learning_rate": 1.1278206019451198e-05, + "loss": 1.917, + "mean_token_accuracy": 0.5729140043258667, + "num_tokens": 4778361176.0, + "step": 9347 + }, + { + "epoch": 2.527852893455922, + "grad_norm": 1.0685936212539673, + "learning_rate": 1.1276630279364485e-05, + "loss": 1.8344, + "mean_token_accuracy": 0.5837885141372681, + "num_tokens": 4778860877.0, + "step": 9348 + }, + { + "epoch": 2.528123309897242, + "grad_norm": 1.2939599752426147, + "learning_rate": 1.1275054530789938e-05, + "loss": 1.9335, + "mean_token_accuracy": 0.5441049337387085, + "num_tokens": 4779385145.0, + "step": 9349 + }, + { + "epoch": 2.528393726338561, + "grad_norm": 1.3807703256607056, + "learning_rate": 1.127347877377591e-05, + "loss": 1.9642, + "mean_token_accuracy": 0.5634868741035461, + "num_tokens": 4779909350.0, + "step": 9350 + }, + { + "epoch": 2.5286641427798813, + "grad_norm": 1.179671287536621, + "learning_rate": 1.1271903008370748e-05, + "loss": 1.8281, + "mean_token_accuracy": 0.5747007131576538, + "num_tokens": 4780422112.0, + "step": 9351 + }, + { + "epoch": 2.5289345592212005, + "grad_norm": 1.078857183456421, + "learning_rate": 1.1270327234622802e-05, + "loss": 1.9058, + "mean_token_accuracy": 0.5592542886734009, + "num_tokens": 4780895777.0, + "step": 9352 + }, + { + "epoch": 2.52920497566252, + "grad_norm": 1.214362621307373, + "learning_rate": 1.1268751452580424e-05, + "loss": 1.9354, + "mean_token_accuracy": 0.5540338754653931, + "num_tokens": 4781419985.0, + "step": 9353 + }, + { + "epoch": 2.5294753921038398, + "grad_norm": 1.041933298110962, + "learning_rate": 1.126717566229196e-05, + "loss": 1.8146, + "mean_token_accuracy": 0.5711358785629272, + "num_tokens": 4781944181.0, + "step": 9354 + }, + { + "epoch": 2.5297458085451594, + "grad_norm": 1.079758882522583, + "learning_rate": 1.1265599863805761e-05, + "loss": 1.8528, + "mean_token_accuracy": 0.5739299654960632, + "num_tokens": 4782468368.0, + "step": 9355 + }, + { + "epoch": 2.530016224986479, + "grad_norm": 1.245988368988037, + "learning_rate": 1.1264024057170175e-05, + "loss": 1.9079, + "mean_token_accuracy": 0.5592389106750488, + "num_tokens": 4782956025.0, + "step": 9356 + }, + { + "epoch": 2.5302866414277987, + "grad_norm": 1.1425224542617798, + "learning_rate": 1.1262448242433557e-05, + "loss": 2.0153, + "mean_token_accuracy": 0.5461361408233643, + "num_tokens": 4783480233.0, + "step": 9357 + }, + { + "epoch": 2.5305570578691183, + "grad_norm": 1.1578444242477417, + "learning_rate": 1.1260872419644255e-05, + "loss": 2.0488, + "mean_token_accuracy": 0.5423935651779175, + "num_tokens": 4784004514.0, + "step": 9358 + }, + { + "epoch": 2.530827474310438, + "grad_norm": 1.3974446058273315, + "learning_rate": 1.1259296588850618e-05, + "loss": 1.9617, + "mean_token_accuracy": 0.5419774055480957, + "num_tokens": 4784528690.0, + "step": 9359 + }, + { + "epoch": 2.5310978907517576, + "grad_norm": 1.5276376008987427, + "learning_rate": 1.1257720750101003e-05, + "loss": 1.8729, + "mean_token_accuracy": 0.5656397342681885, + "num_tokens": 4785017086.0, + "step": 9360 + }, + { + "epoch": 2.5313683071930773, + "grad_norm": 0.5225109457969666, + "learning_rate": 1.1256144903443753e-05, + "loss": 1.199, + "mean_token_accuracy": 0.6785707473754883, + "num_tokens": 4785541233.0, + "step": 9361 + }, + { + "epoch": 2.531638723634397, + "grad_norm": 1.4516061544418335, + "learning_rate": 1.125456904892723e-05, + "loss": 1.8168, + "mean_token_accuracy": 0.557587742805481, + "num_tokens": 4786065312.0, + "step": 9362 + }, + { + "epoch": 2.5319091400757165, + "grad_norm": 1.551321029663086, + "learning_rate": 1.1252993186599774e-05, + "loss": 1.9798, + "mean_token_accuracy": 0.5544981956481934, + "num_tokens": 4786589520.0, + "step": 9363 + }, + { + "epoch": 2.532179556517036, + "grad_norm": 1.2790714502334595, + "learning_rate": 1.1251417316509743e-05, + "loss": 1.9334, + "mean_token_accuracy": 0.5589600205421448, + "num_tokens": 4787113773.0, + "step": 9364 + }, + { + "epoch": 2.532449972958356, + "grad_norm": 1.1444334983825684, + "learning_rate": 1.1249841438705491e-05, + "loss": 1.8943, + "mean_token_accuracy": 0.5572808980941772, + "num_tokens": 4787637942.0, + "step": 9365 + }, + { + "epoch": 2.5327203893996755, + "grad_norm": 1.269066572189331, + "learning_rate": 1.1248265553235373e-05, + "loss": 1.8749, + "mean_token_accuracy": 0.5667800307273865, + "num_tokens": 4788162180.0, + "step": 9366 + }, + { + "epoch": 2.532990805840995, + "grad_norm": 1.2073136568069458, + "learning_rate": 1.1246689660147731e-05, + "loss": 1.8667, + "mean_token_accuracy": 0.5775936841964722, + "num_tokens": 4788686416.0, + "step": 9367 + }, + { + "epoch": 2.5332612222823148, + "grad_norm": 1.35536527633667, + "learning_rate": 1.1245113759490929e-05, + "loss": 1.8512, + "mean_token_accuracy": 0.5784777402877808, + "num_tokens": 4789210514.0, + "step": 9368 + }, + { + "epoch": 2.5335316387236344, + "grad_norm": 1.6493687629699707, + "learning_rate": 1.1243537851313313e-05, + "loss": 1.9143, + "mean_token_accuracy": 0.5726339817047119, + "num_tokens": 4789734771.0, + "step": 9369 + }, + { + "epoch": 2.533802055164954, + "grad_norm": 1.325732946395874, + "learning_rate": 1.1241961935663246e-05, + "loss": 2.0167, + "mean_token_accuracy": 0.5433209538459778, + "num_tokens": 4790258835.0, + "step": 9370 + }, + { + "epoch": 2.5340724716062737, + "grad_norm": 1.2236573696136475, + "learning_rate": 1.1240386012589066e-05, + "loss": 1.9655, + "mean_token_accuracy": 0.5527465343475342, + "num_tokens": 4790782939.0, + "step": 9371 + }, + { + "epoch": 2.5343428880475933, + "grad_norm": 1.0885374546051025, + "learning_rate": 1.123881008213914e-05, + "loss": 1.822, + "mean_token_accuracy": 0.5812663435935974, + "num_tokens": 4791307074.0, + "step": 9372 + }, + { + "epoch": 2.534613304488913, + "grad_norm": 1.0366181135177612, + "learning_rate": 1.1237234144361822e-05, + "loss": 1.7354, + "mean_token_accuracy": 0.6018503904342651, + "num_tokens": 4791831204.0, + "step": 9373 + }, + { + "epoch": 2.5348837209302326, + "grad_norm": 1.0464061498641968, + "learning_rate": 1.123565819930546e-05, + "loss": 1.8441, + "mean_token_accuracy": 0.5659355521202087, + "num_tokens": 4792355322.0, + "step": 9374 + }, + { + "epoch": 2.5351541373715523, + "grad_norm": 1.0621819496154785, + "learning_rate": 1.1234082247018411e-05, + "loss": 1.9173, + "mean_token_accuracy": 0.5612657070159912, + "num_tokens": 4792879558.0, + "step": 9375 + }, + { + "epoch": 2.535424553812872, + "grad_norm": 1.3975214958190918, + "learning_rate": 1.1232506287549032e-05, + "loss": 1.8404, + "mean_token_accuracy": 0.5518611669540405, + "num_tokens": 4793403783.0, + "step": 9376 + }, + { + "epoch": 2.5356949702541915, + "grad_norm": 1.5673465728759766, + "learning_rate": 1.1230930320945674e-05, + "loss": 1.9782, + "mean_token_accuracy": 0.547511875629425, + "num_tokens": 4793891181.0, + "step": 9377 + }, + { + "epoch": 2.535965386695511, + "grad_norm": 1.2530983686447144, + "learning_rate": 1.1229354347256695e-05, + "loss": 1.8621, + "mean_token_accuracy": 0.5708907842636108, + "num_tokens": 4794415320.0, + "step": 9378 + }, + { + "epoch": 2.536235803136831, + "grad_norm": 1.3110846281051636, + "learning_rate": 1.1227778366530451e-05, + "loss": 1.9306, + "mean_token_accuracy": 0.5541706681251526, + "num_tokens": 4794939506.0, + "step": 9379 + }, + { + "epoch": 2.5365062195781505, + "grad_norm": 1.266116738319397, + "learning_rate": 1.1226202378815296e-05, + "loss": 1.8905, + "mean_token_accuracy": 0.5537763833999634, + "num_tokens": 4795463658.0, + "step": 9380 + }, + { + "epoch": 2.53677663601947, + "grad_norm": 0.5230453610420227, + "learning_rate": 1.122462638415959e-05, + "loss": 1.0825, + "mean_token_accuracy": 0.7257003784179688, + "num_tokens": 4795890417.0, + "step": 9381 + }, + { + "epoch": 2.5370470524607898, + "grad_norm": 2.0737781524658203, + "learning_rate": 1.1223050382611681e-05, + "loss": 1.9796, + "mean_token_accuracy": 0.564826488494873, + "num_tokens": 4796379189.0, + "step": 9382 + }, + { + "epoch": 2.5373174689021094, + "grad_norm": 1.8793307542800903, + "learning_rate": 1.1221474374219933e-05, + "loss": 1.9161, + "mean_token_accuracy": 0.5638788342475891, + "num_tokens": 4796867383.0, + "step": 9383 + }, + { + "epoch": 2.537587885343429, + "grad_norm": 1.1739704608917236, + "learning_rate": 1.1219898359032701e-05, + "loss": 1.9525, + "mean_token_accuracy": 0.5613600611686707, + "num_tokens": 4797391582.0, + "step": 9384 + }, + { + "epoch": 2.5378583017847482, + "grad_norm": 1.376418113708496, + "learning_rate": 1.121832233709834e-05, + "loss": 1.8893, + "mean_token_accuracy": 0.5764518976211548, + "num_tokens": 4797915851.0, + "step": 9385 + }, + { + "epoch": 2.5381287182260683, + "grad_norm": 1.7108116149902344, + "learning_rate": 1.1216746308465204e-05, + "loss": 1.9825, + "mean_token_accuracy": 0.5556536912918091, + "num_tokens": 4798440088.0, + "step": 9386 + }, + { + "epoch": 2.5383991346673875, + "grad_norm": 1.438403844833374, + "learning_rate": 1.1215170273181656e-05, + "loss": 1.7805, + "mean_token_accuracy": 0.5728572607040405, + "num_tokens": 4798964239.0, + "step": 9387 + }, + { + "epoch": 2.5386695511087076, + "grad_norm": 1.3749196529388428, + "learning_rate": 1.1213594231296052e-05, + "loss": 1.6528, + "mean_token_accuracy": 0.6163749694824219, + "num_tokens": 4799488494.0, + "step": 9388 + }, + { + "epoch": 2.538939967550027, + "grad_norm": 1.4013793468475342, + "learning_rate": 1.1212018182856752e-05, + "loss": 1.8902, + "mean_token_accuracy": 0.5572155714035034, + "num_tokens": 4800012636.0, + "step": 9389 + }, + { + "epoch": 2.539210383991347, + "grad_norm": 1.0196200609207153, + "learning_rate": 1.1210442127912107e-05, + "loss": 1.7827, + "mean_token_accuracy": 0.5839642286300659, + "num_tokens": 4800536655.0, + "step": 9390 + }, + { + "epoch": 2.539480800432666, + "grad_norm": 3.1555469036102295, + "learning_rate": 1.1208866066510478e-05, + "loss": 1.6003, + "mean_token_accuracy": 0.6152523756027222, + "num_tokens": 4801050984.0, + "step": 9391 + }, + { + "epoch": 2.539751216873986, + "grad_norm": 1.8057631254196167, + "learning_rate": 1.1207289998700226e-05, + "loss": 1.9963, + "mean_token_accuracy": 0.5472688674926758, + "num_tokens": 4801575108.0, + "step": 9392 + }, + { + "epoch": 2.5400216333153054, + "grad_norm": 1.6050156354904175, + "learning_rate": 1.1205713924529705e-05, + "loss": 1.8693, + "mean_token_accuracy": 0.5684012174606323, + "num_tokens": 4802058811.0, + "step": 9393 + }, + { + "epoch": 2.540292049756625, + "grad_norm": 1.0988696813583374, + "learning_rate": 1.1204137844047278e-05, + "loss": 1.7534, + "mean_token_accuracy": 0.6126935482025146, + "num_tokens": 4802583001.0, + "step": 9394 + }, + { + "epoch": 2.5405624661979447, + "grad_norm": 1.179537296295166, + "learning_rate": 1.12025617573013e-05, + "loss": 1.8432, + "mean_token_accuracy": 0.564923107624054, + "num_tokens": 4803107172.0, + "step": 9395 + }, + { + "epoch": 2.5408328826392643, + "grad_norm": 1.3710148334503174, + "learning_rate": 1.1200985664340133e-05, + "loss": 1.8025, + "mean_token_accuracy": 0.5703558921813965, + "num_tokens": 4803631363.0, + "step": 9396 + }, + { + "epoch": 2.541103299080584, + "grad_norm": 1.154530644416809, + "learning_rate": 1.1199409565212136e-05, + "loss": 1.7779, + "mean_token_accuracy": 0.5722556114196777, + "num_tokens": 4804096431.0, + "step": 9397 + }, + { + "epoch": 2.5413737155219036, + "grad_norm": 1.4417991638183594, + "learning_rate": 1.1197833459965662e-05, + "loss": 1.8379, + "mean_token_accuracy": 0.576708197593689, + "num_tokens": 4804620494.0, + "step": 9398 + }, + { + "epoch": 2.5416441319632233, + "grad_norm": 1.2789686918258667, + "learning_rate": 1.119625734864908e-05, + "loss": 1.7744, + "mean_token_accuracy": 0.5844234228134155, + "num_tokens": 4805144706.0, + "step": 9399 + }, + { + "epoch": 2.541914548404543, + "grad_norm": 1.0447149276733398, + "learning_rate": 1.1194681231310742e-05, + "loss": 1.8693, + "mean_token_accuracy": 0.5735296607017517, + "num_tokens": 4805596317.0, + "step": 9400 + }, + { + "epoch": 2.5421849648458625, + "grad_norm": 0.5269099473953247, + "learning_rate": 1.1193105107999017e-05, + "loss": 0.9467, + "mean_token_accuracy": 0.7441345453262329, + "num_tokens": 4806075929.0, + "step": 9401 + }, + { + "epoch": 2.542455381287182, + "grad_norm": 1.7862199544906616, + "learning_rate": 1.1191528978762254e-05, + "loss": 1.8511, + "mean_token_accuracy": 0.5803454518318176, + "num_tokens": 4806600068.0, + "step": 9402 + }, + { + "epoch": 2.542725797728502, + "grad_norm": 1.5298978090286255, + "learning_rate": 1.1189952843648822e-05, + "loss": 1.8385, + "mean_token_accuracy": 0.5596314668655396, + "num_tokens": 4807124137.0, + "step": 9403 + }, + { + "epoch": 2.5429962141698215, + "grad_norm": 1.0642024278640747, + "learning_rate": 1.1188376702707075e-05, + "loss": 1.8961, + "mean_token_accuracy": 0.560804545879364, + "num_tokens": 4807648187.0, + "step": 9404 + }, + { + "epoch": 2.543266630611141, + "grad_norm": 1.4759085178375244, + "learning_rate": 1.118680055598538e-05, + "loss": 1.8932, + "mean_token_accuracy": 0.5725423097610474, + "num_tokens": 4808172360.0, + "step": 9405 + }, + { + "epoch": 2.5435370470524608, + "grad_norm": 1.3442304134368896, + "learning_rate": 1.1185224403532093e-05, + "loss": 1.9523, + "mean_token_accuracy": 0.5585691928863525, + "num_tokens": 4808696641.0, + "step": 9406 + }, + { + "epoch": 2.5438074634937804, + "grad_norm": 1.0462710857391357, + "learning_rate": 1.1183648245395576e-05, + "loss": 1.7594, + "mean_token_accuracy": 0.5861021280288696, + "num_tokens": 4809220819.0, + "step": 9407 + }, + { + "epoch": 2.5440778799351, + "grad_norm": 1.3686860799789429, + "learning_rate": 1.1182072081624189e-05, + "loss": 1.9201, + "mean_token_accuracy": 0.5666335225105286, + "num_tokens": 4809732441.0, + "step": 9408 + }, + { + "epoch": 2.5443482963764197, + "grad_norm": 1.134329915046692, + "learning_rate": 1.11804959122663e-05, + "loss": 1.8395, + "mean_token_accuracy": 0.5754239559173584, + "num_tokens": 4810223947.0, + "step": 9409 + }, + { + "epoch": 2.5446187128177393, + "grad_norm": 0.9659603834152222, + "learning_rate": 1.1178919737370262e-05, + "loss": 1.7816, + "mean_token_accuracy": 0.5948247909545898, + "num_tokens": 4810748089.0, + "step": 9410 + }, + { + "epoch": 2.544889129259059, + "grad_norm": 1.3079107999801636, + "learning_rate": 1.1177343556984444e-05, + "loss": 1.9276, + "mean_token_accuracy": 0.5714402198791504, + "num_tokens": 4811272301.0, + "step": 9411 + }, + { + "epoch": 2.5451595457003786, + "grad_norm": 1.131921410560608, + "learning_rate": 1.1175767371157202e-05, + "loss": 1.7551, + "mean_token_accuracy": 0.5899238586425781, + "num_tokens": 4811796519.0, + "step": 9412 + }, + { + "epoch": 2.5454299621416983, + "grad_norm": 1.1514298915863037, + "learning_rate": 1.1174191179936899e-05, + "loss": 1.9261, + "mean_token_accuracy": 0.5725592374801636, + "num_tokens": 4812320710.0, + "step": 9413 + }, + { + "epoch": 2.545700378583018, + "grad_norm": 0.9624239206314087, + "learning_rate": 1.1172614983371896e-05, + "loss": 1.8105, + "mean_token_accuracy": 0.5684043765068054, + "num_tokens": 4812844930.0, + "step": 9414 + }, + { + "epoch": 2.5459707950243375, + "grad_norm": 0.915841281414032, + "learning_rate": 1.1171038781510562e-05, + "loss": 1.9275, + "mean_token_accuracy": 0.5766779184341431, + "num_tokens": 4813335362.0, + "step": 9415 + }, + { + "epoch": 2.546241211465657, + "grad_norm": 1.1479226350784302, + "learning_rate": 1.1169462574401256e-05, + "loss": 1.8643, + "mean_token_accuracy": 0.5961430668830872, + "num_tokens": 4813760717.0, + "step": 9416 + }, + { + "epoch": 2.546511627906977, + "grad_norm": 1.1208577156066895, + "learning_rate": 1.1167886362092336e-05, + "loss": 1.8818, + "mean_token_accuracy": 0.5653311014175415, + "num_tokens": 4814284970.0, + "step": 9417 + }, + { + "epoch": 2.5467820443482965, + "grad_norm": 1.0751473903656006, + "learning_rate": 1.1166310144632171e-05, + "loss": 1.8187, + "mean_token_accuracy": 0.5769935846328735, + "num_tokens": 4814809076.0, + "step": 9418 + }, + { + "epoch": 2.547052460789616, + "grad_norm": 1.311809778213501, + "learning_rate": 1.1164733922069121e-05, + "loss": 1.9479, + "mean_token_accuracy": 0.5492810010910034, + "num_tokens": 4815333259.0, + "step": 9419 + }, + { + "epoch": 2.5473228772309358, + "grad_norm": 1.0184441804885864, + "learning_rate": 1.1163157694451552e-05, + "loss": 2.0359, + "mean_token_accuracy": 0.5391759872436523, + "num_tokens": 4815856316.0, + "step": 9420 + }, + { + "epoch": 2.5475932936722554, + "grad_norm": 0.460260808467865, + "learning_rate": 1.116158146182782e-05, + "loss": 1.1056, + "mean_token_accuracy": 0.695534348487854, + "num_tokens": 4816380525.0, + "step": 9421 + }, + { + "epoch": 2.547863710113575, + "grad_norm": 1.5138436555862427, + "learning_rate": 1.1160005224246298e-05, + "loss": 1.9673, + "mean_token_accuracy": 0.5532714128494263, + "num_tokens": 4816855341.0, + "step": 9422 + }, + { + "epoch": 2.5481341265548947, + "grad_norm": 1.3783732652664185, + "learning_rate": 1.1158428981755343e-05, + "loss": 1.8539, + "mean_token_accuracy": 0.5781550407409668, + "num_tokens": 4817379551.0, + "step": 9423 + }, + { + "epoch": 2.5484045429962143, + "grad_norm": 1.22885000705719, + "learning_rate": 1.115685273440332e-05, + "loss": 1.9943, + "mean_token_accuracy": 0.5536150932312012, + "num_tokens": 4817903792.0, + "step": 9424 + }, + { + "epoch": 2.548674959437534, + "grad_norm": 1.3100086450576782, + "learning_rate": 1.1155276482238599e-05, + "loss": 1.8725, + "mean_token_accuracy": 0.5669701099395752, + "num_tokens": 4818427973.0, + "step": 9425 + }, + { + "epoch": 2.548945375878853, + "grad_norm": 1.3319507837295532, + "learning_rate": 1.1153700225309535e-05, + "loss": 2.0154, + "mean_token_accuracy": 0.5402964353561401, + "num_tokens": 4818907897.0, + "step": 9426 + }, + { + "epoch": 2.5492157923201733, + "grad_norm": 1.4260122776031494, + "learning_rate": 1.1152123963664496e-05, + "loss": 1.9127, + "mean_token_accuracy": 0.567794680595398, + "num_tokens": 4819432065.0, + "step": 9427 + }, + { + "epoch": 2.5494862087614925, + "grad_norm": 1.4255375862121582, + "learning_rate": 1.1150547697351847e-05, + "loss": 1.8917, + "mean_token_accuracy": 0.5529676675796509, + "num_tokens": 4819907965.0, + "step": 9428 + }, + { + "epoch": 2.5497566252028125, + "grad_norm": 1.205259919166565, + "learning_rate": 1.1148971426419952e-05, + "loss": 1.841, + "mean_token_accuracy": 0.5782836675643921, + "num_tokens": 4820370471.0, + "step": 9429 + }, + { + "epoch": 2.5500270416441317, + "grad_norm": 1.2839934825897217, + "learning_rate": 1.1147395150917176e-05, + "loss": 1.9171, + "mean_token_accuracy": 0.5614674687385559, + "num_tokens": 4820881123.0, + "step": 9430 + }, + { + "epoch": 2.550297458085452, + "grad_norm": 1.3223727941513062, + "learning_rate": 1.1145818870891882e-05, + "loss": 1.8351, + "mean_token_accuracy": 0.5922521948814392, + "num_tokens": 4821368087.0, + "step": 9431 + }, + { + "epoch": 2.550567874526771, + "grad_norm": 1.2751661539077759, + "learning_rate": 1.114424258639244e-05, + "loss": 1.9499, + "mean_token_accuracy": 0.5657689571380615, + "num_tokens": 4821839488.0, + "step": 9432 + }, + { + "epoch": 2.550838290968091, + "grad_norm": 1.3688561916351318, + "learning_rate": 1.1142666297467207e-05, + "loss": 1.8611, + "mean_token_accuracy": 0.5708407759666443, + "num_tokens": 4822363653.0, + "step": 9433 + }, + { + "epoch": 2.5511087074094103, + "grad_norm": 1.2881702184677124, + "learning_rate": 1.1141090004164553e-05, + "loss": 1.8754, + "mean_token_accuracy": 0.5491961240768433, + "num_tokens": 4822835201.0, + "step": 9434 + }, + { + "epoch": 2.5513791238507304, + "grad_norm": 1.0183169841766357, + "learning_rate": 1.1139513706532842e-05, + "loss": 1.8122, + "mean_token_accuracy": 0.5600342750549316, + "num_tokens": 4823359450.0, + "step": 9435 + }, + { + "epoch": 2.5516495402920496, + "grad_norm": 1.2381095886230469, + "learning_rate": 1.1137937404620443e-05, + "loss": 1.9357, + "mean_token_accuracy": 0.5597712397575378, + "num_tokens": 4823822914.0, + "step": 9436 + }, + { + "epoch": 2.5519199567333692, + "grad_norm": 1.219973087310791, + "learning_rate": 1.1136361098475716e-05, + "loss": 1.8296, + "mean_token_accuracy": 0.5783982276916504, + "num_tokens": 4824340651.0, + "step": 9437 + }, + { + "epoch": 2.552190373174689, + "grad_norm": 1.340699315071106, + "learning_rate": 1.113478478814703e-05, + "loss": 1.9445, + "mean_token_accuracy": 0.5660825967788696, + "num_tokens": 4824864857.0, + "step": 9438 + }, + { + "epoch": 2.5524607896160085, + "grad_norm": 1.5698124170303345, + "learning_rate": 1.1133208473682754e-05, + "loss": 1.8266, + "mean_token_accuracy": 0.5897551774978638, + "num_tokens": 4825342327.0, + "step": 9439 + }, + { + "epoch": 2.552731206057328, + "grad_norm": 1.2822812795639038, + "learning_rate": 1.1131632155131246e-05, + "loss": 1.7733, + "mean_token_accuracy": 0.5906210541725159, + "num_tokens": 4825866595.0, + "step": 9440 + }, + { + "epoch": 2.553001622498648, + "grad_norm": 0.5161066055297852, + "learning_rate": 1.1130055832540878e-05, + "loss": 1.1687, + "mean_token_accuracy": 0.6959017515182495, + "num_tokens": 4826327294.0, + "step": 9441 + }, + { + "epoch": 2.5532720389399675, + "grad_norm": 1.4403578042984009, + "learning_rate": 1.1128479505960014e-05, + "loss": 1.9211, + "mean_token_accuracy": 0.5639097690582275, + "num_tokens": 4826792624.0, + "step": 9442 + }, + { + "epoch": 2.553542455381287, + "grad_norm": 1.297412633895874, + "learning_rate": 1.1126903175437022e-05, + "loss": 1.8848, + "mean_token_accuracy": 0.5748111605644226, + "num_tokens": 4827316841.0, + "step": 9443 + }, + { + "epoch": 2.5538128718226067, + "grad_norm": 2.342240810394287, + "learning_rate": 1.1125326841020265e-05, + "loss": 1.6385, + "mean_token_accuracy": 0.6238491535186768, + "num_tokens": 4827841049.0, + "step": 9444 + }, + { + "epoch": 2.5540832882639264, + "grad_norm": 1.088405728340149, + "learning_rate": 1.1123750502758115e-05, + "loss": 1.8112, + "mean_token_accuracy": 0.5677058696746826, + "num_tokens": 4828365184.0, + "step": 9445 + }, + { + "epoch": 2.554353704705246, + "grad_norm": 1.3258105516433716, + "learning_rate": 1.1122174160698935e-05, + "loss": 1.9312, + "mean_token_accuracy": 0.5681054592132568, + "num_tokens": 4828889412.0, + "step": 9446 + }, + { + "epoch": 2.5546241211465657, + "grad_norm": 1.8065521717071533, + "learning_rate": 1.1120597814891093e-05, + "loss": 1.6736, + "mean_token_accuracy": 0.6211323142051697, + "num_tokens": 4829357695.0, + "step": 9447 + }, + { + "epoch": 2.5548945375878853, + "grad_norm": 1.2902426719665527, + "learning_rate": 1.1119021465382955e-05, + "loss": 1.8478, + "mean_token_accuracy": 0.569572925567627, + "num_tokens": 4829854751.0, + "step": 9448 + }, + { + "epoch": 2.555164954029205, + "grad_norm": 1.3575607538223267, + "learning_rate": 1.1117445112222886e-05, + "loss": 1.9008, + "mean_token_accuracy": 0.5684571266174316, + "num_tokens": 4830378966.0, + "step": 9449 + }, + { + "epoch": 2.5554353704705246, + "grad_norm": 1.2818782329559326, + "learning_rate": 1.1115868755459255e-05, + "loss": 1.9392, + "mean_token_accuracy": 0.5647578239440918, + "num_tokens": 4830850386.0, + "step": 9450 + }, + { + "epoch": 2.5557057869118442, + "grad_norm": 1.5908035039901733, + "learning_rate": 1.1114292395140434e-05, + "loss": 1.8863, + "mean_token_accuracy": 0.5660989284515381, + "num_tokens": 4831374617.0, + "step": 9451 + }, + { + "epoch": 2.555976203353164, + "grad_norm": 1.5782318115234375, + "learning_rate": 1.1112716031314786e-05, + "loss": 1.8781, + "mean_token_accuracy": 0.5785306692123413, + "num_tokens": 4831855256.0, + "step": 9452 + }, + { + "epoch": 2.5562466197944835, + "grad_norm": 1.3245080709457397, + "learning_rate": 1.1111139664030675e-05, + "loss": 1.8538, + "mean_token_accuracy": 0.5559982657432556, + "num_tokens": 4832379495.0, + "step": 9453 + }, + { + "epoch": 2.556517036235803, + "grad_norm": 1.252299189567566, + "learning_rate": 1.1109563293336476e-05, + "loss": 1.8394, + "mean_token_accuracy": 0.5840861797332764, + "num_tokens": 4832867545.0, + "step": 9454 + }, + { + "epoch": 2.556787452677123, + "grad_norm": 1.443118691444397, + "learning_rate": 1.110798691928055e-05, + "loss": 1.8391, + "mean_token_accuracy": 0.5833332538604736, + "num_tokens": 4833391767.0, + "step": 9455 + }, + { + "epoch": 2.5570578691184425, + "grad_norm": 1.3928982019424438, + "learning_rate": 1.1106410541911268e-05, + "loss": 1.7812, + "mean_token_accuracy": 0.5990538001060486, + "num_tokens": 4833915864.0, + "step": 9456 + }, + { + "epoch": 2.557328285559762, + "grad_norm": 1.172243356704712, + "learning_rate": 1.1104834161276998e-05, + "loss": 1.9248, + "mean_token_accuracy": 0.5613232254981995, + "num_tokens": 4834439946.0, + "step": 9457 + }, + { + "epoch": 2.5575987020010817, + "grad_norm": 1.7960059642791748, + "learning_rate": 1.1103257777426109e-05, + "loss": 1.9593, + "mean_token_accuracy": 0.5710229873657227, + "num_tokens": 4834953708.0, + "step": 9458 + }, + { + "epoch": 2.5578691184424014, + "grad_norm": 1.6546990871429443, + "learning_rate": 1.1101681390406966e-05, + "loss": 1.9375, + "mean_token_accuracy": 0.5651788711547852, + "num_tokens": 4835477930.0, + "step": 9459 + }, + { + "epoch": 2.558139534883721, + "grad_norm": 1.0159765481948853, + "learning_rate": 1.1100105000267943e-05, + "loss": 1.7614, + "mean_token_accuracy": 0.605543851852417, + "num_tokens": 4836002160.0, + "step": 9460 + }, + { + "epoch": 2.5584099513250407, + "grad_norm": 0.5170329809188843, + "learning_rate": 1.1098528607057397e-05, + "loss": 1.1294, + "mean_token_accuracy": 0.6916718482971191, + "num_tokens": 4836526262.0, + "step": 9461 + }, + { + "epoch": 2.5586803677663603, + "grad_norm": 1.9377869367599487, + "learning_rate": 1.109695221082371e-05, + "loss": 1.8193, + "mean_token_accuracy": 0.5870464444160461, + "num_tokens": 4837003964.0, + "step": 9462 + }, + { + "epoch": 2.55895078420768, + "grad_norm": 1.6711033582687378, + "learning_rate": 1.109537581161524e-05, + "loss": 1.9429, + "mean_token_accuracy": 0.5692075490951538, + "num_tokens": 4837476804.0, + "step": 9463 + }, + { + "epoch": 2.5592212006489996, + "grad_norm": 1.1730172634124756, + "learning_rate": 1.1093799409480361e-05, + "loss": 1.9111, + "mean_token_accuracy": 0.5637384057044983, + "num_tokens": 4838001066.0, + "step": 9464 + }, + { + "epoch": 2.5594916170903192, + "grad_norm": 1.1669985055923462, + "learning_rate": 1.1092223004467441e-05, + "loss": 1.7236, + "mean_token_accuracy": 0.5953903198242188, + "num_tokens": 4838525232.0, + "step": 9465 + }, + { + "epoch": 2.559762033531639, + "grad_norm": 1.3470702171325684, + "learning_rate": 1.109064659662485e-05, + "loss": 1.9029, + "mean_token_accuracy": 0.5722631216049194, + "num_tokens": 4839049411.0, + "step": 9466 + }, + { + "epoch": 2.560032449972958, + "grad_norm": 1.104509949684143, + "learning_rate": 1.1089070186000952e-05, + "loss": 1.8513, + "mean_token_accuracy": 0.5849339962005615, + "num_tokens": 4839573688.0, + "step": 9467 + }, + { + "epoch": 2.560302866414278, + "grad_norm": 1.28592050075531, + "learning_rate": 1.108749377264412e-05, + "loss": 1.8837, + "mean_token_accuracy": 0.5631188154220581, + "num_tokens": 4840097972.0, + "step": 9468 + }, + { + "epoch": 2.5605732828555974, + "grad_norm": 1.2276806831359863, + "learning_rate": 1.1085917356602722e-05, + "loss": 1.7915, + "mean_token_accuracy": 0.5871102809906006, + "num_tokens": 4840621965.0, + "step": 9469 + }, + { + "epoch": 2.5608436992969175, + "grad_norm": 1.3049973249435425, + "learning_rate": 1.1084340937925125e-05, + "loss": 1.8508, + "mean_token_accuracy": 0.5801194906234741, + "num_tokens": 4841136167.0, + "step": 9470 + }, + { + "epoch": 2.5611141157382367, + "grad_norm": 1.2307653427124023, + "learning_rate": 1.1082764516659701e-05, + "loss": 1.926, + "mean_token_accuracy": 0.5597574710845947, + "num_tokens": 4841660433.0, + "step": 9471 + }, + { + "epoch": 2.5613845321795568, + "grad_norm": 1.0754274129867554, + "learning_rate": 1.1081188092854815e-05, + "loss": 1.8251, + "mean_token_accuracy": 0.5770050287246704, + "num_tokens": 4842184640.0, + "step": 9472 + }, + { + "epoch": 2.561654948620876, + "grad_norm": 1.2622233629226685, + "learning_rate": 1.1079611666558846e-05, + "loss": 1.8914, + "mean_token_accuracy": 0.5796448588371277, + "num_tokens": 4842692966.0, + "step": 9473 + }, + { + "epoch": 2.561925365062196, + "grad_norm": 1.1246068477630615, + "learning_rate": 1.107803523782015e-05, + "loss": 1.9122, + "mean_token_accuracy": 0.5569689273834229, + "num_tokens": 4843217149.0, + "step": 9474 + }, + { + "epoch": 2.5621957815035152, + "grad_norm": 1.2623695135116577, + "learning_rate": 1.107645880668711e-05, + "loss": 1.9144, + "mean_token_accuracy": 0.5683449506759644, + "num_tokens": 4843741387.0, + "step": 9475 + }, + { + "epoch": 2.5624661979448353, + "grad_norm": 1.2541643381118774, + "learning_rate": 1.1074882373208084e-05, + "loss": 1.9335, + "mean_token_accuracy": 0.5529288649559021, + "num_tokens": 4844265530.0, + "step": 9476 + }, + { + "epoch": 2.5627366143861545, + "grad_norm": 1.239681601524353, + "learning_rate": 1.1073305937431449e-05, + "loss": 1.8848, + "mean_token_accuracy": 0.56669020652771, + "num_tokens": 4844789747.0, + "step": 9477 + }, + { + "epoch": 2.563007030827474, + "grad_norm": 1.30458664894104, + "learning_rate": 1.1071729499405573e-05, + "loss": 1.8429, + "mean_token_accuracy": 0.5615501999855042, + "num_tokens": 4845313911.0, + "step": 9478 + }, + { + "epoch": 2.563277447268794, + "grad_norm": 1.6167508363723755, + "learning_rate": 1.107015305917882e-05, + "loss": 1.9405, + "mean_token_accuracy": 0.5625450611114502, + "num_tokens": 4845832419.0, + "step": 9479 + }, + { + "epoch": 2.5635478637101135, + "grad_norm": 1.0899732112884521, + "learning_rate": 1.1068576616799569e-05, + "loss": 1.9029, + "mean_token_accuracy": 0.5646514892578125, + "num_tokens": 4846356615.0, + "step": 9480 + }, + { + "epoch": 2.563818280151433, + "grad_norm": 0.46186700463294983, + "learning_rate": 1.1067000172316188e-05, + "loss": 1.0504, + "mean_token_accuracy": 0.7222763895988464, + "num_tokens": 4846857999.0, + "step": 9481 + }, + { + "epoch": 2.5640886965927527, + "grad_norm": 2.2334063053131104, + "learning_rate": 1.106542372577704e-05, + "loss": 1.9362, + "mean_token_accuracy": 0.5671830177307129, + "num_tokens": 4847382097.0, + "step": 9482 + }, + { + "epoch": 2.5643591130340724, + "grad_norm": 1.5705749988555908, + "learning_rate": 1.1063847277230502e-05, + "loss": 1.8686, + "mean_token_accuracy": 0.5586751699447632, + "num_tokens": 4847906321.0, + "step": 9483 + }, + { + "epoch": 2.564629529475392, + "grad_norm": 1.232542872428894, + "learning_rate": 1.1062270826724941e-05, + "loss": 1.8834, + "mean_token_accuracy": 0.5687615275382996, + "num_tokens": 4848430461.0, + "step": 9484 + }, + { + "epoch": 2.5648999459167117, + "grad_norm": 1.5317842960357666, + "learning_rate": 1.1060694374308729e-05, + "loss": 1.8007, + "mean_token_accuracy": 0.616743803024292, + "num_tokens": 4848866490.0, + "step": 9485 + }, + { + "epoch": 2.5651703623580313, + "grad_norm": 1.6399965286254883, + "learning_rate": 1.1059117920030233e-05, + "loss": 1.8776, + "mean_token_accuracy": 0.5772817134857178, + "num_tokens": 4849390579.0, + "step": 9486 + }, + { + "epoch": 2.565440778799351, + "grad_norm": 1.3665894269943237, + "learning_rate": 1.1057541463937827e-05, + "loss": 1.8596, + "mean_token_accuracy": 0.5785936117172241, + "num_tokens": 4849914779.0, + "step": 9487 + }, + { + "epoch": 2.5657111952406706, + "grad_norm": 1.4354523420333862, + "learning_rate": 1.105596500607988e-05, + "loss": 1.8738, + "mean_token_accuracy": 0.5778459310531616, + "num_tokens": 4850438883.0, + "step": 9488 + }, + { + "epoch": 2.5659816116819902, + "grad_norm": 1.5133063793182373, + "learning_rate": 1.1054388546504764e-05, + "loss": 1.8314, + "mean_token_accuracy": 0.5732758641242981, + "num_tokens": 4850963055.0, + "step": 9489 + }, + { + "epoch": 2.56625202812331, + "grad_norm": 1.2831494808197021, + "learning_rate": 1.1052812085260844e-05, + "loss": 1.9126, + "mean_token_accuracy": 0.5475404858589172, + "num_tokens": 4851487241.0, + "step": 9490 + }, + { + "epoch": 2.5665224445646295, + "grad_norm": 1.1023688316345215, + "learning_rate": 1.1051235622396495e-05, + "loss": 1.8534, + "mean_token_accuracy": 0.5741571187973022, + "num_tokens": 4851994429.0, + "step": 9491 + }, + { + "epoch": 2.566792861005949, + "grad_norm": 1.3166660070419312, + "learning_rate": 1.1049659157960084e-05, + "loss": 1.8946, + "mean_token_accuracy": 0.5684624910354614, + "num_tokens": 4852518617.0, + "step": 9492 + }, + { + "epoch": 2.567063277447269, + "grad_norm": 1.0685092210769653, + "learning_rate": 1.1048082691999988e-05, + "loss": 1.9951, + "mean_token_accuracy": 0.5374870300292969, + "num_tokens": 4853042843.0, + "step": 9493 + }, + { + "epoch": 2.5673336938885885, + "grad_norm": 1.1147332191467285, + "learning_rate": 1.1046506224564573e-05, + "loss": 1.9253, + "mean_token_accuracy": 0.5697062015533447, + "num_tokens": 4853530942.0, + "step": 9494 + }, + { + "epoch": 2.567604110329908, + "grad_norm": 1.1478089094161987, + "learning_rate": 1.1044929755702207e-05, + "loss": 1.7764, + "mean_token_accuracy": 0.5991394519805908, + "num_tokens": 4854055098.0, + "step": 9495 + }, + { + "epoch": 2.5678745267712277, + "grad_norm": 1.3626028299331665, + "learning_rate": 1.1043353285461267e-05, + "loss": 1.8465, + "mean_token_accuracy": 0.5774381160736084, + "num_tokens": 4854521028.0, + "step": 9496 + }, + { + "epoch": 2.5681449432125474, + "grad_norm": 1.2339481115341187, + "learning_rate": 1.104177681389012e-05, + "loss": 1.7998, + "mean_token_accuracy": 0.5824517011642456, + "num_tokens": 4855045156.0, + "step": 9497 + }, + { + "epoch": 2.568415359653867, + "grad_norm": 1.1663395166397095, + "learning_rate": 1.104020034103714e-05, + "loss": 1.7705, + "mean_token_accuracy": 0.5780588388442993, + "num_tokens": 4855569255.0, + "step": 9498 + }, + { + "epoch": 2.5686857760951867, + "grad_norm": 1.2892463207244873, + "learning_rate": 1.1038623866950689e-05, + "loss": 1.9133, + "mean_token_accuracy": 0.5645265579223633, + "num_tokens": 4856062406.0, + "step": 9499 + }, + { + "epoch": 2.5689561925365063, + "grad_norm": 1.3861442804336548, + "learning_rate": 1.1037047391679148e-05, + "loss": 1.8911, + "mean_token_accuracy": 0.5698384046554565, + "num_tokens": 4856578977.0, + "step": 9500 + }, + { + "epoch": 2.569226608977826, + "grad_norm": 0.39441725611686707, + "learning_rate": 1.1035470915270885e-05, + "loss": 1.1669, + "mean_token_accuracy": 0.6900880336761475, + "num_tokens": 4857103254.0, + "step": 9501 + }, + { + "epoch": 2.5694970254191456, + "grad_norm": 1.4667118787765503, + "learning_rate": 1.1033894437774269e-05, + "loss": 1.879, + "mean_token_accuracy": 0.5475412607192993, + "num_tokens": 4857627507.0, + "step": 9502 + }, + { + "epoch": 2.5697674418604652, + "grad_norm": 1.5861918926239014, + "learning_rate": 1.1032317959237673e-05, + "loss": 1.9267, + "mean_token_accuracy": 0.5803938508033752, + "num_tokens": 4858151727.0, + "step": 9503 + }, + { + "epoch": 2.570037858301785, + "grad_norm": 0.9822932481765747, + "learning_rate": 1.1030741479709466e-05, + "loss": 1.8797, + "mean_token_accuracy": 0.5873218774795532, + "num_tokens": 4858675884.0, + "step": 9504 + }, + { + "epoch": 2.5703082747431045, + "grad_norm": 1.0011857748031616, + "learning_rate": 1.102916499923802e-05, + "loss": 1.914, + "mean_token_accuracy": 0.5622701644897461, + "num_tokens": 4859195164.0, + "step": 9505 + }, + { + "epoch": 2.570578691184424, + "grad_norm": 1.2596755027770996, + "learning_rate": 1.1027588517871703e-05, + "loss": 1.8419, + "mean_token_accuracy": 0.5769738554954529, + "num_tokens": 4859661769.0, + "step": 9506 + }, + { + "epoch": 2.570849107625744, + "grad_norm": 1.079903483390808, + "learning_rate": 1.1026012035658891e-05, + "loss": 1.719, + "mean_token_accuracy": 0.6002907752990723, + "num_tokens": 4860186046.0, + "step": 9507 + }, + { + "epoch": 2.571119524067063, + "grad_norm": 1.1948713064193726, + "learning_rate": 1.1024435552647956e-05, + "loss": 1.9949, + "mean_token_accuracy": 0.5441259145736694, + "num_tokens": 4860710318.0, + "step": 9508 + }, + { + "epoch": 2.571389940508383, + "grad_norm": 1.1312904357910156, + "learning_rate": 1.1022859068887264e-05, + "loss": 1.8342, + "mean_token_accuracy": 0.5569931864738464, + "num_tokens": 4861234551.0, + "step": 9509 + }, + { + "epoch": 2.5716603569497023, + "grad_norm": 1.32101309299469, + "learning_rate": 1.102128258442519e-05, + "loss": 1.8048, + "mean_token_accuracy": 0.5869539976119995, + "num_tokens": 4861714219.0, + "step": 9510 + }, + { + "epoch": 2.5719307733910224, + "grad_norm": 1.1443300247192383, + "learning_rate": 1.1019706099310104e-05, + "loss": 1.8618, + "mean_token_accuracy": 0.5685408711433411, + "num_tokens": 4862238370.0, + "step": 9511 + }, + { + "epoch": 2.5722011898323416, + "grad_norm": 1.2091937065124512, + "learning_rate": 1.1018129613590375e-05, + "loss": 1.8183, + "mean_token_accuracy": 0.5808605551719666, + "num_tokens": 4862762644.0, + "step": 9512 + }, + { + "epoch": 2.5724716062736617, + "grad_norm": 1.2955777645111084, + "learning_rate": 1.1016553127314378e-05, + "loss": 1.8842, + "mean_token_accuracy": 0.5802055597305298, + "num_tokens": 4863286831.0, + "step": 9513 + }, + { + "epoch": 2.572742022714981, + "grad_norm": 1.1949753761291504, + "learning_rate": 1.1014976640530482e-05, + "loss": 1.8073, + "mean_token_accuracy": 0.588492751121521, + "num_tokens": 4863795022.0, + "step": 9514 + }, + { + "epoch": 2.573012439156301, + "grad_norm": 1.1040654182434082, + "learning_rate": 1.1013400153287056e-05, + "loss": 1.9949, + "mean_token_accuracy": 0.5524915456771851, + "num_tokens": 4864319244.0, + "step": 9515 + }, + { + "epoch": 2.57328285559762, + "grad_norm": 1.243055820465088, + "learning_rate": 1.1011823665632478e-05, + "loss": 1.8758, + "mean_token_accuracy": 0.5573923587799072, + "num_tokens": 4864843349.0, + "step": 9516 + }, + { + "epoch": 2.5735532720389402, + "grad_norm": 1.2010387182235718, + "learning_rate": 1.1010247177615111e-05, + "loss": 1.9078, + "mean_token_accuracy": 0.5696971416473389, + "num_tokens": 4865367518.0, + "step": 9517 + }, + { + "epoch": 2.5738236884802594, + "grad_norm": 1.3087201118469238, + "learning_rate": 1.1008670689283333e-05, + "loss": 1.9577, + "mean_token_accuracy": 0.5700900554656982, + "num_tokens": 4865891792.0, + "step": 9518 + }, + { + "epoch": 2.574094104921579, + "grad_norm": 1.0738369226455688, + "learning_rate": 1.1007094200685512e-05, + "loss": 1.8612, + "mean_token_accuracy": 0.5823275446891785, + "num_tokens": 4866416049.0, + "step": 9519 + }, + { + "epoch": 2.5743645213628987, + "grad_norm": 0.9574242830276489, + "learning_rate": 1.100551771187002e-05, + "loss": 1.7813, + "mean_token_accuracy": 0.572535514831543, + "num_tokens": 4866926605.0, + "step": 9520 + }, + { + "epoch": 2.5746349378042184, + "grad_norm": 0.5991353988647461, + "learning_rate": 1.1003941222885228e-05, + "loss": 1.1632, + "mean_token_accuracy": 0.6939632892608643, + "num_tokens": 4867419594.0, + "step": 9521 + }, + { + "epoch": 2.574905354245538, + "grad_norm": 1.6473928689956665, + "learning_rate": 1.1002364733779509e-05, + "loss": 1.9648, + "mean_token_accuracy": 0.5510095357894897, + "num_tokens": 4867943766.0, + "step": 9522 + }, + { + "epoch": 2.5751757706868577, + "grad_norm": 1.369060754776001, + "learning_rate": 1.1000788244601233e-05, + "loss": 1.8749, + "mean_token_accuracy": 0.5612474679946899, + "num_tokens": 4868412515.0, + "step": 9523 + }, + { + "epoch": 2.5754461871281773, + "grad_norm": 1.1721199750900269, + "learning_rate": 1.0999211755398768e-05, + "loss": 1.8821, + "mean_token_accuracy": 0.5579084157943726, + "num_tokens": 4868891097.0, + "step": 9524 + }, + { + "epoch": 2.575716603569497, + "grad_norm": 1.2548139095306396, + "learning_rate": 1.0997635266220494e-05, + "loss": 1.963, + "mean_token_accuracy": 0.5663962960243225, + "num_tokens": 4869315221.0, + "step": 9525 + }, + { + "epoch": 2.5759870200108166, + "grad_norm": 1.241410732269287, + "learning_rate": 1.0996058777114775e-05, + "loss": 1.8224, + "mean_token_accuracy": 0.5730401873588562, + "num_tokens": 4869839431.0, + "step": 9526 + }, + { + "epoch": 2.5762574364521362, + "grad_norm": 1.1787418127059937, + "learning_rate": 1.0994482288129982e-05, + "loss": 1.9809, + "mean_token_accuracy": 0.5376126766204834, + "num_tokens": 4870363574.0, + "step": 9527 + }, + { + "epoch": 2.576527852893456, + "grad_norm": 1.3442078828811646, + "learning_rate": 1.099290579931449e-05, + "loss": 1.8323, + "mean_token_accuracy": 0.5752492547035217, + "num_tokens": 4870856609.0, + "step": 9528 + }, + { + "epoch": 2.5767982693347755, + "grad_norm": 1.2846001386642456, + "learning_rate": 1.0991329310716671e-05, + "loss": 1.9356, + "mean_token_accuracy": 0.5513601303100586, + "num_tokens": 4871380813.0, + "step": 9529 + }, + { + "epoch": 2.577068685776095, + "grad_norm": 1.1284977197647095, + "learning_rate": 1.0989752822384891e-05, + "loss": 1.9224, + "mean_token_accuracy": 0.559330403804779, + "num_tokens": 4871905099.0, + "step": 9530 + }, + { + "epoch": 2.577339102217415, + "grad_norm": 1.1386280059814453, + "learning_rate": 1.0988176334367528e-05, + "loss": 1.8436, + "mean_token_accuracy": 0.5705004930496216, + "num_tokens": 4872378265.0, + "step": 9531 + }, + { + "epoch": 2.5776095186587344, + "grad_norm": 1.1452977657318115, + "learning_rate": 1.0986599846712948e-05, + "loss": 1.8391, + "mean_token_accuracy": 0.5630878210067749, + "num_tokens": 4872890440.0, + "step": 9532 + }, + { + "epoch": 2.577879935100054, + "grad_norm": 1.1210888624191284, + "learning_rate": 1.0985023359469523e-05, + "loss": 1.8761, + "mean_token_accuracy": 0.5605292320251465, + "num_tokens": 4873414527.0, + "step": 9533 + }, + { + "epoch": 2.5781503515413737, + "grad_norm": 1.118927001953125, + "learning_rate": 1.0983446872685626e-05, + "loss": 1.9298, + "mean_token_accuracy": 0.5712299942970276, + "num_tokens": 4873903567.0, + "step": 9534 + }, + { + "epoch": 2.5784207679826934, + "grad_norm": 1.268784523010254, + "learning_rate": 1.0981870386409629e-05, + "loss": 1.8159, + "mean_token_accuracy": 0.554771363735199, + "num_tokens": 4874410031.0, + "step": 9535 + }, + { + "epoch": 2.578691184424013, + "grad_norm": 1.0141687393188477, + "learning_rate": 1.09802939006899e-05, + "loss": 1.8544, + "mean_token_accuracy": 0.5626212358474731, + "num_tokens": 4874868878.0, + "step": 9536 + }, + { + "epoch": 2.5789616008653327, + "grad_norm": 1.2967404127120972, + "learning_rate": 1.0978717415574813e-05, + "loss": 2.0024, + "mean_token_accuracy": 0.564527153968811, + "num_tokens": 4875329446.0, + "step": 9537 + }, + { + "epoch": 2.5792320173066523, + "grad_norm": 1.2434216737747192, + "learning_rate": 1.0977140931112737e-05, + "loss": 1.9137, + "mean_token_accuracy": 0.581096351146698, + "num_tokens": 4875853646.0, + "step": 9538 + }, + { + "epoch": 2.579502433747972, + "grad_norm": 1.2185858488082886, + "learning_rate": 1.0975564447352047e-05, + "loss": 1.8006, + "mean_token_accuracy": 0.5899292230606079, + "num_tokens": 4876377795.0, + "step": 9539 + }, + { + "epoch": 2.5797728501892916, + "grad_norm": 1.2159432172775269, + "learning_rate": 1.0973987964341113e-05, + "loss": 1.8906, + "mean_token_accuracy": 0.5729342699050903, + "num_tokens": 4876901968.0, + "step": 9540 + }, + { + "epoch": 2.5800432666306112, + "grad_norm": 0.5648332238197327, + "learning_rate": 1.0972411482128298e-05, + "loss": 1.0314, + "mean_token_accuracy": 0.7279084920883179, + "num_tokens": 4877418450.0, + "step": 9541 + }, + { + "epoch": 2.580313683071931, + "grad_norm": 1.6594840288162231, + "learning_rate": 1.0970835000761983e-05, + "loss": 1.8399, + "mean_token_accuracy": 0.5681632161140442, + "num_tokens": 4877942703.0, + "step": 9542 + }, + { + "epoch": 2.5805840995132505, + "grad_norm": 1.652255654335022, + "learning_rate": 1.096925852029054e-05, + "loss": 1.8413, + "mean_token_accuracy": 0.5829372406005859, + "num_tokens": 4878454226.0, + "step": 9543 + }, + { + "epoch": 2.58085451595457, + "grad_norm": 1.1236481666564941, + "learning_rate": 1.0967682040762331e-05, + "loss": 1.9301, + "mean_token_accuracy": 0.569774866104126, + "num_tokens": 4878921583.0, + "step": 9544 + }, + { + "epoch": 2.58112493239589, + "grad_norm": 1.1717114448547363, + "learning_rate": 1.0966105562225736e-05, + "loss": 1.7874, + "mean_token_accuracy": 0.5664774179458618, + "num_tokens": 4879445667.0, + "step": 9545 + }, + { + "epoch": 2.5813953488372094, + "grad_norm": 1.3681011199951172, + "learning_rate": 1.0964529084729122e-05, + "loss": 1.9546, + "mean_token_accuracy": 0.5590190887451172, + "num_tokens": 4879969885.0, + "step": 9546 + }, + { + "epoch": 2.581665765278529, + "grad_norm": 1.1844655275344849, + "learning_rate": 1.0962952608320853e-05, + "loss": 1.7905, + "mean_token_accuracy": 0.5699361562728882, + "num_tokens": 4880494171.0, + "step": 9547 + }, + { + "epoch": 2.5819361817198487, + "grad_norm": 1.158686637878418, + "learning_rate": 1.0961376133049315e-05, + "loss": 1.8693, + "mean_token_accuracy": 0.5794371366500854, + "num_tokens": 4881018401.0, + "step": 9548 + }, + { + "epoch": 2.582206598161168, + "grad_norm": 1.2366605997085571, + "learning_rate": 1.0959799658962866e-05, + "loss": 1.9067, + "mean_token_accuracy": 0.5901076793670654, + "num_tokens": 4881520831.0, + "step": 9549 + }, + { + "epoch": 2.582477014602488, + "grad_norm": 1.2058074474334717, + "learning_rate": 1.0958223186109882e-05, + "loss": 1.8925, + "mean_token_accuracy": 0.5648443698883057, + "num_tokens": 4881947985.0, + "step": 9550 + }, + { + "epoch": 2.582747431043807, + "grad_norm": 1.3640296459197998, + "learning_rate": 1.0956646714538736e-05, + "loss": 1.8761, + "mean_token_accuracy": 0.5827693939208984, + "num_tokens": 4882432228.0, + "step": 9551 + }, + { + "epoch": 2.5830178474851273, + "grad_norm": 1.084886908531189, + "learning_rate": 1.0955070244297798e-05, + "loss": 1.7408, + "mean_token_accuracy": 0.6043511033058167, + "num_tokens": 4882956356.0, + "step": 9552 + }, + { + "epoch": 2.5832882639264465, + "grad_norm": 1.3246852159500122, + "learning_rate": 1.0953493775435431e-05, + "loss": 1.9361, + "mean_token_accuracy": 0.5611844658851624, + "num_tokens": 4883480634.0, + "step": 9553 + }, + { + "epoch": 2.5835586803677666, + "grad_norm": 1.4456217288970947, + "learning_rate": 1.0951917308000017e-05, + "loss": 1.8467, + "mean_token_accuracy": 0.569701611995697, + "num_tokens": 4883949284.0, + "step": 9554 + }, + { + "epoch": 2.583829096809086, + "grad_norm": 0.9999287128448486, + "learning_rate": 1.0950340842039919e-05, + "loss": 1.9031, + "mean_token_accuracy": 0.5679168105125427, + "num_tokens": 4884473479.0, + "step": 9555 + }, + { + "epoch": 2.584099513250406, + "grad_norm": 1.123112440109253, + "learning_rate": 1.094876437760351e-05, + "loss": 1.8169, + "mean_token_accuracy": 0.5890814065933228, + "num_tokens": 4884961313.0, + "step": 9556 + }, + { + "epoch": 2.584369929691725, + "grad_norm": 1.233823537826538, + "learning_rate": 1.0947187914739159e-05, + "loss": 1.9506, + "mean_token_accuracy": 0.5451700687408447, + "num_tokens": 4885485541.0, + "step": 9557 + }, + { + "epoch": 2.584640346133045, + "grad_norm": 0.9599207639694214, + "learning_rate": 1.0945611453495242e-05, + "loss": 1.8685, + "mean_token_accuracy": 0.5685223937034607, + "num_tokens": 4886009771.0, + "step": 9558 + }, + { + "epoch": 2.5849107625743644, + "grad_norm": 1.1792188882827759, + "learning_rate": 1.094403499392012e-05, + "loss": 1.8215, + "mean_token_accuracy": 0.5778374671936035, + "num_tokens": 4886534024.0, + "step": 9559 + }, + { + "epoch": 2.585181179015684, + "grad_norm": 1.209477186203003, + "learning_rate": 1.0942458536062174e-05, + "loss": 1.896, + "mean_token_accuracy": 0.567890465259552, + "num_tokens": 4887058152.0, + "step": 9560 + }, + { + "epoch": 2.5854515954570036, + "grad_norm": 0.4621787667274475, + "learning_rate": 1.0940882079969768e-05, + "loss": 1.1044, + "mean_token_accuracy": 0.6999636888504028, + "num_tokens": 4887582262.0, + "step": 9561 + }, + { + "epoch": 2.5857220118983233, + "grad_norm": 0.987702488899231, + "learning_rate": 1.0939305625691274e-05, + "loss": 1.8236, + "mean_token_accuracy": 0.563382625579834, + "num_tokens": 4888106475.0, + "step": 9562 + }, + { + "epoch": 2.585992428339643, + "grad_norm": 1.1682651042938232, + "learning_rate": 1.0937729173275063e-05, + "loss": 1.8514, + "mean_token_accuracy": 0.5557785630226135, + "num_tokens": 4888579200.0, + "step": 9563 + }, + { + "epoch": 2.5862628447809626, + "grad_norm": 0.8640053868293762, + "learning_rate": 1.0936152722769498e-05, + "loss": 1.9069, + "mean_token_accuracy": 0.5535973310470581, + "num_tokens": 4889103427.0, + "step": 9564 + }, + { + "epoch": 2.586533261222282, + "grad_norm": 1.528252363204956, + "learning_rate": 1.0934576274222964e-05, + "loss": 1.9087, + "mean_token_accuracy": 0.5684620141983032, + "num_tokens": 4889590461.0, + "step": 9565 + }, + { + "epoch": 2.586803677663602, + "grad_norm": 1.1478768587112427, + "learning_rate": 1.0932999827683817e-05, + "loss": 1.885, + "mean_token_accuracy": 0.5667669177055359, + "num_tokens": 4890114734.0, + "step": 9566 + }, + { + "epoch": 2.5870740941049215, + "grad_norm": 1.1448050737380981, + "learning_rate": 1.0931423383200432e-05, + "loss": 1.8877, + "mean_token_accuracy": 0.5670241117477417, + "num_tokens": 4890629250.0, + "step": 9567 + }, + { + "epoch": 2.587344510546241, + "grad_norm": 0.9824375510215759, + "learning_rate": 1.0929846940821183e-05, + "loss": 1.9371, + "mean_token_accuracy": 0.5503714084625244, + "num_tokens": 4891153530.0, + "step": 9568 + }, + { + "epoch": 2.587614926987561, + "grad_norm": 1.0199404954910278, + "learning_rate": 1.0928270500594434e-05, + "loss": 1.7885, + "mean_token_accuracy": 0.575037956237793, + "num_tokens": 4891667107.0, + "step": 9569 + }, + { + "epoch": 2.5878853434288804, + "grad_norm": 1.2525266408920288, + "learning_rate": 1.0926694062568555e-05, + "loss": 1.8351, + "mean_token_accuracy": 0.58243727684021, + "num_tokens": 4892178965.0, + "step": 9570 + }, + { + "epoch": 2.5881557598702, + "grad_norm": 1.316604495048523, + "learning_rate": 1.0925117626791919e-05, + "loss": 1.8857, + "mean_token_accuracy": 0.560885488986969, + "num_tokens": 4892703234.0, + "step": 9571 + }, + { + "epoch": 2.5884261763115197, + "grad_norm": 1.1104964017868042, + "learning_rate": 1.0923541193312897e-05, + "loss": 1.8412, + "mean_token_accuracy": 0.5577525496482849, + "num_tokens": 4893227438.0, + "step": 9572 + }, + { + "epoch": 2.5886965927528394, + "grad_norm": 1.1662124395370483, + "learning_rate": 1.0921964762179851e-05, + "loss": 1.7826, + "mean_token_accuracy": 0.5788936614990234, + "num_tokens": 4893751623.0, + "step": 9573 + }, + { + "epoch": 2.588967009194159, + "grad_norm": 1.1958739757537842, + "learning_rate": 1.092038833344116e-05, + "loss": 1.8535, + "mean_token_accuracy": 0.5776013135910034, + "num_tokens": 4894275818.0, + "step": 9574 + }, + { + "epoch": 2.5892374256354787, + "grad_norm": 1.0467123985290527, + "learning_rate": 1.091881190714519e-05, + "loss": 1.8816, + "mean_token_accuracy": 0.5813999176025391, + "num_tokens": 4894747257.0, + "step": 9575 + }, + { + "epoch": 2.5895078420767983, + "grad_norm": 1.230614423751831, + "learning_rate": 1.0917235483340305e-05, + "loss": 1.6932, + "mean_token_accuracy": 0.6127842664718628, + "num_tokens": 4895271544.0, + "step": 9576 + }, + { + "epoch": 2.589778258518118, + "grad_norm": 1.3569555282592773, + "learning_rate": 1.091565906207488e-05, + "loss": 1.863, + "mean_token_accuracy": 0.5952423214912415, + "num_tokens": 4895747708.0, + "step": 9577 + }, + { + "epoch": 2.5900486749594376, + "grad_norm": 1.0937175750732422, + "learning_rate": 1.0914082643397286e-05, + "loss": 1.9422, + "mean_token_accuracy": 0.5712653994560242, + "num_tokens": 4896271909.0, + "step": 9578 + }, + { + "epoch": 2.5903190914007572, + "grad_norm": 1.1598446369171143, + "learning_rate": 1.0912506227355883e-05, + "loss": 1.8184, + "mean_token_accuracy": 0.5768533945083618, + "num_tokens": 4896796053.0, + "step": 9579 + }, + { + "epoch": 2.590589507842077, + "grad_norm": 1.1183136701583862, + "learning_rate": 1.0910929813999052e-05, + "loss": 1.8962, + "mean_token_accuracy": 0.5699785351753235, + "num_tokens": 4897320290.0, + "step": 9580 + }, + { + "epoch": 2.5908599242833965, + "grad_norm": 0.5115907788276672, + "learning_rate": 1.0909353403375156e-05, + "loss": 1.1467, + "mean_token_accuracy": 0.699914276599884, + "num_tokens": 4897844559.0, + "step": 9581 + }, + { + "epoch": 2.591130340724716, + "grad_norm": 1.0347095727920532, + "learning_rate": 1.090777699553256e-05, + "loss": 1.8714, + "mean_token_accuracy": 0.5617239475250244, + "num_tokens": 4898343540.0, + "step": 9582 + }, + { + "epoch": 2.591400757166036, + "grad_norm": 1.2879550457000732, + "learning_rate": 1.0906200590519642e-05, + "loss": 2.0088, + "mean_token_accuracy": 0.5529412627220154, + "num_tokens": 4898851269.0, + "step": 9583 + }, + { + "epoch": 2.5916711736073554, + "grad_norm": 0.9993076324462891, + "learning_rate": 1.090462418838476e-05, + "loss": 1.8099, + "mean_token_accuracy": 0.5923299789428711, + "num_tokens": 4899349653.0, + "step": 9584 + }, + { + "epoch": 2.591941590048675, + "grad_norm": 1.1727626323699951, + "learning_rate": 1.0903047789176293e-05, + "loss": 1.9183, + "mean_token_accuracy": 0.5738744735717773, + "num_tokens": 4899873871.0, + "step": 9585 + }, + { + "epoch": 2.5922120064899947, + "grad_norm": 0.9614190459251404, + "learning_rate": 1.0901471392942604e-05, + "loss": 1.791, + "mean_token_accuracy": 0.5854368805885315, + "num_tokens": 4900398113.0, + "step": 9586 + }, + { + "epoch": 2.5924824229313144, + "grad_norm": 1.0226566791534424, + "learning_rate": 1.089989499973206e-05, + "loss": 1.8467, + "mean_token_accuracy": 0.5730392932891846, + "num_tokens": 4900922350.0, + "step": 9587 + }, + { + "epoch": 2.592752839372634, + "grad_norm": 0.9742563962936401, + "learning_rate": 1.0898318609593035e-05, + "loss": 1.8392, + "mean_token_accuracy": 0.5753498673439026, + "num_tokens": 4901446441.0, + "step": 9588 + }, + { + "epoch": 2.5930232558139537, + "grad_norm": 1.0808035135269165, + "learning_rate": 1.0896742222573896e-05, + "loss": 1.6873, + "mean_token_accuracy": 0.6057882905006409, + "num_tokens": 4901958772.0, + "step": 9589 + }, + { + "epoch": 2.593293672255273, + "grad_norm": 1.3291352987289429, + "learning_rate": 1.0895165838723003e-05, + "loss": 1.9044, + "mean_token_accuracy": 0.544236958026886, + "num_tokens": 4902482833.0, + "step": 9590 + }, + { + "epoch": 2.593564088696593, + "grad_norm": 1.113773226737976, + "learning_rate": 1.0893589458088735e-05, + "loss": 1.8465, + "mean_token_accuracy": 0.5928946733474731, + "num_tokens": 4903006990.0, + "step": 9591 + }, + { + "epoch": 2.593834505137912, + "grad_norm": 0.9567151069641113, + "learning_rate": 1.0892013080719455e-05, + "loss": 1.8877, + "mean_token_accuracy": 0.5575775504112244, + "num_tokens": 4903530989.0, + "step": 9592 + }, + { + "epoch": 2.5941049215792322, + "grad_norm": 1.255743384361267, + "learning_rate": 1.0890436706663528e-05, + "loss": 1.8397, + "mean_token_accuracy": 0.5823203325271606, + "num_tokens": 4904055062.0, + "step": 9593 + }, + { + "epoch": 2.5943753380205514, + "grad_norm": 1.4754078388214111, + "learning_rate": 1.0888860335969327e-05, + "loss": 1.9429, + "mean_token_accuracy": 0.5552058815956116, + "num_tokens": 4904579289.0, + "step": 9594 + }, + { + "epoch": 2.5946457544618715, + "grad_norm": 1.2530990839004517, + "learning_rate": 1.088728396868522e-05, + "loss": 2.0216, + "mean_token_accuracy": 0.546062707901001, + "num_tokens": 4905103541.0, + "step": 9595 + }, + { + "epoch": 2.5949161709031907, + "grad_norm": 1.225325345993042, + "learning_rate": 1.0885707604859567e-05, + "loss": 1.779, + "mean_token_accuracy": 0.5976755619049072, + "num_tokens": 4905574123.0, + "step": 9596 + }, + { + "epoch": 2.595186587344511, + "grad_norm": 1.3276004791259766, + "learning_rate": 1.0884131244540747e-05, + "loss": 1.8928, + "mean_token_accuracy": 0.5628027319908142, + "num_tokens": 4906098245.0, + "step": 9597 + }, + { + "epoch": 2.59545700378583, + "grad_norm": 1.309059739112854, + "learning_rate": 1.088255488777712e-05, + "loss": 1.9415, + "mean_token_accuracy": 0.5615212917327881, + "num_tokens": 4906622494.0, + "step": 9598 + }, + { + "epoch": 2.59572742022715, + "grad_norm": 1.2560330629348755, + "learning_rate": 1.0880978534617048e-05, + "loss": 1.9399, + "mean_token_accuracy": 0.5664893388748169, + "num_tokens": 4907146658.0, + "step": 9599 + }, + { + "epoch": 2.5959978366684693, + "grad_norm": 1.4971699714660645, + "learning_rate": 1.0879402185108912e-05, + "loss": 1.914, + "mean_token_accuracy": 0.568903923034668, + "num_tokens": 4907670791.0, + "step": 9600 + }, + { + "epoch": 2.596268253109789, + "grad_norm": 0.541826605796814, + "learning_rate": 1.087782583930107e-05, + "loss": 1.122, + "mean_token_accuracy": 0.6996573209762573, + "num_tokens": 4908194857.0, + "step": 9601 + }, + { + "epoch": 2.5965386695511086, + "grad_norm": 1.3667677640914917, + "learning_rate": 1.0876249497241886e-05, + "loss": 1.8618, + "mean_token_accuracy": 0.566034734249115, + "num_tokens": 4908665593.0, + "step": 9602 + }, + { + "epoch": 2.596809085992428, + "grad_norm": 1.9577031135559082, + "learning_rate": 1.0874673158979738e-05, + "loss": 1.9941, + "mean_token_accuracy": 0.5835543870925903, + "num_tokens": 4909166228.0, + "step": 9603 + }, + { + "epoch": 2.597079502433748, + "grad_norm": 1.3878406286239624, + "learning_rate": 1.0873096824562981e-05, + "loss": 1.9499, + "mean_token_accuracy": 0.5386495590209961, + "num_tokens": 4909690456.0, + "step": 9604 + }, + { + "epoch": 2.5973499188750675, + "grad_norm": 1.2010339498519897, + "learning_rate": 1.0871520494039988e-05, + "loss": 1.8551, + "mean_token_accuracy": 0.5719645023345947, + "num_tokens": 4910214686.0, + "step": 9605 + }, + { + "epoch": 2.597620335316387, + "grad_norm": 1.1703139543533325, + "learning_rate": 1.0869944167459126e-05, + "loss": 1.8741, + "mean_token_accuracy": 0.5758810043334961, + "num_tokens": 4910738917.0, + "step": 9606 + }, + { + "epoch": 2.597890751757707, + "grad_norm": 1.2292284965515137, + "learning_rate": 1.0868367844868755e-05, + "loss": 1.8474, + "mean_token_accuracy": 0.560407280921936, + "num_tokens": 4911263183.0, + "step": 9607 + }, + { + "epoch": 2.5981611681990264, + "grad_norm": 1.3865201473236084, + "learning_rate": 1.0866791526317248e-05, + "loss": 1.9379, + "mean_token_accuracy": 0.57255619764328, + "num_tokens": 4911787320.0, + "step": 9608 + }, + { + "epoch": 2.598431584640346, + "grad_norm": 1.237961769104004, + "learning_rate": 1.0865215211852972e-05, + "loss": 1.7644, + "mean_token_accuracy": 0.5936859846115112, + "num_tokens": 4912311599.0, + "step": 9609 + }, + { + "epoch": 2.5987020010816657, + "grad_norm": 1.2222009897232056, + "learning_rate": 1.0863638901524285e-05, + "loss": 1.843, + "mean_token_accuracy": 0.5648009777069092, + "num_tokens": 4912835870.0, + "step": 9610 + }, + { + "epoch": 2.5989724175229854, + "grad_norm": 1.2181015014648438, + "learning_rate": 1.0862062595379562e-05, + "loss": 1.9705, + "mean_token_accuracy": 0.5505193471908569, + "num_tokens": 4913360144.0, + "step": 9611 + }, + { + "epoch": 2.599242833964305, + "grad_norm": 1.2379897832870483, + "learning_rate": 1.086048629346716e-05, + "loss": 1.9912, + "mean_token_accuracy": 0.5661352872848511, + "num_tokens": 4913830371.0, + "step": 9612 + }, + { + "epoch": 2.5995132504056246, + "grad_norm": 1.109117031097412, + "learning_rate": 1.085890999583545e-05, + "loss": 1.8967, + "mean_token_accuracy": 0.5546050071716309, + "num_tokens": 4914354623.0, + "step": 9613 + }, + { + "epoch": 2.5997836668469443, + "grad_norm": 1.980699062347412, + "learning_rate": 1.0857333702532798e-05, + "loss": 1.8041, + "mean_token_accuracy": 0.5898085832595825, + "num_tokens": 4914878852.0, + "step": 9614 + }, + { + "epoch": 2.600054083288264, + "grad_norm": 1.6275662183761597, + "learning_rate": 1.0855757413607568e-05, + "loss": 1.8459, + "mean_token_accuracy": 0.5615822076797485, + "num_tokens": 4915402990.0, + "step": 9615 + }, + { + "epoch": 2.6003244997295836, + "grad_norm": 1.5363825559616089, + "learning_rate": 1.0854181129108121e-05, + "loss": 1.8992, + "mean_token_accuracy": 0.5699359178543091, + "num_tokens": 4915927264.0, + "step": 9616 + }, + { + "epoch": 2.600594916170903, + "grad_norm": 1.2366375923156738, + "learning_rate": 1.0852604849082829e-05, + "loss": 1.8874, + "mean_token_accuracy": 0.575122594833374, + "num_tokens": 4916405273.0, + "step": 9617 + }, + { + "epoch": 2.600865332612223, + "grad_norm": 1.1636667251586914, + "learning_rate": 1.0851028573580054e-05, + "loss": 1.8642, + "mean_token_accuracy": 0.5855750441551208, + "num_tokens": 4916919572.0, + "step": 9618 + }, + { + "epoch": 2.6011357490535425, + "grad_norm": 1.3324029445648193, + "learning_rate": 1.0849452302648156e-05, + "loss": 1.9733, + "mean_token_accuracy": 0.5561646819114685, + "num_tokens": 4917443815.0, + "step": 9619 + }, + { + "epoch": 2.601406165494862, + "grad_norm": 1.389554738998413, + "learning_rate": 1.0847876036335507e-05, + "loss": 1.8351, + "mean_token_accuracy": 0.5741223096847534, + "num_tokens": 4917968006.0, + "step": 9620 + }, + { + "epoch": 2.601676581936182, + "grad_norm": 0.5519909262657166, + "learning_rate": 1.0846299774690469e-05, + "loss": 1.0868, + "mean_token_accuracy": 0.717055082321167, + "num_tokens": 4918414206.0, + "step": 9621 + }, + { + "epoch": 2.6019469983775014, + "grad_norm": 1.3972355127334595, + "learning_rate": 1.0844723517761404e-05, + "loss": 1.8215, + "mean_token_accuracy": 0.5780425071716309, + "num_tokens": 4918895769.0, + "step": 9622 + }, + { + "epoch": 2.602217414818821, + "grad_norm": 1.365859031677246, + "learning_rate": 1.084314726559668e-05, + "loss": 1.8552, + "mean_token_accuracy": 0.5691686868667603, + "num_tokens": 4919419967.0, + "step": 9623 + }, + { + "epoch": 2.6024878312601407, + "grad_norm": 1.143041968345642, + "learning_rate": 1.0841571018244661e-05, + "loss": 1.8396, + "mean_token_accuracy": 0.5928311347961426, + "num_tokens": 4919944212.0, + "step": 9624 + }, + { + "epoch": 2.6027582477014604, + "grad_norm": 0.9885902404785156, + "learning_rate": 1.0839994775753707e-05, + "loss": 1.9531, + "mean_token_accuracy": 0.5618475675582886, + "num_tokens": 4920468495.0, + "step": 9625 + }, + { + "epoch": 2.60302866414278, + "grad_norm": 1.2491282224655151, + "learning_rate": 1.0838418538172184e-05, + "loss": 1.8514, + "mean_token_accuracy": 0.5865416526794434, + "num_tokens": 4920992769.0, + "step": 9626 + }, + { + "epoch": 2.6032990805840996, + "grad_norm": 1.307690978050232, + "learning_rate": 1.0836842305548454e-05, + "loss": 1.8971, + "mean_token_accuracy": 0.5629130005836487, + "num_tokens": 4921421157.0, + "step": 9627 + }, + { + "epoch": 2.6035694970254193, + "grad_norm": 1.1206226348876953, + "learning_rate": 1.0835266077930882e-05, + "loss": 1.8604, + "mean_token_accuracy": 0.5900092720985413, + "num_tokens": 4921904438.0, + "step": 9628 + }, + { + "epoch": 2.603839913466739, + "grad_norm": 1.2105242013931274, + "learning_rate": 1.0833689855367832e-05, + "loss": 1.7877, + "mean_token_accuracy": 0.5837953090667725, + "num_tokens": 4922414927.0, + "step": 9629 + }, + { + "epoch": 2.6041103299080586, + "grad_norm": 1.241952896118164, + "learning_rate": 1.0832113637907666e-05, + "loss": 1.9532, + "mean_token_accuracy": 0.5554627180099487, + "num_tokens": 4922904080.0, + "step": 9630 + }, + { + "epoch": 2.6043807463493778, + "grad_norm": 1.3246711492538452, + "learning_rate": 1.0830537425598749e-05, + "loss": 1.795, + "mean_token_accuracy": 0.5761300921440125, + "num_tokens": 4923428232.0, + "step": 9631 + }, + { + "epoch": 2.604651162790698, + "grad_norm": 1.166123628616333, + "learning_rate": 1.0828961218489442e-05, + "loss": 1.8577, + "mean_token_accuracy": 0.5878802537918091, + "num_tokens": 4923847291.0, + "step": 9632 + }, + { + "epoch": 2.604921579232017, + "grad_norm": 1.0951380729675293, + "learning_rate": 1.0827385016628105e-05, + "loss": 1.9482, + "mean_token_accuracy": 0.564630389213562, + "num_tokens": 4924331802.0, + "step": 9633 + }, + { + "epoch": 2.605191995673337, + "grad_norm": 1.4651784896850586, + "learning_rate": 1.0825808820063104e-05, + "loss": 1.9795, + "mean_token_accuracy": 0.531934380531311, + "num_tokens": 4924855994.0, + "step": 9634 + }, + { + "epoch": 2.6054624121146563, + "grad_norm": 1.4753069877624512, + "learning_rate": 1.0824232628842803e-05, + "loss": 1.8583, + "mean_token_accuracy": 0.5779725313186646, + "num_tokens": 4925380265.0, + "step": 9635 + }, + { + "epoch": 2.6057328285559764, + "grad_norm": 1.0652104616165161, + "learning_rate": 1.082265644301556e-05, + "loss": 1.8338, + "mean_token_accuracy": 0.5932872891426086, + "num_tokens": 4925904505.0, + "step": 9636 + }, + { + "epoch": 2.6060032449972956, + "grad_norm": 1.521012544631958, + "learning_rate": 1.0821080262629739e-05, + "loss": 1.7633, + "mean_token_accuracy": 0.6012111306190491, + "num_tokens": 4926428727.0, + "step": 9637 + }, + { + "epoch": 2.6062736614386157, + "grad_norm": 1.5474978685379028, + "learning_rate": 1.0819504087733705e-05, + "loss": 1.8004, + "mean_token_accuracy": 0.5633952617645264, + "num_tokens": 4926952950.0, + "step": 9638 + }, + { + "epoch": 2.606544077879935, + "grad_norm": 1.0859167575836182, + "learning_rate": 1.0817927918375812e-05, + "loss": 1.8951, + "mean_token_accuracy": 0.5670220851898193, + "num_tokens": 4927477189.0, + "step": 9639 + }, + { + "epoch": 2.606814494321255, + "grad_norm": 1.0621668100357056, + "learning_rate": 1.0816351754604429e-05, + "loss": 1.8938, + "mean_token_accuracy": 0.5608209371566772, + "num_tokens": 4928001428.0, + "step": 9640 + }, + { + "epoch": 2.607084910762574, + "grad_norm": 0.5870822072029114, + "learning_rate": 1.0814775596467913e-05, + "loss": 1.095, + "mean_token_accuracy": 0.7132113575935364, + "num_tokens": 4928487238.0, + "step": 9641 + }, + { + "epoch": 2.607355327203894, + "grad_norm": 1.6556745767593384, + "learning_rate": 1.0813199444014624e-05, + "loss": 1.9506, + "mean_token_accuracy": 0.5531976819038391, + "num_tokens": 4929011494.0, + "step": 9642 + }, + { + "epoch": 2.6076257436452135, + "grad_norm": 1.4770169258117676, + "learning_rate": 1.0811623297292928e-05, + "loss": 1.91, + "mean_token_accuracy": 0.561980128288269, + "num_tokens": 4929535771.0, + "step": 9643 + }, + { + "epoch": 2.607896160086533, + "grad_norm": 1.3129795789718628, + "learning_rate": 1.0810047156351184e-05, + "loss": 1.8662, + "mean_token_accuracy": 0.5682932138442993, + "num_tokens": 4930060027.0, + "step": 9644 + }, + { + "epoch": 2.6081665765278528, + "grad_norm": 1.2916938066482544, + "learning_rate": 1.0808471021237747e-05, + "loss": 1.8834, + "mean_token_accuracy": 0.5608269572257996, + "num_tokens": 4930584169.0, + "step": 9645 + }, + { + "epoch": 2.6084369929691724, + "grad_norm": 1.4931617975234985, + "learning_rate": 1.080689489200099e-05, + "loss": 1.93, + "mean_token_accuracy": 0.5732585787773132, + "num_tokens": 4931108360.0, + "step": 9646 + }, + { + "epoch": 2.608707409410492, + "grad_norm": 1.3448116779327393, + "learning_rate": 1.0805318768689261e-05, + "loss": 1.9407, + "mean_token_accuracy": 0.5649979710578918, + "num_tokens": 4931632585.0, + "step": 9647 + }, + { + "epoch": 2.6089778258518117, + "grad_norm": 1.1708025932312012, + "learning_rate": 1.0803742651350924e-05, + "loss": 1.8124, + "mean_token_accuracy": 0.5781136155128479, + "num_tokens": 4932156798.0, + "step": 9648 + }, + { + "epoch": 2.6092482422931313, + "grad_norm": 1.2899404764175415, + "learning_rate": 1.080216654003434e-05, + "loss": 1.6545, + "mean_token_accuracy": 0.607778787612915, + "num_tokens": 4932680977.0, + "step": 9649 + }, + { + "epoch": 2.609518658734451, + "grad_norm": 1.4611561298370361, + "learning_rate": 1.0800590434787872e-05, + "loss": 1.9305, + "mean_token_accuracy": 0.5598183870315552, + "num_tokens": 4933193081.0, + "step": 9650 + }, + { + "epoch": 2.6097890751757706, + "grad_norm": 1.329264760017395, + "learning_rate": 1.079901433565987e-05, + "loss": 1.9008, + "mean_token_accuracy": 0.551764726638794, + "num_tokens": 4933717313.0, + "step": 9651 + }, + { + "epoch": 2.6100594916170903, + "grad_norm": 1.0313712358474731, + "learning_rate": 1.0797438242698704e-05, + "loss": 1.9749, + "mean_token_accuracy": 0.5493085384368896, + "num_tokens": 4934241565.0, + "step": 9652 + }, + { + "epoch": 2.61032990805841, + "grad_norm": 1.205481767654419, + "learning_rate": 1.0795862155952723e-05, + "loss": 1.9079, + "mean_token_accuracy": 0.5567572116851807, + "num_tokens": 4934765826.0, + "step": 9653 + }, + { + "epoch": 2.6106003244997296, + "grad_norm": 0.999091386795044, + "learning_rate": 1.0794286075470298e-05, + "loss": 1.9092, + "mean_token_accuracy": 0.5745925903320312, + "num_tokens": 4935290110.0, + "step": 9654 + }, + { + "epoch": 2.610870740941049, + "grad_norm": 1.1941869258880615, + "learning_rate": 1.0792710001299777e-05, + "loss": 1.8598, + "mean_token_accuracy": 0.5732463598251343, + "num_tokens": 4935814349.0, + "step": 9655 + }, + { + "epoch": 2.611141157382369, + "grad_norm": 1.378442645072937, + "learning_rate": 1.0791133933489523e-05, + "loss": 1.9472, + "mean_token_accuracy": 0.5708856582641602, + "num_tokens": 4936296223.0, + "step": 9656 + }, + { + "epoch": 2.6114115738236885, + "grad_norm": 1.0326875448226929, + "learning_rate": 1.0789557872087897e-05, + "loss": 1.811, + "mean_token_accuracy": 0.5749929547309875, + "num_tokens": 4936820320.0, + "step": 9657 + }, + { + "epoch": 2.611681990265008, + "grad_norm": 1.172661542892456, + "learning_rate": 1.0787981817143254e-05, + "loss": 1.7693, + "mean_token_accuracy": 0.5862813591957092, + "num_tokens": 4937293237.0, + "step": 9658 + }, + { + "epoch": 2.611952406706328, + "grad_norm": 1.2413506507873535, + "learning_rate": 1.0786405768703949e-05, + "loss": 1.9102, + "mean_token_accuracy": 0.5760290622711182, + "num_tokens": 4937735678.0, + "step": 9659 + }, + { + "epoch": 2.6122228231476474, + "grad_norm": 1.2000547647476196, + "learning_rate": 1.0784829726818347e-05, + "loss": 1.7302, + "mean_token_accuracy": 0.5807050466537476, + "num_tokens": 4938259670.0, + "step": 9660 + }, + { + "epoch": 2.612493239588967, + "grad_norm": 0.47939494252204895, + "learning_rate": 1.0783253691534798e-05, + "loss": 1.0911, + "mean_token_accuracy": 0.7103438973426819, + "num_tokens": 4938783839.0, + "step": 9661 + }, + { + "epoch": 2.6127636560302867, + "grad_norm": 1.6807447671890259, + "learning_rate": 1.0781677662901663e-05, + "loss": 1.7544, + "mean_token_accuracy": 0.5627821683883667, + "num_tokens": 4939308075.0, + "step": 9662 + }, + { + "epoch": 2.6130340724716064, + "grad_norm": 1.6480258703231812, + "learning_rate": 1.0780101640967302e-05, + "loss": 1.7922, + "mean_token_accuracy": 0.5795687437057495, + "num_tokens": 4939832239.0, + "step": 9663 + }, + { + "epoch": 2.613304488912926, + "grad_norm": 1.2823081016540527, + "learning_rate": 1.0778525625780071e-05, + "loss": 1.8058, + "mean_token_accuracy": 0.583612859249115, + "num_tokens": 4940356475.0, + "step": 9664 + }, + { + "epoch": 2.6135749053542456, + "grad_norm": 1.032925009727478, + "learning_rate": 1.0776949617388321e-05, + "loss": 1.8727, + "mean_token_accuracy": 0.5731557607650757, + "num_tokens": 4940880746.0, + "step": 9665 + }, + { + "epoch": 2.6138453217955653, + "grad_norm": 1.2661570310592651, + "learning_rate": 1.0775373615840417e-05, + "loss": 1.9182, + "mean_token_accuracy": 0.5551838278770447, + "num_tokens": 4941404934.0, + "step": 9666 + }, + { + "epoch": 2.614115738236885, + "grad_norm": 1.3358350992202759, + "learning_rate": 1.0773797621184707e-05, + "loss": 1.8797, + "mean_token_accuracy": 0.5679608583450317, + "num_tokens": 4941929028.0, + "step": 9667 + }, + { + "epoch": 2.6143861546782046, + "grad_norm": 1.3787583112716675, + "learning_rate": 1.0772221633469554e-05, + "loss": 1.9382, + "mean_token_accuracy": 0.5639711022377014, + "num_tokens": 4942397054.0, + "step": 9668 + }, + { + "epoch": 2.614656571119524, + "grad_norm": 1.12982976436615, + "learning_rate": 1.0770645652743308e-05, + "loss": 1.7812, + "mean_token_accuracy": 0.5845571756362915, + "num_tokens": 4942921276.0, + "step": 9669 + }, + { + "epoch": 2.614926987560844, + "grad_norm": 1.3075469732284546, + "learning_rate": 1.0769069679054332e-05, + "loss": 1.8561, + "mean_token_accuracy": 0.5734326243400574, + "num_tokens": 4943445552.0, + "step": 9670 + }, + { + "epoch": 2.6151974040021635, + "grad_norm": 1.1405160427093506, + "learning_rate": 1.0767493712450971e-05, + "loss": 1.9391, + "mean_token_accuracy": 0.5589590072631836, + "num_tokens": 4943969691.0, + "step": 9671 + }, + { + "epoch": 2.6154678204434827, + "grad_norm": 1.187364101409912, + "learning_rate": 1.0765917752981593e-05, + "loss": 1.8569, + "mean_token_accuracy": 0.5799710750579834, + "num_tokens": 4944493747.0, + "step": 9672 + }, + { + "epoch": 2.615738236884803, + "grad_norm": 1.2292267084121704, + "learning_rate": 1.0764341800694546e-05, + "loss": 1.8203, + "mean_token_accuracy": 0.5670936107635498, + "num_tokens": 4944981897.0, + "step": 9673 + }, + { + "epoch": 2.616008653326122, + "grad_norm": 1.2819137573242188, + "learning_rate": 1.0762765855638183e-05, + "loss": 1.9308, + "mean_token_accuracy": 0.5672821402549744, + "num_tokens": 4945449646.0, + "step": 9674 + }, + { + "epoch": 2.616279069767442, + "grad_norm": 1.1800824403762817, + "learning_rate": 1.0761189917860861e-05, + "loss": 1.8821, + "mean_token_accuracy": 0.5779873132705688, + "num_tokens": 4945968398.0, + "step": 9675 + }, + { + "epoch": 2.6165494862087613, + "grad_norm": 1.1681047677993774, + "learning_rate": 1.0759613987410935e-05, + "loss": 1.9532, + "mean_token_accuracy": 0.5530452132225037, + "num_tokens": 4946492497.0, + "step": 9676 + }, + { + "epoch": 2.6168199026500814, + "grad_norm": 1.379551649093628, + "learning_rate": 1.075803806433676e-05, + "loss": 1.9161, + "mean_token_accuracy": 0.5536524057388306, + "num_tokens": 4947016621.0, + "step": 9677 + }, + { + "epoch": 2.6170903190914006, + "grad_norm": 1.3095242977142334, + "learning_rate": 1.0756462148686689e-05, + "loss": 1.9592, + "mean_token_accuracy": 0.541633129119873, + "num_tokens": 4947540799.0, + "step": 9678 + }, + { + "epoch": 2.6173607355327206, + "grad_norm": 1.21384859085083, + "learning_rate": 1.0754886240509072e-05, + "loss": 1.8145, + "mean_token_accuracy": 0.5793676972389221, + "num_tokens": 4948064834.0, + "step": 9679 + }, + { + "epoch": 2.61763115197404, + "grad_norm": 1.3388177156448364, + "learning_rate": 1.0753310339852271e-05, + "loss": 1.8374, + "mean_token_accuracy": 0.5761863589286804, + "num_tokens": 4948560909.0, + "step": 9680 + }, + { + "epoch": 2.61790156841536, + "grad_norm": 0.5666399002075195, + "learning_rate": 1.0751734446764635e-05, + "loss": 1.1444, + "mean_token_accuracy": 0.6905914545059204, + "num_tokens": 4949085168.0, + "step": 9681 + }, + { + "epoch": 2.618171984856679, + "grad_norm": 1.8013553619384766, + "learning_rate": 1.075015856129451e-05, + "loss": 1.7701, + "mean_token_accuracy": 0.5948465466499329, + "num_tokens": 4949609231.0, + "step": 9682 + }, + { + "epoch": 2.6184424012979988, + "grad_norm": 1.538300633430481, + "learning_rate": 1.074858268349026e-05, + "loss": 1.8711, + "mean_token_accuracy": 0.5734680891036987, + "num_tokens": 4950133357.0, + "step": 9683 + }, + { + "epoch": 2.6187128177393184, + "grad_norm": 1.129728078842163, + "learning_rate": 1.0747006813400232e-05, + "loss": 1.8964, + "mean_token_accuracy": 0.574250340461731, + "num_tokens": 4950657630.0, + "step": 9684 + }, + { + "epoch": 2.618983234180638, + "grad_norm": 1.1181895732879639, + "learning_rate": 1.0745430951072775e-05, + "loss": 1.8145, + "mean_token_accuracy": 0.5823585987091064, + "num_tokens": 4951181844.0, + "step": 9685 + }, + { + "epoch": 2.6192536506219577, + "grad_norm": 1.5396705865859985, + "learning_rate": 1.0743855096556248e-05, + "loss": 2.0134, + "mean_token_accuracy": 0.5420283675193787, + "num_tokens": 4951705890.0, + "step": 9686 + }, + { + "epoch": 2.6195240670632773, + "grad_norm": 1.4115712642669678, + "learning_rate": 1.0742279249899004e-05, + "loss": 1.8533, + "mean_token_accuracy": 0.5677694082260132, + "num_tokens": 4952230161.0, + "step": 9687 + }, + { + "epoch": 2.619794483504597, + "grad_norm": 1.0888879299163818, + "learning_rate": 1.0740703411149383e-05, + "loss": 1.8444, + "mean_token_accuracy": 0.5877587199211121, + "num_tokens": 4952754339.0, + "step": 9688 + }, + { + "epoch": 2.6200648999459166, + "grad_norm": 1.3670692443847656, + "learning_rate": 1.0739127580355748e-05, + "loss": 2.0464, + "mean_token_accuracy": 0.5301063060760498, + "num_tokens": 4953278593.0, + "step": 9689 + }, + { + "epoch": 2.6203353163872363, + "grad_norm": 1.5268994569778442, + "learning_rate": 1.0737551757566447e-05, + "loss": 1.9807, + "mean_token_accuracy": 0.5596953630447388, + "num_tokens": 4953782378.0, + "step": 9690 + }, + { + "epoch": 2.620605732828556, + "grad_norm": 1.0580573081970215, + "learning_rate": 1.0735975942829828e-05, + "loss": 1.8386, + "mean_token_accuracy": 0.5664979219436646, + "num_tokens": 4954306629.0, + "step": 9691 + }, + { + "epoch": 2.6208761492698756, + "grad_norm": 1.329372525215149, + "learning_rate": 1.0734400136194243e-05, + "loss": 1.7947, + "mean_token_accuracy": 0.5864777565002441, + "num_tokens": 4954830819.0, + "step": 9692 + }, + { + "epoch": 2.621146565711195, + "grad_norm": 32.49242401123047, + "learning_rate": 1.0732824337708045e-05, + "loss": 1.7704, + "mean_token_accuracy": 0.6220024824142456, + "num_tokens": 4955316160.0, + "step": 9693 + }, + { + "epoch": 2.621416982152515, + "grad_norm": 1.7906849384307861, + "learning_rate": 1.0731248547419577e-05, + "loss": 1.9329, + "mean_token_accuracy": 0.560332715511322, + "num_tokens": 4955840365.0, + "step": 9694 + }, + { + "epoch": 2.6216873985938345, + "grad_norm": 1.3655701875686646, + "learning_rate": 1.0729672765377201e-05, + "loss": 1.8795, + "mean_token_accuracy": 0.571945071220398, + "num_tokens": 4956364529.0, + "step": 9695 + }, + { + "epoch": 2.621957815035154, + "grad_norm": 1.2740098237991333, + "learning_rate": 1.0728096991629256e-05, + "loss": 1.9225, + "mean_token_accuracy": 0.5691695809364319, + "num_tokens": 4956888728.0, + "step": 9696 + }, + { + "epoch": 2.6222282314764738, + "grad_norm": 1.1387802362442017, + "learning_rate": 1.0726521226224093e-05, + "loss": 1.8869, + "mean_token_accuracy": 0.5591461062431335, + "num_tokens": 4957412944.0, + "step": 9697 + }, + { + "epoch": 2.6224986479177934, + "grad_norm": 1.300262212753296, + "learning_rate": 1.0724945469210067e-05, + "loss": 1.9115, + "mean_token_accuracy": 0.5636448860168457, + "num_tokens": 4957937127.0, + "step": 9698 + }, + { + "epoch": 2.622769064359113, + "grad_norm": 0.9606694579124451, + "learning_rate": 1.0723369720635518e-05, + "loss": 1.7107, + "mean_token_accuracy": 0.6208873391151428, + "num_tokens": 4958461319.0, + "step": 9699 + }, + { + "epoch": 2.6230394808004327, + "grad_norm": 1.2824842929840088, + "learning_rate": 1.0721793980548805e-05, + "loss": 1.9138, + "mean_token_accuracy": 0.5365513563156128, + "num_tokens": 4958985519.0, + "step": 9700 + }, + { + "epoch": 2.6233098972417523, + "grad_norm": 0.39340195059776306, + "learning_rate": 1.072021824899827e-05, + "loss": 1.0682, + "mean_token_accuracy": 0.7170159220695496, + "num_tokens": 4959470810.0, + "step": 9701 + }, + { + "epoch": 2.623580313683072, + "grad_norm": 1.7791422605514526, + "learning_rate": 1.0718642526032257e-05, + "loss": 1.9339, + "mean_token_accuracy": 0.560030996799469, + "num_tokens": 4959995071.0, + "step": 9702 + }, + { + "epoch": 2.6238507301243916, + "grad_norm": 1.4831748008728027, + "learning_rate": 1.0717066811699123e-05, + "loss": 1.8773, + "mean_token_accuracy": 0.5729578733444214, + "num_tokens": 4960519346.0, + "step": 9703 + }, + { + "epoch": 2.6241211465657113, + "grad_norm": 1.093116044998169, + "learning_rate": 1.0715491106047212e-05, + "loss": 1.8564, + "mean_token_accuracy": 0.5836162567138672, + "num_tokens": 4961043621.0, + "step": 9704 + }, + { + "epoch": 2.624391563007031, + "grad_norm": 1.2100164890289307, + "learning_rate": 1.0713915409124866e-05, + "loss": 1.8703, + "mean_token_accuracy": 0.5846588015556335, + "num_tokens": 4961567817.0, + "step": 9705 + }, + { + "epoch": 2.6246619794483506, + "grad_norm": 1.254504680633545, + "learning_rate": 1.071233972098044e-05, + "loss": 1.9837, + "mean_token_accuracy": 0.5751475691795349, + "num_tokens": 4961956081.0, + "step": 9706 + }, + { + "epoch": 2.62493239588967, + "grad_norm": 1.206605315208435, + "learning_rate": 1.0710764041662278e-05, + "loss": 1.8327, + "mean_token_accuracy": 0.5823901891708374, + "num_tokens": 4962415575.0, + "step": 9707 + }, + { + "epoch": 2.62520281233099, + "grad_norm": 1.2030938863754272, + "learning_rate": 1.0709188371218721e-05, + "loss": 1.8137, + "mean_token_accuracy": 0.5825334191322327, + "num_tokens": 4962939807.0, + "step": 9708 + }, + { + "epoch": 2.6254732287723095, + "grad_norm": 1.0942797660827637, + "learning_rate": 1.0707612709698125e-05, + "loss": 2.029, + "mean_token_accuracy": 0.5401911735534668, + "num_tokens": 4963464019.0, + "step": 9709 + }, + { + "epoch": 2.625743645213629, + "grad_norm": 1.1170662641525269, + "learning_rate": 1.0706037057148832e-05, + "loss": 1.8473, + "mean_token_accuracy": 0.568215548992157, + "num_tokens": 4963988088.0, + "step": 9710 + }, + { + "epoch": 2.6260140616549488, + "grad_norm": 1.2350114583969116, + "learning_rate": 1.0704461413619182e-05, + "loss": 1.7991, + "mean_token_accuracy": 0.589227020740509, + "num_tokens": 4964512353.0, + "step": 9711 + }, + { + "epoch": 2.6262844780962684, + "grad_norm": 1.0847715139389038, + "learning_rate": 1.0702885779157525e-05, + "loss": 1.8916, + "mean_token_accuracy": 0.551129937171936, + "num_tokens": 4965036617.0, + "step": 9712 + }, + { + "epoch": 2.6265548945375876, + "grad_norm": 1.2920417785644531, + "learning_rate": 1.0701310153812213e-05, + "loss": 1.7753, + "mean_token_accuracy": 0.6042830944061279, + "num_tokens": 4965560659.0, + "step": 9713 + }, + { + "epoch": 2.6268253109789077, + "grad_norm": 1.1145943403244019, + "learning_rate": 1.0699734537631574e-05, + "loss": 1.8729, + "mean_token_accuracy": 0.5616717338562012, + "num_tokens": 4966084807.0, + "step": 9714 + }, + { + "epoch": 2.627095727420227, + "grad_norm": 1.0541702508926392, + "learning_rate": 1.0698158930663968e-05, + "loss": 1.7875, + "mean_token_accuracy": 0.5819827318191528, + "num_tokens": 4966609086.0, + "step": 9715 + }, + { + "epoch": 2.627366143861547, + "grad_norm": 0.9675642848014832, + "learning_rate": 1.0696583332957737e-05, + "loss": 1.9028, + "mean_token_accuracy": 0.5564196705818176, + "num_tokens": 4967133318.0, + "step": 9716 + }, + { + "epoch": 2.627636560302866, + "grad_norm": 1.0572954416275024, + "learning_rate": 1.0695007744561215e-05, + "loss": 1.7243, + "mean_token_accuracy": 0.6151288747787476, + "num_tokens": 4967618491.0, + "step": 9717 + }, + { + "epoch": 2.6279069767441863, + "grad_norm": 1.1570932865142822, + "learning_rate": 1.0693432165522756e-05, + "loss": 1.8784, + "mean_token_accuracy": 0.560393214225769, + "num_tokens": 4968103742.0, + "step": 9718 + }, + { + "epoch": 2.6281773931855055, + "grad_norm": 1.1759533882141113, + "learning_rate": 1.0691856595890699e-05, + "loss": 1.7531, + "mean_token_accuracy": 0.576887845993042, + "num_tokens": 4968627968.0, + "step": 9719 + }, + { + "epoch": 2.6284478096268256, + "grad_norm": 1.1620711088180542, + "learning_rate": 1.0690281035713384e-05, + "loss": 1.8737, + "mean_token_accuracy": 0.5699188709259033, + "num_tokens": 4969152199.0, + "step": 9720 + }, + { + "epoch": 2.6287182260681448, + "grad_norm": 0.49583908915519714, + "learning_rate": 1.0688705485039164e-05, + "loss": 1.1277, + "mean_token_accuracy": 0.7022233009338379, + "num_tokens": 4969624630.0, + "step": 9721 + }, + { + "epoch": 2.628988642509465, + "grad_norm": 1.467150092124939, + "learning_rate": 1.068712994391637e-05, + "loss": 1.8922, + "mean_token_accuracy": 0.5718891620635986, + "num_tokens": 4970148817.0, + "step": 9722 + }, + { + "epoch": 2.629259058950784, + "grad_norm": 1.3961750268936157, + "learning_rate": 1.0685554412393353e-05, + "loss": 1.8392, + "mean_token_accuracy": 0.5810353755950928, + "num_tokens": 4970673096.0, + "step": 9723 + }, + { + "epoch": 2.6295294753921037, + "grad_norm": 1.2906956672668457, + "learning_rate": 1.0683978890518451e-05, + "loss": 1.9124, + "mean_token_accuracy": 0.5720822811126709, + "num_tokens": 4971197344.0, + "step": 9724 + }, + { + "epoch": 2.6297998918334233, + "grad_norm": 1.3395637273788452, + "learning_rate": 1.0682403378340003e-05, + "loss": 1.9183, + "mean_token_accuracy": 0.5570510625839233, + "num_tokens": 4971694997.0, + "step": 9725 + }, + { + "epoch": 2.630070308274743, + "grad_norm": 1.2967188358306885, + "learning_rate": 1.0680827875906357e-05, + "loss": 1.8109, + "mean_token_accuracy": 0.5920774936676025, + "num_tokens": 4972219104.0, + "step": 9726 + }, + { + "epoch": 2.6303407247160626, + "grad_norm": 1.2348453998565674, + "learning_rate": 1.0679252383265847e-05, + "loss": 1.8575, + "mean_token_accuracy": 0.5783510208129883, + "num_tokens": 4972743220.0, + "step": 9727 + }, + { + "epoch": 2.6306111411573823, + "grad_norm": 1.187648057937622, + "learning_rate": 1.0677676900466818e-05, + "loss": 1.8074, + "mean_token_accuracy": 0.5851584672927856, + "num_tokens": 4973267396.0, + "step": 9728 + }, + { + "epoch": 2.630881557598702, + "grad_norm": 1.0757919549942017, + "learning_rate": 1.0676101427557612e-05, + "loss": 1.8545, + "mean_token_accuracy": 0.571627140045166, + "num_tokens": 4973770077.0, + "step": 9729 + }, + { + "epoch": 2.6311519740400215, + "grad_norm": 1.0992225408554077, + "learning_rate": 1.0674525964586566e-05, + "loss": 1.8631, + "mean_token_accuracy": 0.5674605965614319, + "num_tokens": 4974294362.0, + "step": 9730 + }, + { + "epoch": 2.631422390481341, + "grad_norm": 1.0435439348220825, + "learning_rate": 1.0672950511602018e-05, + "loss": 1.8123, + "mean_token_accuracy": 0.5752725601196289, + "num_tokens": 4974818551.0, + "step": 9731 + }, + { + "epoch": 2.631692806922661, + "grad_norm": 1.1052080392837524, + "learning_rate": 1.0671375068652314e-05, + "loss": 1.9396, + "mean_token_accuracy": 0.5579646825790405, + "num_tokens": 4975342718.0, + "step": 9732 + }, + { + "epoch": 2.6319632233639805, + "grad_norm": 1.2000001668930054, + "learning_rate": 1.0669799635785784e-05, + "loss": 1.8717, + "mean_token_accuracy": 0.6244174838066101, + "num_tokens": 4975742910.0, + "step": 9733 + }, + { + "epoch": 2.6322336398053, + "grad_norm": 1.0339365005493164, + "learning_rate": 1.0668224213050775e-05, + "loss": 1.8393, + "mean_token_accuracy": 0.592442512512207, + "num_tokens": 4976181573.0, + "step": 9734 + }, + { + "epoch": 2.6325040562466198, + "grad_norm": 1.0611268281936646, + "learning_rate": 1.0666648800495624e-05, + "loss": 1.8768, + "mean_token_accuracy": 0.5641577243804932, + "num_tokens": 4976705828.0, + "step": 9735 + }, + { + "epoch": 2.6327744726879394, + "grad_norm": 0.9018926620483398, + "learning_rate": 1.066507339816867e-05, + "loss": 1.7393, + "mean_token_accuracy": 0.6208277940750122, + "num_tokens": 4977173802.0, + "step": 9736 + }, + { + "epoch": 2.633044889129259, + "grad_norm": 1.0236761569976807, + "learning_rate": 1.0663498006118244e-05, + "loss": 1.8654, + "mean_token_accuracy": 0.5721024870872498, + "num_tokens": 4977641137.0, + "step": 9737 + }, + { + "epoch": 2.6333153055705787, + "grad_norm": 1.1864161491394043, + "learning_rate": 1.0661922624392694e-05, + "loss": 1.8997, + "mean_token_accuracy": 0.5715640783309937, + "num_tokens": 4978157252.0, + "step": 9738 + }, + { + "epoch": 2.6335857220118983, + "grad_norm": 1.1923547983169556, + "learning_rate": 1.0660347253040348e-05, + "loss": 1.9005, + "mean_token_accuracy": 0.5726542472839355, + "num_tokens": 4978681517.0, + "step": 9739 + }, + { + "epoch": 2.633856138453218, + "grad_norm": 1.2037360668182373, + "learning_rate": 1.0658771892109548e-05, + "loss": 1.9106, + "mean_token_accuracy": 0.5681293606758118, + "num_tokens": 4979182096.0, + "step": 9740 + }, + { + "epoch": 2.6341265548945376, + "grad_norm": 0.4861851632595062, + "learning_rate": 1.065719654164863e-05, + "loss": 1.1756, + "mean_token_accuracy": 0.698574960231781, + "num_tokens": 4979658193.0, + "step": 9741 + }, + { + "epoch": 2.6343969713358573, + "grad_norm": 1.651674509048462, + "learning_rate": 1.0655621201705932e-05, + "loss": 1.9035, + "mean_token_accuracy": 0.5781867504119873, + "num_tokens": 4980118842.0, + "step": 9742 + }, + { + "epoch": 2.634667387777177, + "grad_norm": 1.3820784091949463, + "learning_rate": 1.0654045872329784e-05, + "loss": 1.8876, + "mean_token_accuracy": 0.5795271992683411, + "num_tokens": 4980582907.0, + "step": 9743 + }, + { + "epoch": 2.6349378042184965, + "grad_norm": 1.5094183683395386, + "learning_rate": 1.065247055356853e-05, + "loss": 1.9131, + "mean_token_accuracy": 0.5745306015014648, + "num_tokens": 4981107099.0, + "step": 9744 + }, + { + "epoch": 2.635208220659816, + "grad_norm": 1.2053569555282593, + "learning_rate": 1.0650895245470498e-05, + "loss": 1.9855, + "mean_token_accuracy": 0.5592920780181885, + "num_tokens": 4981631247.0, + "step": 9745 + }, + { + "epoch": 2.635478637101136, + "grad_norm": 1.2592827081680298, + "learning_rate": 1.0649319948084032e-05, + "loss": 1.7987, + "mean_token_accuracy": 0.5711952447891235, + "num_tokens": 4982155336.0, + "step": 9746 + }, + { + "epoch": 2.6357490535424555, + "grad_norm": 1.1125322580337524, + "learning_rate": 1.0647744661457458e-05, + "loss": 1.9401, + "mean_token_accuracy": 0.5579850673675537, + "num_tokens": 4982679608.0, + "step": 9747 + }, + { + "epoch": 2.636019469983775, + "grad_norm": 1.243476390838623, + "learning_rate": 1.0646169385639112e-05, + "loss": 1.9986, + "mean_token_accuracy": 0.5454551577568054, + "num_tokens": 4983203881.0, + "step": 9748 + }, + { + "epoch": 2.6362898864250948, + "grad_norm": 1.0802258253097534, + "learning_rate": 1.0644594120677331e-05, + "loss": 1.8862, + "mean_token_accuracy": 0.5912495851516724, + "num_tokens": 4983663377.0, + "step": 9749 + }, + { + "epoch": 2.6365603028664144, + "grad_norm": 1.1248878240585327, + "learning_rate": 1.064301886662045e-05, + "loss": 1.8059, + "mean_token_accuracy": 0.5738691091537476, + "num_tokens": 4984187564.0, + "step": 9750 + }, + { + "epoch": 2.636830719307734, + "grad_norm": 1.1900311708450317, + "learning_rate": 1.0641443623516796e-05, + "loss": 1.8116, + "mean_token_accuracy": 0.5852273106575012, + "num_tokens": 4984710706.0, + "step": 9751 + }, + { + "epoch": 2.6371011357490537, + "grad_norm": 1.163901448249817, + "learning_rate": 1.063986839141471e-05, + "loss": 1.9272, + "mean_token_accuracy": 0.5547066926956177, + "num_tokens": 4985234960.0, + "step": 9752 + }, + { + "epoch": 2.6373715521903733, + "grad_norm": 1.5627378225326538, + "learning_rate": 1.0638293170362519e-05, + "loss": 1.9433, + "mean_token_accuracy": 0.5665379166603088, + "num_tokens": 4985759239.0, + "step": 9753 + }, + { + "epoch": 2.6376419686316925, + "grad_norm": 1.1162360906600952, + "learning_rate": 1.0636717960408555e-05, + "loss": 1.8762, + "mean_token_accuracy": 0.5712993144989014, + "num_tokens": 4986283491.0, + "step": 9754 + }, + { + "epoch": 2.6379123850730126, + "grad_norm": 1.0907373428344727, + "learning_rate": 1.0635142761601154e-05, + "loss": 1.8995, + "mean_token_accuracy": 0.5733925700187683, + "num_tokens": 4986807677.0, + "step": 9755 + }, + { + "epoch": 2.638182801514332, + "grad_norm": 1.1461337804794312, + "learning_rate": 1.0633567573988649e-05, + "loss": 1.8876, + "mean_token_accuracy": 0.5813297033309937, + "num_tokens": 4987312408.0, + "step": 9756 + }, + { + "epoch": 2.638453217955652, + "grad_norm": 1.0536407232284546, + "learning_rate": 1.0631992397619363e-05, + "loss": 1.7949, + "mean_token_accuracy": 0.5965146422386169, + "num_tokens": 4987773493.0, + "step": 9757 + }, + { + "epoch": 2.638723634396971, + "grad_norm": 1.1946403980255127, + "learning_rate": 1.063041723254164e-05, + "loss": 1.89, + "mean_token_accuracy": 0.5545586347579956, + "num_tokens": 4988297710.0, + "step": 9758 + }, + { + "epoch": 2.638994050838291, + "grad_norm": 1.1351555585861206, + "learning_rate": 1.0628842078803799e-05, + "loss": 1.9149, + "mean_token_accuracy": 0.5700151920318604, + "num_tokens": 4988821771.0, + "step": 9759 + }, + { + "epoch": 2.6392644672796104, + "grad_norm": 1.1403576135635376, + "learning_rate": 1.062726693645417e-05, + "loss": 1.9034, + "mean_token_accuracy": 0.576167106628418, + "num_tokens": 4989346031.0, + "step": 9760 + }, + { + "epoch": 2.6395348837209305, + "grad_norm": 0.49931764602661133, + "learning_rate": 1.0625691805541094e-05, + "loss": 1.1289, + "mean_token_accuracy": 0.6892738342285156, + "num_tokens": 4989870243.0, + "step": 9761 + }, + { + "epoch": 2.6398053001622497, + "grad_norm": 1.3107547760009766, + "learning_rate": 1.0624116686112894e-05, + "loss": 1.8855, + "mean_token_accuracy": 0.5737011432647705, + "num_tokens": 4990375087.0, + "step": 9762 + }, + { + "epoch": 2.6400757166035698, + "grad_norm": 1.2175251245498657, + "learning_rate": 1.0622541578217893e-05, + "loss": 1.8799, + "mean_token_accuracy": 0.5573422908782959, + "num_tokens": 4990884528.0, + "step": 9763 + }, + { + "epoch": 2.640346133044889, + "grad_norm": 1.0782731771469116, + "learning_rate": 1.0620966481904434e-05, + "loss": 1.8616, + "mean_token_accuracy": 0.5609913468360901, + "num_tokens": 4991408786.0, + "step": 9764 + }, + { + "epoch": 2.6406165494862086, + "grad_norm": 1.1632260084152222, + "learning_rate": 1.0619391397220835e-05, + "loss": 1.8456, + "mean_token_accuracy": 0.5806624293327332, + "num_tokens": 4991911778.0, + "step": 9765 + }, + { + "epoch": 2.6408869659275283, + "grad_norm": 1.0165361166000366, + "learning_rate": 1.0617816324215424e-05, + "loss": 1.7852, + "mean_token_accuracy": 0.5887744426727295, + "num_tokens": 4992435892.0, + "step": 9766 + }, + { + "epoch": 2.641157382368848, + "grad_norm": 1.4010292291641235, + "learning_rate": 1.0616241262936537e-05, + "loss": 1.9607, + "mean_token_accuracy": 0.540640115737915, + "num_tokens": 4992960023.0, + "step": 9767 + }, + { + "epoch": 2.6414277988101675, + "grad_norm": 1.1609607934951782, + "learning_rate": 1.0614666213432495e-05, + "loss": 1.8299, + "mean_token_accuracy": 0.5566312670707703, + "num_tokens": 4993484096.0, + "step": 9768 + }, + { + "epoch": 2.641698215251487, + "grad_norm": 1.094282627105713, + "learning_rate": 1.0613091175751624e-05, + "loss": 1.8955, + "mean_token_accuracy": 0.5631650686264038, + "num_tokens": 4994008296.0, + "step": 9769 + }, + { + "epoch": 2.641968631692807, + "grad_norm": 1.1063851118087769, + "learning_rate": 1.0611516149942259e-05, + "loss": 1.8501, + "mean_token_accuracy": 0.5889121294021606, + "num_tokens": 4994524431.0, + "step": 9770 + }, + { + "epoch": 2.6422390481341265, + "grad_norm": 1.312891960144043, + "learning_rate": 1.0609941136052714e-05, + "loss": 1.9555, + "mean_token_accuracy": 0.552041232585907, + "num_tokens": 4995047543.0, + "step": 9771 + }, + { + "epoch": 2.642509464575446, + "grad_norm": 1.3427895307540894, + "learning_rate": 1.060836613413133e-05, + "loss": 1.9512, + "mean_token_accuracy": 0.5678144097328186, + "num_tokens": 4995571730.0, + "step": 9772 + }, + { + "epoch": 2.6427798810167658, + "grad_norm": 1.2148131132125854, + "learning_rate": 1.0606791144226421e-05, + "loss": 1.9267, + "mean_token_accuracy": 0.5637021064758301, + "num_tokens": 4996057409.0, + "step": 9773 + }, + { + "epoch": 2.6430502974580854, + "grad_norm": 1.3664494752883911, + "learning_rate": 1.0605216166386314e-05, + "loss": 2.0496, + "mean_token_accuracy": 0.5622762441635132, + "num_tokens": 4996534552.0, + "step": 9774 + }, + { + "epoch": 2.643320713899405, + "grad_norm": 1.3351022005081177, + "learning_rate": 1.0603641200659339e-05, + "loss": 1.909, + "mean_token_accuracy": 0.5388076305389404, + "num_tokens": 4997058759.0, + "step": 9775 + }, + { + "epoch": 2.6435911303407247, + "grad_norm": 1.2675566673278809, + "learning_rate": 1.0602066247093817e-05, + "loss": 1.7938, + "mean_token_accuracy": 0.6022172570228577, + "num_tokens": 4997582819.0, + "step": 9776 + }, + { + "epoch": 2.6438615467820443, + "grad_norm": 1.3313504457473755, + "learning_rate": 1.0600491305738067e-05, + "loss": 1.8814, + "mean_token_accuracy": 0.5839208364486694, + "num_tokens": 4998106944.0, + "step": 9777 + }, + { + "epoch": 2.644131963223364, + "grad_norm": 1.0396791696548462, + "learning_rate": 1.0598916376640427e-05, + "loss": 1.7763, + "mean_token_accuracy": 0.5873785614967346, + "num_tokens": 4998631210.0, + "step": 9778 + }, + { + "epoch": 2.6444023796646836, + "grad_norm": 1.3386048078536987, + "learning_rate": 1.0597341459849211e-05, + "loss": 1.7287, + "mean_token_accuracy": 0.588092029094696, + "num_tokens": 4999155482.0, + "step": 9779 + }, + { + "epoch": 2.6446727961060033, + "grad_norm": 1.079054594039917, + "learning_rate": 1.0595766555412739e-05, + "loss": 1.8626, + "mean_token_accuracy": 0.5698314905166626, + "num_tokens": 4999679763.0, + "step": 9780 + }, + { + "epoch": 2.644943212547323, + "grad_norm": 0.4959304630756378, + "learning_rate": 1.0594191663379341e-05, + "loss": 1.1223, + "mean_token_accuracy": 0.6924326419830322, + "num_tokens": 5000185744.0, + "step": 9781 + }, + { + "epoch": 2.6452136289886425, + "grad_norm": 1.9098433256149292, + "learning_rate": 1.0592616783797336e-05, + "loss": 1.9246, + "mean_token_accuracy": 0.567440390586853, + "num_tokens": 5000658045.0, + "step": 9782 + }, + { + "epoch": 2.645484045429962, + "grad_norm": 1.5216164588928223, + "learning_rate": 1.0591041916715044e-05, + "loss": 1.8987, + "mean_token_accuracy": 0.5509682893753052, + "num_tokens": 5001182303.0, + "step": 9783 + }, + { + "epoch": 2.645754461871282, + "grad_norm": 1.1475194692611694, + "learning_rate": 1.058946706218079e-05, + "loss": 1.9546, + "mean_token_accuracy": 0.5623446702957153, + "num_tokens": 5001706530.0, + "step": 9784 + }, + { + "epoch": 2.6460248783126015, + "grad_norm": 1.3430161476135254, + "learning_rate": 1.0587892220242895e-05, + "loss": 1.9146, + "mean_token_accuracy": 0.555885910987854, + "num_tokens": 5002198367.0, + "step": 9785 + }, + { + "epoch": 2.646295294753921, + "grad_norm": 1.30512535572052, + "learning_rate": 1.0586317390949673e-05, + "loss": 1.9016, + "mean_token_accuracy": 0.5522654056549072, + "num_tokens": 5002722418.0, + "step": 9786 + }, + { + "epoch": 2.6465657111952408, + "grad_norm": 1.268375277519226, + "learning_rate": 1.0584742574349459e-05, + "loss": 1.9346, + "mean_token_accuracy": 0.564171314239502, + "num_tokens": 5003246579.0, + "step": 9787 + }, + { + "epoch": 2.6468361276365604, + "grad_norm": 1.0453060865402222, + "learning_rate": 1.058316777049056e-05, + "loss": 1.8488, + "mean_token_accuracy": 0.5861412286758423, + "num_tokens": 5003732142.0, + "step": 9788 + }, + { + "epoch": 2.64710654407788, + "grad_norm": 1.2558447122573853, + "learning_rate": 1.0581592979421295e-05, + "loss": 1.8111, + "mean_token_accuracy": 0.5873863697052002, + "num_tokens": 5004206430.0, + "step": 9789 + }, + { + "epoch": 2.6473769605191997, + "grad_norm": 1.3199336528778076, + "learning_rate": 1.0580018201189993e-05, + "loss": 1.8605, + "mean_token_accuracy": 0.5773260593414307, + "num_tokens": 5004730696.0, + "step": 9790 + }, + { + "epoch": 2.6476473769605193, + "grad_norm": 1.1841574907302856, + "learning_rate": 1.0578443435844965e-05, + "loss": 1.8402, + "mean_token_accuracy": 0.5801749229431152, + "num_tokens": 5005254976.0, + "step": 9791 + }, + { + "epoch": 2.647917793401839, + "grad_norm": 1.1524765491485596, + "learning_rate": 1.0576868683434533e-05, + "loss": 1.8805, + "mean_token_accuracy": 0.5588170289993286, + "num_tokens": 5005779192.0, + "step": 9792 + }, + { + "epoch": 2.6481882098431586, + "grad_norm": 1.1167577505111694, + "learning_rate": 1.0575293944007016e-05, + "loss": 1.8185, + "mean_token_accuracy": 0.5834600925445557, + "num_tokens": 5006241688.0, + "step": 9793 + }, + { + "epoch": 2.6484586262844783, + "grad_norm": 1.0691115856170654, + "learning_rate": 1.0573719217610727e-05, + "loss": 1.9138, + "mean_token_accuracy": 0.5660783052444458, + "num_tokens": 5006765804.0, + "step": 9794 + }, + { + "epoch": 2.6487290427257975, + "grad_norm": 1.203837513923645, + "learning_rate": 1.0572144504293988e-05, + "loss": 1.9242, + "mean_token_accuracy": 0.5745995044708252, + "num_tokens": 5007290040.0, + "step": 9795 + }, + { + "epoch": 2.6489994591671175, + "grad_norm": 1.1946672201156616, + "learning_rate": 1.0570569804105114e-05, + "loss": 2.101, + "mean_token_accuracy": 0.5386545658111572, + "num_tokens": 5007749923.0, + "step": 9796 + }, + { + "epoch": 2.6492698756084367, + "grad_norm": 1.1563875675201416, + "learning_rate": 1.056899511709242e-05, + "loss": 1.9013, + "mean_token_accuracy": 0.5679299235343933, + "num_tokens": 5008274111.0, + "step": 9797 + }, + { + "epoch": 2.649540292049757, + "grad_norm": 1.2681095600128174, + "learning_rate": 1.0567420443304223e-05, + "loss": 1.7281, + "mean_token_accuracy": 0.609009861946106, + "num_tokens": 5008787096.0, + "step": 9798 + }, + { + "epoch": 2.649810708491076, + "grad_norm": 1.298818826675415, + "learning_rate": 1.0565845782788842e-05, + "loss": 1.9974, + "mean_token_accuracy": 0.5509270429611206, + "num_tokens": 5009311318.0, + "step": 9799 + }, + { + "epoch": 2.650081124932396, + "grad_norm": 1.1912308931350708, + "learning_rate": 1.0564271135594583e-05, + "loss": 1.8953, + "mean_token_accuracy": 0.5747401714324951, + "num_tokens": 5009835562.0, + "step": 9800 + }, + { + "epoch": 2.6503515413737153, + "grad_norm": 0.5001893639564514, + "learning_rate": 1.0562696501769774e-05, + "loss": 1.0639, + "mean_token_accuracy": 0.715907633304596, + "num_tokens": 5010325160.0, + "step": 9801 + }, + { + "epoch": 2.6506219578150354, + "grad_norm": 1.3671530485153198, + "learning_rate": 1.0561121881362722e-05, + "loss": 1.8146, + "mean_token_accuracy": 0.5662128925323486, + "num_tokens": 5010813642.0, + "step": 9802 + }, + { + "epoch": 2.6508923742563546, + "grad_norm": 1.4392060041427612, + "learning_rate": 1.0559547274421736e-05, + "loss": 1.8201, + "mean_token_accuracy": 0.5857649445533752, + "num_tokens": 5011314377.0, + "step": 9803 + }, + { + "epoch": 2.6511627906976747, + "grad_norm": 1.0975804328918457, + "learning_rate": 1.0557972680995136e-05, + "loss": 1.8437, + "mean_token_accuracy": 0.5633723139762878, + "num_tokens": 5011838587.0, + "step": 9804 + }, + { + "epoch": 2.651433207138994, + "grad_norm": 1.14814293384552, + "learning_rate": 1.0556398101131238e-05, + "loss": 1.7752, + "mean_token_accuracy": 0.5841934680938721, + "num_tokens": 5012309945.0, + "step": 9805 + }, + { + "epoch": 2.6517036235803135, + "grad_norm": 1.3045347929000854, + "learning_rate": 1.0554823534878347e-05, + "loss": 1.778, + "mean_token_accuracy": 0.5736488103866577, + "num_tokens": 5012833968.0, + "step": 9806 + }, + { + "epoch": 2.651974040021633, + "grad_norm": 1.2512482404708862, + "learning_rate": 1.0553248982284784e-05, + "loss": 1.8266, + "mean_token_accuracy": 0.5722533464431763, + "num_tokens": 5013358110.0, + "step": 9807 + }, + { + "epoch": 2.652244456462953, + "grad_norm": 1.0472540855407715, + "learning_rate": 1.0551674443398852e-05, + "loss": 1.8923, + "mean_token_accuracy": 0.5726659893989563, + "num_tokens": 5013882389.0, + "step": 9808 + }, + { + "epoch": 2.6525148729042725, + "grad_norm": 1.2829313278198242, + "learning_rate": 1.0550099918268863e-05, + "loss": 2.0189, + "mean_token_accuracy": 0.5440427660942078, + "num_tokens": 5014351580.0, + "step": 9809 + }, + { + "epoch": 2.652785289345592, + "grad_norm": 1.3225938081741333, + "learning_rate": 1.054852540694314e-05, + "loss": 1.8831, + "mean_token_accuracy": 0.5610594749450684, + "num_tokens": 5014875775.0, + "step": 9810 + }, + { + "epoch": 2.6530557057869117, + "grad_norm": 1.483511209487915, + "learning_rate": 1.054695090946998e-05, + "loss": 1.805, + "mean_token_accuracy": 0.5787486433982849, + "num_tokens": 5015400034.0, + "step": 9811 + }, + { + "epoch": 2.6533261222282314, + "grad_norm": 1.3154488801956177, + "learning_rate": 1.0545376425897695e-05, + "loss": 1.9291, + "mean_token_accuracy": 0.5558417439460754, + "num_tokens": 5015924261.0, + "step": 9812 + }, + { + "epoch": 2.653596538669551, + "grad_norm": 0.9507439136505127, + "learning_rate": 1.0543801956274607e-05, + "loss": 1.8261, + "mean_token_accuracy": 0.5743685364723206, + "num_tokens": 5016407666.0, + "step": 9813 + }, + { + "epoch": 2.6538669551108707, + "grad_norm": 1.1257387399673462, + "learning_rate": 1.054222750064901e-05, + "loss": 1.8634, + "mean_token_accuracy": 0.5641053915023804, + "num_tokens": 5016931938.0, + "step": 9814 + }, + { + "epoch": 2.6541373715521903, + "grad_norm": 1.178094506263733, + "learning_rate": 1.0540653059069222e-05, + "loss": 1.9182, + "mean_token_accuracy": 0.5728946328163147, + "num_tokens": 5017425266.0, + "step": 9815 + }, + { + "epoch": 2.65440778799351, + "grad_norm": 1.1934419870376587, + "learning_rate": 1.053907863158355e-05, + "loss": 1.8769, + "mean_token_accuracy": 0.5619966983795166, + "num_tokens": 5017949515.0, + "step": 9816 + }, + { + "epoch": 2.6546782044348296, + "grad_norm": 1.0801358222961426, + "learning_rate": 1.0537504218240298e-05, + "loss": 1.889, + "mean_token_accuracy": 0.5628395080566406, + "num_tokens": 5018415366.0, + "step": 9817 + }, + { + "epoch": 2.6549486208761492, + "grad_norm": 1.3251514434814453, + "learning_rate": 1.053592981908778e-05, + "loss": 1.8147, + "mean_token_accuracy": 0.5816154479980469, + "num_tokens": 5018939605.0, + "step": 9818 + }, + { + "epoch": 2.655219037317469, + "grad_norm": 1.3055405616760254, + "learning_rate": 1.0534355434174297e-05, + "loss": 1.8464, + "mean_token_accuracy": 0.5630907416343689, + "num_tokens": 5019463849.0, + "step": 9819 + }, + { + "epoch": 2.6554894537587885, + "grad_norm": 0.975969672203064, + "learning_rate": 1.0532781063548159e-05, + "loss": 1.7791, + "mean_token_accuracy": 0.5924133062362671, + "num_tokens": 5019987978.0, + "step": 9820 + }, + { + "epoch": 2.655759870200108, + "grad_norm": 0.4103914201259613, + "learning_rate": 1.0531206707257672e-05, + "loss": 1.2545, + "mean_token_accuracy": 0.6632499694824219, + "num_tokens": 5020512082.0, + "step": 9821 + }, + { + "epoch": 2.656030286641428, + "grad_norm": 1.8122442960739136, + "learning_rate": 1.0529632365351145e-05, + "loss": 1.6968, + "mean_token_accuracy": 0.6040352582931519, + "num_tokens": 5020988691.0, + "step": 9822 + }, + { + "epoch": 2.6563007030827475, + "grad_norm": 1.666108250617981, + "learning_rate": 1.0528058037876876e-05, + "loss": 1.8328, + "mean_token_accuracy": 0.5879213809967041, + "num_tokens": 5021512979.0, + "step": 9823 + }, + { + "epoch": 2.656571119524067, + "grad_norm": 1.2079869508743286, + "learning_rate": 1.0526483724883178e-05, + "loss": 1.8596, + "mean_token_accuracy": 0.551702618598938, + "num_tokens": 5022037140.0, + "step": 9824 + }, + { + "epoch": 2.6568415359653867, + "grad_norm": 1.0220073461532593, + "learning_rate": 1.052490942641835e-05, + "loss": 1.828, + "mean_token_accuracy": 0.5794803500175476, + "num_tokens": 5022529202.0, + "step": 9825 + }, + { + "epoch": 2.6571119524067064, + "grad_norm": 1.3976191282272339, + "learning_rate": 1.0523335142530697e-05, + "loss": 1.9475, + "mean_token_accuracy": 0.5738978981971741, + "num_tokens": 5023053471.0, + "step": 9826 + }, + { + "epoch": 2.657382368848026, + "grad_norm": 1.130576729774475, + "learning_rate": 1.0521760873268524e-05, + "loss": 1.9685, + "mean_token_accuracy": 0.5512464046478271, + "num_tokens": 5023567270.0, + "step": 9827 + }, + { + "epoch": 2.6576527852893457, + "grad_norm": 0.945201575756073, + "learning_rate": 1.0520186618680137e-05, + "loss": 1.8262, + "mean_token_accuracy": 0.5761910676956177, + "num_tokens": 5024091473.0, + "step": 9828 + }, + { + "epoch": 2.6579232017306653, + "grad_norm": 0.8518456220626831, + "learning_rate": 1.0518612378813833e-05, + "loss": 1.8989, + "mean_token_accuracy": 0.5556429624557495, + "num_tokens": 5024615735.0, + "step": 9829 + }, + { + "epoch": 2.658193618171985, + "grad_norm": 1.0216134786605835, + "learning_rate": 1.0517038153717918e-05, + "loss": 1.7932, + "mean_token_accuracy": 0.5796718597412109, + "num_tokens": 5025139988.0, + "step": 9830 + }, + { + "epoch": 2.6584640346133046, + "grad_norm": 0.9930573105812073, + "learning_rate": 1.051546394344069e-05, + "loss": 1.8043, + "mean_token_accuracy": 0.581072211265564, + "num_tokens": 5025664261.0, + "step": 9831 + }, + { + "epoch": 2.6587344510546242, + "grad_norm": 0.9698492884635925, + "learning_rate": 1.0513889748030455e-05, + "loss": 1.7754, + "mean_token_accuracy": 0.5759042501449585, + "num_tokens": 5026188427.0, + "step": 9832 + }, + { + "epoch": 2.659004867495944, + "grad_norm": 0.9920869469642639, + "learning_rate": 1.051231556753551e-05, + "loss": 1.8535, + "mean_token_accuracy": 0.5698312520980835, + "num_tokens": 5026674923.0, + "step": 9833 + }, + { + "epoch": 2.6592752839372635, + "grad_norm": 1.1992466449737549, + "learning_rate": 1.0510741402004162e-05, + "loss": 1.9022, + "mean_token_accuracy": 0.5652495622634888, + "num_tokens": 5027199128.0, + "step": 9834 + }, + { + "epoch": 2.659545700378583, + "grad_norm": 1.1595486402511597, + "learning_rate": 1.0509167251484703e-05, + "loss": 1.879, + "mean_token_accuracy": 0.5620264410972595, + "num_tokens": 5027670013.0, + "step": 9835 + }, + { + "epoch": 2.6598161168199024, + "grad_norm": 1.1779028177261353, + "learning_rate": 1.0507593116025438e-05, + "loss": 1.9188, + "mean_token_accuracy": 0.5645567178726196, + "num_tokens": 5028194286.0, + "step": 9836 + }, + { + "epoch": 2.6600865332612225, + "grad_norm": 1.184874415397644, + "learning_rate": 1.0506018995674663e-05, + "loss": 1.8636, + "mean_token_accuracy": 0.5500723123550415, + "num_tokens": 5028718551.0, + "step": 9837 + }, + { + "epoch": 2.6603569497025417, + "grad_norm": 1.1715667247772217, + "learning_rate": 1.0504444890480678e-05, + "loss": 1.8716, + "mean_token_accuracy": 0.5471310019493103, + "num_tokens": 5029242651.0, + "step": 9838 + }, + { + "epoch": 2.6606273661438617, + "grad_norm": 0.9028854370117188, + "learning_rate": 1.0502870800491782e-05, + "loss": 1.8218, + "mean_token_accuracy": 0.573593258857727, + "num_tokens": 5029766932.0, + "step": 9839 + }, + { + "epoch": 2.660897782585181, + "grad_norm": 1.1378214359283447, + "learning_rate": 1.0501296725756272e-05, + "loss": 1.9605, + "mean_token_accuracy": 0.560663640499115, + "num_tokens": 5030231011.0, + "step": 9840 + }, + { + "epoch": 2.661168199026501, + "grad_norm": 0.3830331861972809, + "learning_rate": 1.0499722666322445e-05, + "loss": 1.1331, + "mean_token_accuracy": 0.7028599977493286, + "num_tokens": 5030755296.0, + "step": 9841 + }, + { + "epoch": 2.6614386154678202, + "grad_norm": 1.7346322536468506, + "learning_rate": 1.0498148622238597e-05, + "loss": 1.8336, + "mean_token_accuracy": 0.5507678985595703, + "num_tokens": 5031219225.0, + "step": 9842 + }, + { + "epoch": 2.6617090319091403, + "grad_norm": 1.426456332206726, + "learning_rate": 1.0496574593553024e-05, + "loss": 1.8005, + "mean_token_accuracy": 0.5801087617874146, + "num_tokens": 5031743496.0, + "step": 9843 + }, + { + "epoch": 2.6619794483504595, + "grad_norm": 1.039109706878662, + "learning_rate": 1.0495000580314026e-05, + "loss": 1.9361, + "mean_token_accuracy": 0.5688611268997192, + "num_tokens": 5032219519.0, + "step": 9844 + }, + { + "epoch": 2.6622498647917796, + "grad_norm": 1.211391806602478, + "learning_rate": 1.0493426582569894e-05, + "loss": 1.7715, + "mean_token_accuracy": 0.5982983708381653, + "num_tokens": 5032743707.0, + "step": 9845 + }, + { + "epoch": 2.662520281233099, + "grad_norm": 1.1637682914733887, + "learning_rate": 1.0491852600368924e-05, + "loss": 1.9167, + "mean_token_accuracy": 0.5638604760169983, + "num_tokens": 5033267986.0, + "step": 9846 + }, + { + "epoch": 2.6627906976744184, + "grad_norm": 1.1994580030441284, + "learning_rate": 1.0490278633759407e-05, + "loss": 1.7868, + "mean_token_accuracy": 0.5821554660797119, + "num_tokens": 5033792145.0, + "step": 9847 + }, + { + "epoch": 2.663061114115738, + "grad_norm": 1.3014918565750122, + "learning_rate": 1.0488704682789648e-05, + "loss": 1.792, + "mean_token_accuracy": 0.5762871503829956, + "num_tokens": 5034278724.0, + "step": 9848 + }, + { + "epoch": 2.6633315305570577, + "grad_norm": 1.1964223384857178, + "learning_rate": 1.0487130747507925e-05, + "loss": 1.8502, + "mean_token_accuracy": 0.5644959211349487, + "num_tokens": 5034802950.0, + "step": 9849 + }, + { + "epoch": 2.6636019469983774, + "grad_norm": 1.2177459001541138, + "learning_rate": 1.0485556827962544e-05, + "loss": 1.7896, + "mean_token_accuracy": 0.5847916603088379, + "num_tokens": 5035327080.0, + "step": 9850 + }, + { + "epoch": 2.663872363439697, + "grad_norm": 1.165894865989685, + "learning_rate": 1.0483982924201795e-05, + "loss": 1.7756, + "mean_token_accuracy": 0.5831011533737183, + "num_tokens": 5035851345.0, + "step": 9851 + }, + { + "epoch": 2.6641427798810167, + "grad_norm": 1.0710248947143555, + "learning_rate": 1.048240903627396e-05, + "loss": 1.8414, + "mean_token_accuracy": 0.5702728033065796, + "num_tokens": 5036375488.0, + "step": 9852 + }, + { + "epoch": 2.6644131963223363, + "grad_norm": 1.4337027072906494, + "learning_rate": 1.0480835164227343e-05, + "loss": 1.8855, + "mean_token_accuracy": 0.5696229934692383, + "num_tokens": 5036898558.0, + "step": 9853 + }, + { + "epoch": 2.664683612763656, + "grad_norm": 1.0156095027923584, + "learning_rate": 1.0479261308110228e-05, + "loss": 1.8073, + "mean_token_accuracy": 0.5666460990905762, + "num_tokens": 5037422796.0, + "step": 9854 + }, + { + "epoch": 2.6649540292049756, + "grad_norm": 1.0167261362075806, + "learning_rate": 1.0477687467970905e-05, + "loss": 1.89, + "mean_token_accuracy": 0.5682646036148071, + "num_tokens": 5037896053.0, + "step": 9855 + }, + { + "epoch": 2.6652244456462952, + "grad_norm": 1.2269777059555054, + "learning_rate": 1.0476113643857673e-05, + "loss": 1.9045, + "mean_token_accuracy": 0.5624329447746277, + "num_tokens": 5038375797.0, + "step": 9856 + }, + { + "epoch": 2.665494862087615, + "grad_norm": 0.941200315952301, + "learning_rate": 1.0474539835818812e-05, + "loss": 1.8418, + "mean_token_accuracy": 0.5809526443481445, + "num_tokens": 5038899990.0, + "step": 9857 + }, + { + "epoch": 2.6657652785289345, + "grad_norm": 1.008456826210022, + "learning_rate": 1.047296604390261e-05, + "loss": 1.863, + "mean_token_accuracy": 0.5610605478286743, + "num_tokens": 5039424197.0, + "step": 9858 + }, + { + "epoch": 2.666035694970254, + "grad_norm": 1.2779419422149658, + "learning_rate": 1.0471392268157366e-05, + "loss": 1.8509, + "mean_token_accuracy": 0.5713942646980286, + "num_tokens": 5039948425.0, + "step": 9859 + }, + { + "epoch": 2.666306111411574, + "grad_norm": 1.0646885633468628, + "learning_rate": 1.0469818508631357e-05, + "loss": 1.8639, + "mean_token_accuracy": 0.5804362297058105, + "num_tokens": 5040419234.0, + "step": 9860 + }, + { + "epoch": 2.6665765278528935, + "grad_norm": 0.5375557541847229, + "learning_rate": 1.0468244765372877e-05, + "loss": 1.1144, + "mean_token_accuracy": 0.6951643228530884, + "num_tokens": 5040943422.0, + "step": 9861 + }, + { + "epoch": 2.666846944294213, + "grad_norm": 1.3502147197723389, + "learning_rate": 1.0466671038430212e-05, + "loss": 1.858, + "mean_token_accuracy": 0.5771466493606567, + "num_tokens": 5041467586.0, + "step": 9862 + }, + { + "epoch": 2.6671173607355327, + "grad_norm": 1.567762017250061, + "learning_rate": 1.0465097327851645e-05, + "loss": 1.8846, + "mean_token_accuracy": 0.5683906078338623, + "num_tokens": 5041981428.0, + "step": 9863 + }, + { + "epoch": 2.6673877771768524, + "grad_norm": 1.1421397924423218, + "learning_rate": 1.046352363368547e-05, + "loss": 1.8648, + "mean_token_accuracy": 0.5746256709098816, + "num_tokens": 5042505578.0, + "step": 9864 + }, + { + "epoch": 2.667658193618172, + "grad_norm": 1.056756615638733, + "learning_rate": 1.0461949955979966e-05, + "loss": 1.8487, + "mean_token_accuracy": 0.5816202759742737, + "num_tokens": 5043029820.0, + "step": 9865 + }, + { + "epoch": 2.6679286100594917, + "grad_norm": 1.8396730422973633, + "learning_rate": 1.0460376294783419e-05, + "loss": 1.9463, + "mean_token_accuracy": 0.5354929566383362, + "num_tokens": 5043554055.0, + "step": 9866 + }, + { + "epoch": 2.6681990265008113, + "grad_norm": 1.3472915887832642, + "learning_rate": 1.0458802650144116e-05, + "loss": 1.9624, + "mean_token_accuracy": 0.5431498289108276, + "num_tokens": 5044078283.0, + "step": 9867 + }, + { + "epoch": 2.668469442942131, + "grad_norm": 1.1579757928848267, + "learning_rate": 1.0457229022110336e-05, + "loss": 1.9191, + "mean_token_accuracy": 0.5848429203033447, + "num_tokens": 5044602506.0, + "step": 9868 + }, + { + "epoch": 2.6687398593834506, + "grad_norm": 1.3098145723342896, + "learning_rate": 1.0455655410730363e-05, + "loss": 1.8211, + "mean_token_accuracy": 0.5657828450202942, + "num_tokens": 5045126480.0, + "step": 9869 + }, + { + "epoch": 2.6690102758247702, + "grad_norm": 1.2674957513809204, + "learning_rate": 1.045408181605249e-05, + "loss": 1.9281, + "mean_token_accuracy": 0.5530961751937866, + "num_tokens": 5045650700.0, + "step": 9870 + }, + { + "epoch": 2.66928069226609, + "grad_norm": 1.033320665359497, + "learning_rate": 1.045250823812499e-05, + "loss": 1.8254, + "mean_token_accuracy": 0.5764367580413818, + "num_tokens": 5046169335.0, + "step": 9871 + }, + { + "epoch": 2.6695511087074095, + "grad_norm": 1.0867369174957275, + "learning_rate": 1.0450934676996145e-05, + "loss": 1.946, + "mean_token_accuracy": 0.5458934307098389, + "num_tokens": 5046690314.0, + "step": 9872 + }, + { + "epoch": 2.669821525148729, + "grad_norm": 1.3287851810455322, + "learning_rate": 1.0449361132714242e-05, + "loss": 1.9558, + "mean_token_accuracy": 0.5520032048225403, + "num_tokens": 5047214496.0, + "step": 9873 + }, + { + "epoch": 2.670091941590049, + "grad_norm": 1.30194890499115, + "learning_rate": 1.0447787605327557e-05, + "loss": 1.7613, + "mean_token_accuracy": 0.6000158786773682, + "num_tokens": 5047698041.0, + "step": 9874 + }, + { + "epoch": 2.6703623580313685, + "grad_norm": 0.9907786250114441, + "learning_rate": 1.0446214094884373e-05, + "loss": 1.8593, + "mean_token_accuracy": 0.565760612487793, + "num_tokens": 5048154055.0, + "step": 9875 + }, + { + "epoch": 2.670632774472688, + "grad_norm": 1.243462324142456, + "learning_rate": 1.0444640601432967e-05, + "loss": 1.9345, + "mean_token_accuracy": 0.566024899482727, + "num_tokens": 5048678244.0, + "step": 9876 + }, + { + "epoch": 2.6709031909140073, + "grad_norm": 1.2973235845565796, + "learning_rate": 1.0443067125021624e-05, + "loss": 1.927, + "mean_token_accuracy": 0.5472039580345154, + "num_tokens": 5049202425.0, + "step": 9877 + }, + { + "epoch": 2.6711736073553274, + "grad_norm": 1.0536298751831055, + "learning_rate": 1.0441493665698615e-05, + "loss": 1.8944, + "mean_token_accuracy": 0.5730611681938171, + "num_tokens": 5049703017.0, + "step": 9878 + }, + { + "epoch": 2.6714440237966466, + "grad_norm": 1.2814959287643433, + "learning_rate": 1.0439920223512228e-05, + "loss": 1.9892, + "mean_token_accuracy": 0.5621559619903564, + "num_tokens": 5050227162.0, + "step": 9879 + }, + { + "epoch": 2.6717144402379667, + "grad_norm": 1.053260326385498, + "learning_rate": 1.0438346798510735e-05, + "loss": 1.8279, + "mean_token_accuracy": 0.5803682804107666, + "num_tokens": 5050751266.0, + "step": 9880 + }, + { + "epoch": 2.671984856679286, + "grad_norm": 0.47846749424934387, + "learning_rate": 1.0436773390742407e-05, + "loss": 1.1317, + "mean_token_accuracy": 0.6972852945327759, + "num_tokens": 5051275543.0, + "step": 9881 + }, + { + "epoch": 2.672255273120606, + "grad_norm": 1.2103177309036255, + "learning_rate": 1.0435200000255534e-05, + "loss": 1.8684, + "mean_token_accuracy": 0.5668606162071228, + "num_tokens": 5051799650.0, + "step": 9882 + }, + { + "epoch": 2.672525689561925, + "grad_norm": 1.4761570692062378, + "learning_rate": 1.0433626627098383e-05, + "loss": 1.7602, + "mean_token_accuracy": 0.5940913558006287, + "num_tokens": 5052323918.0, + "step": 9883 + }, + { + "epoch": 2.6727961060032452, + "grad_norm": 0.9608462452888489, + "learning_rate": 1.0432053271319235e-05, + "loss": 1.8235, + "mean_token_accuracy": 0.5741342902183533, + "num_tokens": 5052847934.0, + "step": 9884 + }, + { + "epoch": 2.6730665224445644, + "grad_norm": 1.1287975311279297, + "learning_rate": 1.0430479932966365e-05, + "loss": 1.7814, + "mean_token_accuracy": 0.5789940357208252, + "num_tokens": 5053371895.0, + "step": 9885 + }, + { + "epoch": 2.6733369388858845, + "grad_norm": 1.134865403175354, + "learning_rate": 1.0428906612088038e-05, + "loss": 1.8102, + "mean_token_accuracy": 0.579289436340332, + "num_tokens": 5053896170.0, + "step": 9886 + }, + { + "epoch": 2.6736073553272037, + "grad_norm": 1.0730760097503662, + "learning_rate": 1.0427333308732543e-05, + "loss": 1.9827, + "mean_token_accuracy": 0.5487523674964905, + "num_tokens": 5054420443.0, + "step": 9887 + }, + { + "epoch": 2.6738777717685234, + "grad_norm": 1.230214238166809, + "learning_rate": 1.0425760022948144e-05, + "loss": 1.8824, + "mean_token_accuracy": 0.5754744410514832, + "num_tokens": 5054944654.0, + "step": 9888 + }, + { + "epoch": 2.674148188209843, + "grad_norm": 1.04252290725708, + "learning_rate": 1.0424186754783115e-05, + "loss": 1.7504, + "mean_token_accuracy": 0.5882137417793274, + "num_tokens": 5055468800.0, + "step": 9889 + }, + { + "epoch": 2.6744186046511627, + "grad_norm": 0.940435528755188, + "learning_rate": 1.042261350428573e-05, + "loss": 1.9723, + "mean_token_accuracy": 0.5597022175788879, + "num_tokens": 5055993073.0, + "step": 9890 + }, + { + "epoch": 2.6746890210924823, + "grad_norm": 0.9071158170700073, + "learning_rate": 1.0421040271504261e-05, + "loss": 1.7516, + "mean_token_accuracy": 0.5802041888237, + "num_tokens": 5056517178.0, + "step": 9891 + }, + { + "epoch": 2.674959437533802, + "grad_norm": 1.0355355739593506, + "learning_rate": 1.0419467056486978e-05, + "loss": 1.9381, + "mean_token_accuracy": 0.5620003938674927, + "num_tokens": 5057041401.0, + "step": 9892 + }, + { + "epoch": 2.6752298539751216, + "grad_norm": 0.9667112827301025, + "learning_rate": 1.0417893859282155e-05, + "loss": 1.9349, + "mean_token_accuracy": 0.5440398454666138, + "num_tokens": 5057565645.0, + "step": 9893 + }, + { + "epoch": 2.6755002704164412, + "grad_norm": 1.1133838891983032, + "learning_rate": 1.0416320679938062e-05, + "loss": 1.9118, + "mean_token_accuracy": 0.5676915645599365, + "num_tokens": 5058078566.0, + "step": 9894 + }, + { + "epoch": 2.675770686857761, + "grad_norm": 1.1836175918579102, + "learning_rate": 1.041474751850296e-05, + "loss": 1.9574, + "mean_token_accuracy": 0.5620492100715637, + "num_tokens": 5058602728.0, + "step": 9895 + }, + { + "epoch": 2.6760411032990805, + "grad_norm": 0.9573523998260498, + "learning_rate": 1.0413174375025133e-05, + "loss": 1.827, + "mean_token_accuracy": 0.5823184251785278, + "num_tokens": 5059126996.0, + "step": 9896 + }, + { + "epoch": 2.6763115197404, + "grad_norm": 1.177858591079712, + "learning_rate": 1.0411601249552838e-05, + "loss": 1.843, + "mean_token_accuracy": 0.5541491508483887, + "num_tokens": 5059651145.0, + "step": 9897 + }, + { + "epoch": 2.67658193618172, + "grad_norm": 1.1786224842071533, + "learning_rate": 1.0410028142134345e-05, + "loss": 1.9188, + "mean_token_accuracy": 0.5599819421768188, + "num_tokens": 5060175377.0, + "step": 9898 + }, + { + "epoch": 2.6768523526230394, + "grad_norm": 0.9356268644332886, + "learning_rate": 1.0408455052817928e-05, + "loss": 1.9114, + "mean_token_accuracy": 0.565330982208252, + "num_tokens": 5060699507.0, + "step": 9899 + }, + { + "epoch": 2.677122769064359, + "grad_norm": 1.1401493549346924, + "learning_rate": 1.0406881981651848e-05, + "loss": 1.9145, + "mean_token_accuracy": 0.5399233102798462, + "num_tokens": 5061223782.0, + "step": 9900 + }, + { + "epoch": 2.6773931855056787, + "grad_norm": 0.6042444705963135, + "learning_rate": 1.0405308928684372e-05, + "loss": 1.1739, + "mean_token_accuracy": 0.6834291219711304, + "num_tokens": 5061747987.0, + "step": 9901 + }, + { + "epoch": 2.6776636019469984, + "grad_norm": 1.5164824724197388, + "learning_rate": 1.0403735893963769e-05, + "loss": 1.9982, + "mean_token_accuracy": 0.5295753479003906, + "num_tokens": 5062272264.0, + "step": 9902 + }, + { + "epoch": 2.677934018388318, + "grad_norm": 1.3589863777160645, + "learning_rate": 1.0402162877538301e-05, + "loss": 1.9724, + "mean_token_accuracy": 0.5620335936546326, + "num_tokens": 5062742103.0, + "step": 9903 + }, + { + "epoch": 2.6782044348296377, + "grad_norm": 1.1928361654281616, + "learning_rate": 1.0400589879456232e-05, + "loss": 1.9774, + "mean_token_accuracy": 0.5569790601730347, + "num_tokens": 5063266382.0, + "step": 9904 + }, + { + "epoch": 2.6784748512709573, + "grad_norm": 0.96699458360672, + "learning_rate": 1.0399016899765832e-05, + "loss": 1.7755, + "mean_token_accuracy": 0.5726801156997681, + "num_tokens": 5063790588.0, + "step": 9905 + }, + { + "epoch": 2.678745267712277, + "grad_norm": 1.0838048458099365, + "learning_rate": 1.0397443938515357e-05, + "loss": 1.9203, + "mean_token_accuracy": 0.5542042255401611, + "num_tokens": 5064314861.0, + "step": 9906 + }, + { + "epoch": 2.6790156841535966, + "grad_norm": 1.151263952255249, + "learning_rate": 1.0395870995753077e-05, + "loss": 1.8598, + "mean_token_accuracy": 0.5455397367477417, + "num_tokens": 5064839138.0, + "step": 9907 + }, + { + "epoch": 2.6792861005949162, + "grad_norm": 1.3619656562805176, + "learning_rate": 1.0394298071527253e-05, + "loss": 1.8748, + "mean_token_accuracy": 0.570830225944519, + "num_tokens": 5065342806.0, + "step": 9908 + }, + { + "epoch": 2.679556517036236, + "grad_norm": 1.1564706563949585, + "learning_rate": 1.0392725165886139e-05, + "loss": 1.9547, + "mean_token_accuracy": 0.5484814643859863, + "num_tokens": 5065866985.0, + "step": 9909 + }, + { + "epoch": 2.6798269334775555, + "grad_norm": 1.057357668876648, + "learning_rate": 1.039115227887801e-05, + "loss": 1.889, + "mean_token_accuracy": 0.5634720921516418, + "num_tokens": 5066352638.0, + "step": 9910 + }, + { + "epoch": 2.680097349918875, + "grad_norm": 1.2035584449768066, + "learning_rate": 1.0389579410551115e-05, + "loss": 1.8831, + "mean_token_accuracy": 0.5404156446456909, + "num_tokens": 5066876794.0, + "step": 9911 + }, + { + "epoch": 2.680367766360195, + "grad_norm": 1.1786839962005615, + "learning_rate": 1.0388006560953718e-05, + "loss": 1.8762, + "mean_token_accuracy": 0.589790403842926, + "num_tokens": 5067334965.0, + "step": 9912 + }, + { + "epoch": 2.6806381828015144, + "grad_norm": 0.9283795356750488, + "learning_rate": 1.0386433730134084e-05, + "loss": 1.8587, + "mean_token_accuracy": 0.5658718347549438, + "num_tokens": 5067859104.0, + "step": 9913 + }, + { + "epoch": 2.680908599242834, + "grad_norm": 0.9711439609527588, + "learning_rate": 1.0384860918140467e-05, + "loss": 1.9756, + "mean_token_accuracy": 0.543937087059021, + "num_tokens": 5068383268.0, + "step": 9914 + }, + { + "epoch": 2.6811790156841537, + "grad_norm": 1.3183664083480835, + "learning_rate": 1.0383288125021123e-05, + "loss": 1.8066, + "mean_token_accuracy": 0.6060830354690552, + "num_tokens": 5068875994.0, + "step": 9915 + }, + { + "epoch": 2.6814494321254734, + "grad_norm": 1.2442034482955933, + "learning_rate": 1.0381715350824315e-05, + "loss": 1.8942, + "mean_token_accuracy": 0.5754005908966064, + "num_tokens": 5069336360.0, + "step": 9916 + }, + { + "epoch": 2.681719848566793, + "grad_norm": 1.2366241216659546, + "learning_rate": 1.0380142595598297e-05, + "loss": 1.9235, + "mean_token_accuracy": 0.552987813949585, + "num_tokens": 5069860477.0, + "step": 9917 + }, + { + "epoch": 2.681990265008112, + "grad_norm": 1.0848647356033325, + "learning_rate": 1.0378569859391327e-05, + "loss": 1.8309, + "mean_token_accuracy": 0.573206901550293, + "num_tokens": 5070319497.0, + "step": 9918 + }, + { + "epoch": 2.6822606814494323, + "grad_norm": 1.3011248111724854, + "learning_rate": 1.0376997142251661e-05, + "loss": 1.8638, + "mean_token_accuracy": 0.5751950740814209, + "num_tokens": 5070843547.0, + "step": 9919 + }, + { + "epoch": 2.6825310978907515, + "grad_norm": 1.160252571105957, + "learning_rate": 1.0375424444227557e-05, + "loss": 1.9918, + "mean_token_accuracy": 0.541490912437439, + "num_tokens": 5071367744.0, + "step": 9920 + }, + { + "epoch": 2.6828015143320716, + "grad_norm": 0.49966979026794434, + "learning_rate": 1.0373851765367265e-05, + "loss": 1.1327, + "mean_token_accuracy": 0.6936908960342407, + "num_tokens": 5071891988.0, + "step": 9921 + }, + { + "epoch": 2.683071930773391, + "grad_norm": 1.5456856489181519, + "learning_rate": 1.0372279105719046e-05, + "loss": 1.7844, + "mean_token_accuracy": 0.5712618231773376, + "num_tokens": 5072369681.0, + "step": 9922 + }, + { + "epoch": 2.683342347214711, + "grad_norm": 1.302689790725708, + "learning_rate": 1.0370706465331148e-05, + "loss": 1.9155, + "mean_token_accuracy": 0.5636819005012512, + "num_tokens": 5072893812.0, + "step": 9923 + }, + { + "epoch": 2.68361276365603, + "grad_norm": 1.2372307777404785, + "learning_rate": 1.0369133844251824e-05, + "loss": 1.8254, + "mean_token_accuracy": 0.5779334306716919, + "num_tokens": 5073418018.0, + "step": 9924 + }, + { + "epoch": 2.68388318009735, + "grad_norm": 1.3111799955368042, + "learning_rate": 1.036756124252933e-05, + "loss": 1.8194, + "mean_token_accuracy": 0.5796270370483398, + "num_tokens": 5073942236.0, + "step": 9925 + }, + { + "epoch": 2.6841535965386694, + "grad_norm": 1.1815125942230225, + "learning_rate": 1.036598866021192e-05, + "loss": 1.6924, + "mean_token_accuracy": 0.5940110683441162, + "num_tokens": 5074466506.0, + "step": 9926 + }, + { + "epoch": 2.6844240129799894, + "grad_norm": 1.134270191192627, + "learning_rate": 1.0364416097347839e-05, + "loss": 1.9022, + "mean_token_accuracy": 0.5744321346282959, + "num_tokens": 5074990744.0, + "step": 9927 + }, + { + "epoch": 2.6846944294213086, + "grad_norm": 1.0431504249572754, + "learning_rate": 1.0362843553985342e-05, + "loss": 1.8661, + "mean_token_accuracy": 0.5737134218215942, + "num_tokens": 5075514769.0, + "step": 9928 + }, + { + "epoch": 2.6849648458626283, + "grad_norm": 1.0810626745224, + "learning_rate": 1.0361271030172677e-05, + "loss": 1.8783, + "mean_token_accuracy": 0.5694632530212402, + "num_tokens": 5075978693.0, + "step": 9929 + }, + { + "epoch": 2.685235262303948, + "grad_norm": 1.2845749855041504, + "learning_rate": 1.03596985259581e-05, + "loss": 1.7454, + "mean_token_accuracy": 0.6046524047851562, + "num_tokens": 5076502954.0, + "step": 9930 + }, + { + "epoch": 2.6855056787452676, + "grad_norm": 1.0600316524505615, + "learning_rate": 1.0358126041389853e-05, + "loss": 1.933, + "mean_token_accuracy": 0.5576719045639038, + "num_tokens": 5077027155.0, + "step": 9931 + }, + { + "epoch": 2.685776095186587, + "grad_norm": 1.1950803995132446, + "learning_rate": 1.0356553576516184e-05, + "loss": 1.9066, + "mean_token_accuracy": 0.5586825609207153, + "num_tokens": 5077551330.0, + "step": 9932 + }, + { + "epoch": 2.686046511627907, + "grad_norm": 1.2468732595443726, + "learning_rate": 1.0354981131385347e-05, + "loss": 1.8203, + "mean_token_accuracy": 0.5717366337776184, + "num_tokens": 5078075508.0, + "step": 9933 + }, + { + "epoch": 2.6863169280692265, + "grad_norm": 1.2802679538726807, + "learning_rate": 1.0353408706045584e-05, + "loss": 1.8139, + "mean_token_accuracy": 0.5847707390785217, + "num_tokens": 5078599740.0, + "step": 9934 + }, + { + "epoch": 2.686587344510546, + "grad_norm": 1.1884393692016602, + "learning_rate": 1.0351836300545142e-05, + "loss": 1.8545, + "mean_token_accuracy": 0.5755388736724854, + "num_tokens": 5079082099.0, + "step": 9935 + }, + { + "epoch": 2.686857760951866, + "grad_norm": 1.295353651046753, + "learning_rate": 1.0350263914932272e-05, + "loss": 1.914, + "mean_token_accuracy": 0.5543508529663086, + "num_tokens": 5079606275.0, + "step": 9936 + }, + { + "epoch": 2.6871281773931854, + "grad_norm": 1.0922746658325195, + "learning_rate": 1.0348691549255216e-05, + "loss": 1.7125, + "mean_token_accuracy": 0.5964324474334717, + "num_tokens": 5080130367.0, + "step": 9937 + }, + { + "epoch": 2.687398593834505, + "grad_norm": 1.14840567111969, + "learning_rate": 1.0347119203562215e-05, + "loss": 1.4769, + "mean_token_accuracy": 0.6148545742034912, + "num_tokens": 5080599544.0, + "step": 9938 + }, + { + "epoch": 2.6876690102758247, + "grad_norm": 1.2351809740066528, + "learning_rate": 1.0345546877901518e-05, + "loss": 1.9281, + "mean_token_accuracy": 0.5592694282531738, + "num_tokens": 5081121358.0, + "step": 9939 + }, + { + "epoch": 2.6879394267171444, + "grad_norm": 1.1547589302062988, + "learning_rate": 1.0343974572321369e-05, + "loss": 1.7465, + "mean_token_accuracy": 0.5900958776473999, + "num_tokens": 5081645565.0, + "step": 9940 + }, + { + "epoch": 2.688209843158464, + "grad_norm": 0.46624240279197693, + "learning_rate": 1.0342402286870006e-05, + "loss": 1.1509, + "mean_token_accuracy": 0.6922808885574341, + "num_tokens": 5082169844.0, + "step": 9941 + }, + { + "epoch": 2.6884802595997837, + "grad_norm": 1.5583844184875488, + "learning_rate": 1.0340830021595677e-05, + "loss": 1.9283, + "mean_token_accuracy": 0.5660595893859863, + "num_tokens": 5082694119.0, + "step": 9942 + }, + { + "epoch": 2.6887506760411033, + "grad_norm": 1.1489510536193848, + "learning_rate": 1.0339257776546625e-05, + "loss": 1.876, + "mean_token_accuracy": 0.5673689842224121, + "num_tokens": 5083218252.0, + "step": 9943 + }, + { + "epoch": 2.689021092482423, + "grad_norm": 1.0713013410568237, + "learning_rate": 1.0337685551771082e-05, + "loss": 1.8894, + "mean_token_accuracy": 0.5801959037780762, + "num_tokens": 5083742403.0, + "step": 9944 + }, + { + "epoch": 2.6892915089237426, + "grad_norm": 1.3339530229568481, + "learning_rate": 1.0336113347317296e-05, + "loss": 1.6356, + "mean_token_accuracy": 0.6182640790939331, + "num_tokens": 5084266686.0, + "step": 9945 + }, + { + "epoch": 2.6895619253650622, + "grad_norm": 1.1036804914474487, + "learning_rate": 1.0334541163233504e-05, + "loss": 1.7489, + "mean_token_accuracy": 0.5968934297561646, + "num_tokens": 5084728414.0, + "step": 9946 + }, + { + "epoch": 2.689832341806382, + "grad_norm": 1.0070239305496216, + "learning_rate": 1.0332968999567946e-05, + "loss": 1.8233, + "mean_token_accuracy": 0.5846626162528992, + "num_tokens": 5085246776.0, + "step": 9947 + }, + { + "epoch": 2.6901027582477015, + "grad_norm": 1.1385538578033447, + "learning_rate": 1.0331396856368863e-05, + "loss": 1.9141, + "mean_token_accuracy": 0.5548518896102905, + "num_tokens": 5085770961.0, + "step": 9948 + }, + { + "epoch": 2.690373174689021, + "grad_norm": 1.4231036901474, + "learning_rate": 1.0329824733684492e-05, + "loss": 1.8311, + "mean_token_accuracy": 0.5829012393951416, + "num_tokens": 5086295205.0, + "step": 9949 + }, + { + "epoch": 2.690643591130341, + "grad_norm": 1.2018520832061768, + "learning_rate": 1.0328252631563062e-05, + "loss": 1.6568, + "mean_token_accuracy": 0.579647958278656, + "num_tokens": 5086819252.0, + "step": 9950 + }, + { + "epoch": 2.6909140075716604, + "grad_norm": 1.1158499717712402, + "learning_rate": 1.0326680550052824e-05, + "loss": 1.8522, + "mean_token_accuracy": 0.6092450022697449, + "num_tokens": 5087281081.0, + "step": 9951 + }, + { + "epoch": 2.69118442401298, + "grad_norm": 1.1318997144699097, + "learning_rate": 1.0325108489202002e-05, + "loss": 1.9912, + "mean_token_accuracy": 0.5666757822036743, + "num_tokens": 5087743436.0, + "step": 9952 + }, + { + "epoch": 2.6914548404542997, + "grad_norm": 1.2134250402450562, + "learning_rate": 1.0323536449058839e-05, + "loss": 1.8988, + "mean_token_accuracy": 0.5749560594558716, + "num_tokens": 5088252604.0, + "step": 9953 + }, + { + "epoch": 2.6917252568956194, + "grad_norm": 1.1232949495315552, + "learning_rate": 1.0321964429671568e-05, + "loss": 1.9343, + "mean_token_accuracy": 0.5643296241760254, + "num_tokens": 5088734518.0, + "step": 9954 + }, + { + "epoch": 2.691995673336939, + "grad_norm": 1.1043405532836914, + "learning_rate": 1.0320392431088418e-05, + "loss": 1.9201, + "mean_token_accuracy": 0.5716546773910522, + "num_tokens": 5089258778.0, + "step": 9955 + }, + { + "epoch": 2.6922660897782587, + "grad_norm": 1.0240730047225952, + "learning_rate": 1.0318820453357629e-05, + "loss": 1.7534, + "mean_token_accuracy": 0.5830572843551636, + "num_tokens": 5089782870.0, + "step": 9956 + }, + { + "epoch": 2.6925365062195783, + "grad_norm": 1.1229665279388428, + "learning_rate": 1.0317248496527434e-05, + "loss": 1.7902, + "mean_token_accuracy": 0.5845535397529602, + "num_tokens": 5090219628.0, + "step": 9957 + }, + { + "epoch": 2.692806922660898, + "grad_norm": 1.2558516263961792, + "learning_rate": 1.0315676560646059e-05, + "loss": 1.8652, + "mean_token_accuracy": 0.571579098701477, + "num_tokens": 5090743768.0, + "step": 9958 + }, + { + "epoch": 2.693077339102217, + "grad_norm": 1.2214025259017944, + "learning_rate": 1.0314104645761745e-05, + "loss": 1.913, + "mean_token_accuracy": 0.5713230967521667, + "num_tokens": 5091267830.0, + "step": 9959 + }, + { + "epoch": 2.6933477555435372, + "grad_norm": 1.0523691177368164, + "learning_rate": 1.0312532751922712e-05, + "loss": 1.9101, + "mean_token_accuracy": 0.5668450593948364, + "num_tokens": 5091792070.0, + "step": 9960 + }, + { + "epoch": 2.6936181719848564, + "grad_norm": 0.5571262836456299, + "learning_rate": 1.03109608791772e-05, + "loss": 1.1064, + "mean_token_accuracy": 0.7078606486320496, + "num_tokens": 5092311645.0, + "step": 9961 + }, + { + "epoch": 2.6938885884261765, + "grad_norm": 1.4523022174835205, + "learning_rate": 1.0309389027573431e-05, + "loss": 1.934, + "mean_token_accuracy": 0.5869927406311035, + "num_tokens": 5092835865.0, + "step": 9962 + }, + { + "epoch": 2.6941590048674957, + "grad_norm": 1.2340112924575806, + "learning_rate": 1.0307817197159642e-05, + "loss": 1.9479, + "mean_token_accuracy": 0.5568199157714844, + "num_tokens": 5093323553.0, + "step": 9963 + }, + { + "epoch": 2.694429421308816, + "grad_norm": 1.1555943489074707, + "learning_rate": 1.030624538798405e-05, + "loss": 1.7899, + "mean_token_accuracy": 0.5890103578567505, + "num_tokens": 5093847720.0, + "step": 9964 + }, + { + "epoch": 2.694699837750135, + "grad_norm": 1.1183093786239624, + "learning_rate": 1.0304673600094897e-05, + "loss": 1.926, + "mean_token_accuracy": 0.5689011812210083, + "num_tokens": 5094294972.0, + "step": 9965 + }, + { + "epoch": 2.694970254191455, + "grad_norm": 1.068467378616333, + "learning_rate": 1.0303101833540401e-05, + "loss": 1.866, + "mean_token_accuracy": 0.5694833397865295, + "num_tokens": 5094819175.0, + "step": 9966 + }, + { + "epoch": 2.6952406706327743, + "grad_norm": 1.1139785051345825, + "learning_rate": 1.0301530088368786e-05, + "loss": 1.7986, + "mean_token_accuracy": 0.5721573233604431, + "num_tokens": 5095294794.0, + "step": 9967 + }, + { + "epoch": 2.6955110870740944, + "grad_norm": 1.2014784812927246, + "learning_rate": 1.0299958364628285e-05, + "loss": 1.895, + "mean_token_accuracy": 0.5590880513191223, + "num_tokens": 5095819065.0, + "step": 9968 + }, + { + "epoch": 2.6957815035154136, + "grad_norm": 1.1004830598831177, + "learning_rate": 1.0298386662367121e-05, + "loss": 1.8829, + "mean_token_accuracy": 0.5598993301391602, + "num_tokens": 5096343299.0, + "step": 9969 + }, + { + "epoch": 2.696051919956733, + "grad_norm": 1.0746548175811768, + "learning_rate": 1.0296814981633514e-05, + "loss": 1.7626, + "mean_token_accuracy": 0.5931074023246765, + "num_tokens": 5096867451.0, + "step": 9970 + }, + { + "epoch": 2.696322336398053, + "grad_norm": 1.0967775583267212, + "learning_rate": 1.0295243322475694e-05, + "loss": 1.8321, + "mean_token_accuracy": 0.5796405673027039, + "num_tokens": 5097391653.0, + "step": 9971 + }, + { + "epoch": 2.6965927528393725, + "grad_norm": 1.3354912996292114, + "learning_rate": 1.0293671684941883e-05, + "loss": 2.0129, + "mean_token_accuracy": 0.5465496182441711, + "num_tokens": 5097915918.0, + "step": 9972 + }, + { + "epoch": 2.696863169280692, + "grad_norm": 1.3545480966567993, + "learning_rate": 1.0292100069080298e-05, + "loss": 1.9057, + "mean_token_accuracy": 0.5640348196029663, + "num_tokens": 5098440162.0, + "step": 9973 + }, + { + "epoch": 2.697133585722012, + "grad_norm": 1.392392635345459, + "learning_rate": 1.0290528474939167e-05, + "loss": 1.6419, + "mean_token_accuracy": 0.58553147315979, + "num_tokens": 5098964398.0, + "step": 9974 + }, + { + "epoch": 2.6974040021633314, + "grad_norm": 1.1285722255706787, + "learning_rate": 1.028895690256671e-05, + "loss": 1.7873, + "mean_token_accuracy": 0.5842915773391724, + "num_tokens": 5099435832.0, + "step": 9975 + }, + { + "epoch": 2.697674418604651, + "grad_norm": 1.0093863010406494, + "learning_rate": 1.0287385352011141e-05, + "loss": 1.8511, + "mean_token_accuracy": 0.5589046478271484, + "num_tokens": 5099959953.0, + "step": 9976 + }, + { + "epoch": 2.6979448350459707, + "grad_norm": 1.2778284549713135, + "learning_rate": 1.0285813823320691e-05, + "loss": 1.8671, + "mean_token_accuracy": 0.5800691843032837, + "num_tokens": 5100445454.0, + "step": 9977 + }, + { + "epoch": 2.6982152514872904, + "grad_norm": 1.0312083959579468, + "learning_rate": 1.0284242316543568e-05, + "loss": 1.9325, + "mean_token_accuracy": 0.5429452657699585, + "num_tokens": 5100969692.0, + "step": 9978 + }, + { + "epoch": 2.69848566792861, + "grad_norm": 1.1707453727722168, + "learning_rate": 1.0282670831728e-05, + "loss": 1.8568, + "mean_token_accuracy": 0.5927053689956665, + "num_tokens": 5101493808.0, + "step": 9979 + }, + { + "epoch": 2.6987560843699296, + "grad_norm": 1.08695387840271, + "learning_rate": 1.0281099368922203e-05, + "loss": 1.8447, + "mean_token_accuracy": 0.5662615299224854, + "num_tokens": 5101959456.0, + "step": 9980 + }, + { + "epoch": 2.6990265008112493, + "grad_norm": 0.48299530148506165, + "learning_rate": 1.0279527928174385e-05, + "loss": 1.1687, + "mean_token_accuracy": 0.6934717893600464, + "num_tokens": 5102483711.0, + "step": 9981 + }, + { + "epoch": 2.699296917252569, + "grad_norm": 1.5686408281326294, + "learning_rate": 1.0277956509532771e-05, + "loss": 1.8117, + "mean_token_accuracy": 0.58132004737854, + "num_tokens": 5103007842.0, + "step": 9982 + }, + { + "epoch": 2.6995673336938886, + "grad_norm": 1.1743659973144531, + "learning_rate": 1.0276385113045577e-05, + "loss": 1.861, + "mean_token_accuracy": 0.5665982365608215, + "num_tokens": 5103531999.0, + "step": 9983 + }, + { + "epoch": 2.699837750135208, + "grad_norm": 1.0528645515441895, + "learning_rate": 1.027481373876101e-05, + "loss": 1.8441, + "mean_token_accuracy": 0.5708990097045898, + "num_tokens": 5104056218.0, + "step": 9984 + }, + { + "epoch": 2.700108166576528, + "grad_norm": 1.220706582069397, + "learning_rate": 1.0273242386727298e-05, + "loss": 1.7844, + "mean_token_accuracy": 0.5946844816207886, + "num_tokens": 5104580347.0, + "step": 9985 + }, + { + "epoch": 2.7003785830178475, + "grad_norm": 1.2105576992034912, + "learning_rate": 1.0271671056992643e-05, + "loss": 1.785, + "mean_token_accuracy": 0.573733925819397, + "num_tokens": 5105104519.0, + "step": 9986 + }, + { + "epoch": 2.700648999459167, + "grad_norm": 1.1127159595489502, + "learning_rate": 1.027009974960526e-05, + "loss": 1.7149, + "mean_token_accuracy": 0.6070467829704285, + "num_tokens": 5105628663.0, + "step": 9987 + }, + { + "epoch": 2.700919415900487, + "grad_norm": 1.3544279336929321, + "learning_rate": 1.0268528464613365e-05, + "loss": 1.782, + "mean_token_accuracy": 0.5591959953308105, + "num_tokens": 5106152776.0, + "step": 9988 + }, + { + "epoch": 2.7011898323418064, + "grad_norm": 1.0764809846878052, + "learning_rate": 1.0266957202065169e-05, + "loss": 1.7429, + "mean_token_accuracy": 0.5970702171325684, + "num_tokens": 5106673312.0, + "step": 9989 + }, + { + "epoch": 2.701460248783126, + "grad_norm": 1.0604257583618164, + "learning_rate": 1.0265385962008878e-05, + "loss": 1.9594, + "mean_token_accuracy": 0.5549734830856323, + "num_tokens": 5107197500.0, + "step": 9990 + }, + { + "epoch": 2.7017306652244457, + "grad_norm": 0.9874039888381958, + "learning_rate": 1.026381474449271e-05, + "loss": 1.9261, + "mean_token_accuracy": 0.5591511726379395, + "num_tokens": 5107721582.0, + "step": 9991 + }, + { + "epoch": 2.7020010816657654, + "grad_norm": 1.1187149286270142, + "learning_rate": 1.0262243549564872e-05, + "loss": 1.7209, + "mean_token_accuracy": 0.6200952529907227, + "num_tokens": 5108245778.0, + "step": 9992 + }, + { + "epoch": 2.702271498107085, + "grad_norm": 1.085436463356018, + "learning_rate": 1.0260672377273564e-05, + "loss": 1.8969, + "mean_token_accuracy": 0.5653550028800964, + "num_tokens": 5108718104.0, + "step": 9993 + }, + { + "epoch": 2.7025419145484046, + "grad_norm": 1.0680429935455322, + "learning_rate": 1.025910122766701e-05, + "loss": 1.8012, + "mean_token_accuracy": 0.5806188583374023, + "num_tokens": 5109242257.0, + "step": 9994 + }, + { + "epoch": 2.7028123309897243, + "grad_norm": 1.1405644416809082, + "learning_rate": 1.0257530100793406e-05, + "loss": 1.8298, + "mean_token_accuracy": 0.5646359920501709, + "num_tokens": 5109766442.0, + "step": 9995 + }, + { + "epoch": 2.703082747431044, + "grad_norm": 1.2610474824905396, + "learning_rate": 1.0255958996700958e-05, + "loss": 1.8415, + "mean_token_accuracy": 0.5834245681762695, + "num_tokens": 5110290644.0, + "step": 9996 + }, + { + "epoch": 2.7033531638723636, + "grad_norm": 1.2291066646575928, + "learning_rate": 1.025438791543788e-05, + "loss": 1.8511, + "mean_token_accuracy": 0.5795891284942627, + "num_tokens": 5110803054.0, + "step": 9997 + }, + { + "epoch": 2.703623580313683, + "grad_norm": 1.221140742301941, + "learning_rate": 1.025281685705237e-05, + "loss": 1.8574, + "mean_token_accuracy": 0.5738683938980103, + "num_tokens": 5111327240.0, + "step": 9998 + }, + { + "epoch": 2.703893996755003, + "grad_norm": 1.1897515058517456, + "learning_rate": 1.0251245821592642e-05, + "loss": 1.8426, + "mean_token_accuracy": 0.5691109895706177, + "num_tokens": 5111851413.0, + "step": 9999 + }, + { + "epoch": 2.7041644131963225, + "grad_norm": 1.2375733852386475, + "learning_rate": 1.0249674809106893e-05, + "loss": 1.8248, + "mean_token_accuracy": 0.5945473909378052, + "num_tokens": 5112375594.0, + "step": 10000 + }, + { + "epoch": 2.704434829637642, + "grad_norm": 0.4354155957698822, + "learning_rate": 1.0248103819643323e-05, + "loss": 1.1431, + "mean_token_accuracy": 0.7042584419250488, + "num_tokens": 5112895420.0, + "step": 10001 + }, + { + "epoch": 2.7047052460789613, + "grad_norm": 1.8599637746810913, + "learning_rate": 1.0246532853250144e-05, + "loss": 1.9639, + "mean_token_accuracy": 0.5571541786193848, + "num_tokens": 5113363907.0, + "step": 10002 + }, + { + "epoch": 2.7049756625202814, + "grad_norm": 1.4286420345306396, + "learning_rate": 1.024496190997555e-05, + "loss": 1.8831, + "mean_token_accuracy": 0.5574996471405029, + "num_tokens": 5113888032.0, + "step": 10003 + }, + { + "epoch": 2.7052460789616006, + "grad_norm": 1.1547857522964478, + "learning_rate": 1.0243390989867742e-05, + "loss": 1.8678, + "mean_token_accuracy": 0.5767755508422852, + "num_tokens": 5114412276.0, + "step": 10004 + }, + { + "epoch": 2.7055164954029207, + "grad_norm": 1.41793954372406, + "learning_rate": 1.024182009297493e-05, + "loss": 1.7665, + "mean_token_accuracy": 0.5723686218261719, + "num_tokens": 5114936444.0, + "step": 10005 + }, + { + "epoch": 2.70578691184424, + "grad_norm": 1.4147858619689941, + "learning_rate": 1.0240249219345305e-05, + "loss": 1.8117, + "mean_token_accuracy": 0.5680906772613525, + "num_tokens": 5115460650.0, + "step": 10006 + }, + { + "epoch": 2.70605732828556, + "grad_norm": 1.300623893737793, + "learning_rate": 1.0238678369027067e-05, + "loss": 1.9277, + "mean_token_accuracy": 0.5720393657684326, + "num_tokens": 5115948210.0, + "step": 10007 + }, + { + "epoch": 2.706327744726879, + "grad_norm": 1.2892643213272095, + "learning_rate": 1.0237107542068416e-05, + "loss": 1.8008, + "mean_token_accuracy": 0.5735093951225281, + "num_tokens": 5116472463.0, + "step": 10008 + }, + { + "epoch": 2.7065981611681993, + "grad_norm": 1.372705101966858, + "learning_rate": 1.0235536738517551e-05, + "loss": 1.8786, + "mean_token_accuracy": 0.5542895793914795, + "num_tokens": 5116996660.0, + "step": 10009 + }, + { + "epoch": 2.7068685776095185, + "grad_norm": 1.2303316593170166, + "learning_rate": 1.0233965958422665e-05, + "loss": 2.0186, + "mean_token_accuracy": 0.5604040622711182, + "num_tokens": 5117438248.0, + "step": 10010 + }, + { + "epoch": 2.707138994050838, + "grad_norm": 1.047761082649231, + "learning_rate": 1.0232395201831954e-05, + "loss": 1.7176, + "mean_token_accuracy": 0.6002970933914185, + "num_tokens": 5117962321.0, + "step": 10011 + }, + { + "epoch": 2.7074094104921578, + "grad_norm": 1.3048460483551025, + "learning_rate": 1.0230824468793621e-05, + "loss": 1.9211, + "mean_token_accuracy": 0.5526734590530396, + "num_tokens": 5118486540.0, + "step": 10012 + }, + { + "epoch": 2.7076798269334774, + "grad_norm": 1.361298680305481, + "learning_rate": 1.0229253759355849e-05, + "loss": 1.8911, + "mean_token_accuracy": 0.5776562094688416, + "num_tokens": 5118971554.0, + "step": 10013 + }, + { + "epoch": 2.707950243374797, + "grad_norm": 1.229866623878479, + "learning_rate": 1.0227683073566843e-05, + "loss": 1.9247, + "mean_token_accuracy": 0.559586763381958, + "num_tokens": 5119473256.0, + "step": 10014 + }, + { + "epoch": 2.7082206598161167, + "grad_norm": 1.0029563903808594, + "learning_rate": 1.022611241147479e-05, + "loss": 1.9271, + "mean_token_accuracy": 0.546945333480835, + "num_tokens": 5119997525.0, + "step": 10015 + }, + { + "epoch": 2.7084910762574363, + "grad_norm": 1.191996693611145, + "learning_rate": 1.0224541773127883e-05, + "loss": 1.6443, + "mean_token_accuracy": 0.6079058647155762, + "num_tokens": 5120521786.0, + "step": 10016 + }, + { + "epoch": 2.708761492698756, + "grad_norm": 1.1606321334838867, + "learning_rate": 1.0222971158574315e-05, + "loss": 2.0109, + "mean_token_accuracy": 0.5437902808189392, + "num_tokens": 5121045981.0, + "step": 10017 + }, + { + "epoch": 2.7090319091400756, + "grad_norm": 1.1174412965774536, + "learning_rate": 1.022140056786228e-05, + "loss": 1.9452, + "mean_token_accuracy": 0.5507416129112244, + "num_tokens": 5121570171.0, + "step": 10018 + }, + { + "epoch": 2.7093023255813953, + "grad_norm": 1.0988776683807373, + "learning_rate": 1.0219830001039964e-05, + "loss": 1.9069, + "mean_token_accuracy": 0.5725423097610474, + "num_tokens": 5122094436.0, + "step": 10019 + }, + { + "epoch": 2.709572742022715, + "grad_norm": 1.0488011837005615, + "learning_rate": 1.0218259458155557e-05, + "loss": 1.7705, + "mean_token_accuracy": 0.592739999294281, + "num_tokens": 5122559397.0, + "step": 10020 + }, + { + "epoch": 2.7098431584640346, + "grad_norm": 0.4042915403842926, + "learning_rate": 1.0216688939257246e-05, + "loss": 1.0539, + "mean_token_accuracy": 0.6952919960021973, + "num_tokens": 5123083550.0, + "step": 10021 + }, + { + "epoch": 2.710113574905354, + "grad_norm": 1.6340527534484863, + "learning_rate": 1.0215118444393226e-05, + "loss": 2.0064, + "mean_token_accuracy": 0.5562453269958496, + "num_tokens": 5123607722.0, + "step": 10022 + }, + { + "epoch": 2.710383991346674, + "grad_norm": 1.882163643836975, + "learning_rate": 1.0213547973611682e-05, + "loss": 1.8436, + "mean_token_accuracy": 0.5946575403213501, + "num_tokens": 5124068622.0, + "step": 10023 + }, + { + "epoch": 2.7106544077879935, + "grad_norm": 1.1414240598678589, + "learning_rate": 1.0211977526960793e-05, + "loss": 1.9909, + "mean_token_accuracy": 0.5627024173736572, + "num_tokens": 5124571658.0, + "step": 10024 + }, + { + "epoch": 2.710924824229313, + "grad_norm": 1.0733823776245117, + "learning_rate": 1.0210407104488753e-05, + "loss": 1.9265, + "mean_token_accuracy": 0.5642050504684448, + "num_tokens": 5125095939.0, + "step": 10025 + }, + { + "epoch": 2.7111952406706328, + "grad_norm": 1.1790488958358765, + "learning_rate": 1.0208836706243748e-05, + "loss": 1.8703, + "mean_token_accuracy": 0.5757874846458435, + "num_tokens": 5125620066.0, + "step": 10026 + }, + { + "epoch": 2.7114656571119524, + "grad_norm": 0.9938612580299377, + "learning_rate": 1.0207266332273956e-05, + "loss": 1.904, + "mean_token_accuracy": 0.5596187710762024, + "num_tokens": 5126144328.0, + "step": 10027 + }, + { + "epoch": 2.711736073553272, + "grad_norm": 1.1638225317001343, + "learning_rate": 1.0205695982627566e-05, + "loss": 1.8622, + "mean_token_accuracy": 0.5735772848129272, + "num_tokens": 5126645057.0, + "step": 10028 + }, + { + "epoch": 2.7120064899945917, + "grad_norm": 1.300695776939392, + "learning_rate": 1.0204125657352761e-05, + "loss": 1.8479, + "mean_token_accuracy": 0.5787003040313721, + "num_tokens": 5127169255.0, + "step": 10029 + }, + { + "epoch": 2.7122769064359114, + "grad_norm": 1.127213954925537, + "learning_rate": 1.0202555356497717e-05, + "loss": 1.9707, + "mean_token_accuracy": 0.5430718064308167, + "num_tokens": 5127693407.0, + "step": 10030 + }, + { + "epoch": 2.712547322877231, + "grad_norm": 1.2823524475097656, + "learning_rate": 1.0200985080110623e-05, + "loss": 1.9584, + "mean_token_accuracy": 0.5687006115913391, + "num_tokens": 5128155682.0, + "step": 10031 + }, + { + "epoch": 2.7128177393185506, + "grad_norm": 1.169245719909668, + "learning_rate": 1.0199414828239655e-05, + "loss": 1.7845, + "mean_token_accuracy": 0.5862954258918762, + "num_tokens": 5128679864.0, + "step": 10032 + }, + { + "epoch": 2.7130881557598703, + "grad_norm": 1.0380240678787231, + "learning_rate": 1.0197844600932994e-05, + "loss": 1.7868, + "mean_token_accuracy": 0.5934461355209351, + "num_tokens": 5129204125.0, + "step": 10033 + }, + { + "epoch": 2.71335857220119, + "grad_norm": 1.0109772682189941, + "learning_rate": 1.0196274398238823e-05, + "loss": 1.8864, + "mean_token_accuracy": 0.5705721378326416, + "num_tokens": 5129728266.0, + "step": 10034 + }, + { + "epoch": 2.7136289886425096, + "grad_norm": 1.3257516622543335, + "learning_rate": 1.0194704220205315e-05, + "loss": 1.7849, + "mean_token_accuracy": 0.5854805707931519, + "num_tokens": 5130213158.0, + "step": 10035 + }, + { + "epoch": 2.713899405083829, + "grad_norm": 1.3968932628631592, + "learning_rate": 1.0193134066880648e-05, + "loss": 1.8942, + "mean_token_accuracy": 0.5390306115150452, + "num_tokens": 5130737384.0, + "step": 10036 + }, + { + "epoch": 2.714169821525149, + "grad_norm": 1.1390454769134521, + "learning_rate": 1.0191563938313003e-05, + "loss": 1.7904, + "mean_token_accuracy": 0.5693626999855042, + "num_tokens": 5131253571.0, + "step": 10037 + }, + { + "epoch": 2.7144402379664685, + "grad_norm": 1.0719468593597412, + "learning_rate": 1.0189993834550553e-05, + "loss": 1.9087, + "mean_token_accuracy": 0.5669906139373779, + "num_tokens": 5131767567.0, + "step": 10038 + }, + { + "epoch": 2.714710654407788, + "grad_norm": 1.239731788635254, + "learning_rate": 1.0188423755641472e-05, + "loss": 1.9006, + "mean_token_accuracy": 0.5758141875267029, + "num_tokens": 5132291466.0, + "step": 10039 + }, + { + "epoch": 2.714981070849108, + "grad_norm": 1.157785177230835, + "learning_rate": 1.018685370163394e-05, + "loss": 1.8945, + "mean_token_accuracy": 0.5777028799057007, + "num_tokens": 5132812138.0, + "step": 10040 + }, + { + "epoch": 2.7152514872904274, + "grad_norm": 0.43592187762260437, + "learning_rate": 1.018528367257613e-05, + "loss": 1.0112, + "mean_token_accuracy": 0.7280150055885315, + "num_tokens": 5133336324.0, + "step": 10041 + }, + { + "epoch": 2.715521903731747, + "grad_norm": 1.674181580543518, + "learning_rate": 1.0183713668516206e-05, + "loss": 1.923, + "mean_token_accuracy": 0.5780926942825317, + "num_tokens": 5133833476.0, + "step": 10042 + }, + { + "epoch": 2.7157923201730663, + "grad_norm": 1.413794755935669, + "learning_rate": 1.0182143689502352e-05, + "loss": 1.826, + "mean_token_accuracy": 0.576900064945221, + "num_tokens": 5134357693.0, + "step": 10043 + }, + { + "epoch": 2.7160627366143864, + "grad_norm": 1.2159961462020874, + "learning_rate": 1.0180573735582732e-05, + "loss": 1.8584, + "mean_token_accuracy": 0.5729018449783325, + "num_tokens": 5134881971.0, + "step": 10044 + }, + { + "epoch": 2.7163331530557056, + "grad_norm": 1.345273494720459, + "learning_rate": 1.0179003806805523e-05, + "loss": 1.7603, + "mean_token_accuracy": 0.6004056930541992, + "num_tokens": 5135343874.0, + "step": 10045 + }, + { + "epoch": 2.7166035694970256, + "grad_norm": 1.2644219398498535, + "learning_rate": 1.017743390321889e-05, + "loss": 1.8448, + "mean_token_accuracy": 0.5786959528923035, + "num_tokens": 5135868139.0, + "step": 10046 + }, + { + "epoch": 2.716873985938345, + "grad_norm": 1.1506593227386475, + "learning_rate": 1.0175864024871e-05, + "loss": 1.8932, + "mean_token_accuracy": 0.5673004388809204, + "num_tokens": 5136353413.0, + "step": 10047 + }, + { + "epoch": 2.717144402379665, + "grad_norm": 1.22026526927948, + "learning_rate": 1.017429417181003e-05, + "loss": 2.0716, + "mean_token_accuracy": 0.5282548666000366, + "num_tokens": 5136877592.0, + "step": 10048 + }, + { + "epoch": 2.717414818820984, + "grad_norm": 1.2638523578643799, + "learning_rate": 1.0172724344084144e-05, + "loss": 1.8915, + "mean_token_accuracy": 0.5559436082839966, + "num_tokens": 5137401845.0, + "step": 10049 + }, + { + "epoch": 2.717685235262304, + "grad_norm": 0.994692862033844, + "learning_rate": 1.0171154541741503e-05, + "loss": 1.7321, + "mean_token_accuracy": 0.5931869745254517, + "num_tokens": 5137883960.0, + "step": 10050 + }, + { + "epoch": 2.7179556517036234, + "grad_norm": 1.1019830703735352, + "learning_rate": 1.0169584764830285e-05, + "loss": 1.9071, + "mean_token_accuracy": 0.556784987449646, + "num_tokens": 5138378312.0, + "step": 10051 + }, + { + "epoch": 2.718226068144943, + "grad_norm": 1.2490215301513672, + "learning_rate": 1.0168015013398644e-05, + "loss": 1.9404, + "mean_token_accuracy": 0.5607849359512329, + "num_tokens": 5138867092.0, + "step": 10052 + }, + { + "epoch": 2.7184964845862627, + "grad_norm": 1.0552195310592651, + "learning_rate": 1.0166445287494749e-05, + "loss": 1.8467, + "mean_token_accuracy": 0.5699840784072876, + "num_tokens": 5139391363.0, + "step": 10053 + }, + { + "epoch": 2.7187669010275823, + "grad_norm": 1.3110755681991577, + "learning_rate": 1.0164875587166764e-05, + "loss": 1.9551, + "mean_token_accuracy": 0.5565507411956787, + "num_tokens": 5139915473.0, + "step": 10054 + }, + { + "epoch": 2.719037317468902, + "grad_norm": 0.9813376665115356, + "learning_rate": 1.0163305912462856e-05, + "loss": 1.8829, + "mean_token_accuracy": 0.5711236000061035, + "num_tokens": 5140439696.0, + "step": 10055 + }, + { + "epoch": 2.7193077339102216, + "grad_norm": 1.162778615951538, + "learning_rate": 1.0161736263431177e-05, + "loss": 1.9159, + "mean_token_accuracy": 0.5629390478134155, + "num_tokens": 5140963937.0, + "step": 10056 + }, + { + "epoch": 2.7195781503515413, + "grad_norm": 1.2083399295806885, + "learning_rate": 1.0160166640119897e-05, + "loss": 1.9521, + "mean_token_accuracy": 0.5666608214378357, + "num_tokens": 5141411053.0, + "step": 10057 + }, + { + "epoch": 2.719848566792861, + "grad_norm": 1.2208462953567505, + "learning_rate": 1.0158597042577176e-05, + "loss": 1.8918, + "mean_token_accuracy": 0.5608524680137634, + "num_tokens": 5141905948.0, + "step": 10058 + }, + { + "epoch": 2.7201189832341806, + "grad_norm": 1.243590235710144, + "learning_rate": 1.0157027470851168e-05, + "loss": 1.9321, + "mean_token_accuracy": 0.5659880638122559, + "num_tokens": 5142413886.0, + "step": 10059 + }, + { + "epoch": 2.7203893996755, + "grad_norm": 1.297411561012268, + "learning_rate": 1.015545792499004e-05, + "loss": 1.8333, + "mean_token_accuracy": 0.5748486518859863, + "num_tokens": 5142886367.0, + "step": 10060 + }, + { + "epoch": 2.72065981611682, + "grad_norm": 0.4671250581741333, + "learning_rate": 1.0153888405041948e-05, + "loss": 0.9716, + "mean_token_accuracy": 0.7431379556655884, + "num_tokens": 5143410586.0, + "step": 10061 + }, + { + "epoch": 2.7209302325581395, + "grad_norm": 1.7495698928833008, + "learning_rate": 1.0152318911055042e-05, + "loss": 1.8655, + "mean_token_accuracy": 0.5768513679504395, + "num_tokens": 5143906027.0, + "step": 10062 + }, + { + "epoch": 2.721200648999459, + "grad_norm": 1.7423685789108276, + "learning_rate": 1.015074944307749e-05, + "loss": 1.8963, + "mean_token_accuracy": 0.5831053256988525, + "num_tokens": 5144366407.0, + "step": 10063 + }, + { + "epoch": 2.7214710654407788, + "grad_norm": 1.1159863471984863, + "learning_rate": 1.0149180001157441e-05, + "loss": 1.8442, + "mean_token_accuracy": 0.5694981813430786, + "num_tokens": 5144888470.0, + "step": 10064 + }, + { + "epoch": 2.7217414818820984, + "grad_norm": 1.5516602993011475, + "learning_rate": 1.014761058534305e-05, + "loss": 1.886, + "mean_token_accuracy": 0.5655120611190796, + "num_tokens": 5145412733.0, + "step": 10065 + }, + { + "epoch": 2.722011898323418, + "grad_norm": 1.6790848970413208, + "learning_rate": 1.0146041195682476e-05, + "loss": 1.8089, + "mean_token_accuracy": 0.5896664261817932, + "num_tokens": 5145936997.0, + "step": 10066 + }, + { + "epoch": 2.7222823147647377, + "grad_norm": 1.339468240737915, + "learning_rate": 1.0144471832223865e-05, + "loss": 1.9729, + "mean_token_accuracy": 0.5663172602653503, + "num_tokens": 5146437196.0, + "step": 10067 + }, + { + "epoch": 2.7225527312060573, + "grad_norm": 1.0811758041381836, + "learning_rate": 1.0142902495015376e-05, + "loss": 1.9066, + "mean_token_accuracy": 0.5729730129241943, + "num_tokens": 5146915893.0, + "step": 10068 + }, + { + "epoch": 2.722823147647377, + "grad_norm": 1.534127950668335, + "learning_rate": 1.0141333184105162e-05, + "loss": 1.9298, + "mean_token_accuracy": 0.5436771512031555, + "num_tokens": 5147440119.0, + "step": 10069 + }, + { + "epoch": 2.7230935640886966, + "grad_norm": 1.600471019744873, + "learning_rate": 1.0139763899541366e-05, + "loss": 1.917, + "mean_token_accuracy": 0.551111102104187, + "num_tokens": 5147964381.0, + "step": 10070 + }, + { + "epoch": 2.7233639805300163, + "grad_norm": 1.2138429880142212, + "learning_rate": 1.0138194641372149e-05, + "loss": 1.8997, + "mean_token_accuracy": 0.5577903985977173, + "num_tokens": 5148488491.0, + "step": 10071 + }, + { + "epoch": 2.723634396971336, + "grad_norm": 1.1508731842041016, + "learning_rate": 1.0136625409645652e-05, + "loss": 1.959, + "mean_token_accuracy": 0.5556156635284424, + "num_tokens": 5149012716.0, + "step": 10072 + }, + { + "epoch": 2.7239048134126556, + "grad_norm": 1.2662614583969116, + "learning_rate": 1.0135056204410025e-05, + "loss": 1.8241, + "mean_token_accuracy": 0.5627584457397461, + "num_tokens": 5149536887.0, + "step": 10073 + }, + { + "epoch": 2.724175229853975, + "grad_norm": 1.580918788909912, + "learning_rate": 1.013348702571342e-05, + "loss": 2.0037, + "mean_token_accuracy": 0.5543108582496643, + "num_tokens": 5150061146.0, + "step": 10074 + }, + { + "epoch": 2.724445646295295, + "grad_norm": 1.342050552368164, + "learning_rate": 1.013191787360398e-05, + "loss": 1.9752, + "mean_token_accuracy": 0.5383216142654419, + "num_tokens": 5150585298.0, + "step": 10075 + }, + { + "epoch": 2.7247160627366145, + "grad_norm": 1.1084948778152466, + "learning_rate": 1.0130348748129853e-05, + "loss": 1.8017, + "mean_token_accuracy": 0.5737154483795166, + "num_tokens": 5151109460.0, + "step": 10076 + }, + { + "epoch": 2.724986479177934, + "grad_norm": 1.7925082445144653, + "learning_rate": 1.0128779649339184e-05, + "loss": 1.9803, + "mean_token_accuracy": 0.5617835521697998, + "num_tokens": 5151633715.0, + "step": 10077 + }, + { + "epoch": 2.7252568956192538, + "grad_norm": 1.4422435760498047, + "learning_rate": 1.0127210577280117e-05, + "loss": 1.7865, + "mean_token_accuracy": 0.5865100026130676, + "num_tokens": 5152157896.0, + "step": 10078 + }, + { + "epoch": 2.7255273120605734, + "grad_norm": 1.033435344696045, + "learning_rate": 1.0125641532000797e-05, + "loss": 1.8929, + "mean_token_accuracy": 0.5655854344367981, + "num_tokens": 5152676838.0, + "step": 10079 + }, + { + "epoch": 2.725797728501893, + "grad_norm": 1.1704353094100952, + "learning_rate": 1.0124072513549366e-05, + "loss": 1.9186, + "mean_token_accuracy": 0.5551101565361023, + "num_tokens": 5153200973.0, + "step": 10080 + }, + { + "epoch": 2.7260681449432127, + "grad_norm": 0.4329042434692383, + "learning_rate": 1.0122503521973966e-05, + "loss": 1.0665, + "mean_token_accuracy": 0.7141981720924377, + "num_tokens": 5153724396.0, + "step": 10081 + }, + { + "epoch": 2.7263385613845323, + "grad_norm": 1.5615826845169067, + "learning_rate": 1.0120934557322735e-05, + "loss": 1.873, + "mean_token_accuracy": 0.5795055627822876, + "num_tokens": 5154248556.0, + "step": 10082 + }, + { + "epoch": 2.726608977825852, + "grad_norm": 1.2205561399459839, + "learning_rate": 1.011936561964382e-05, + "loss": 1.864, + "mean_token_accuracy": 0.5770285129547119, + "num_tokens": 5154732796.0, + "step": 10083 + }, + { + "epoch": 2.726879394267171, + "grad_norm": 1.2968679666519165, + "learning_rate": 1.0117796708985361e-05, + "loss": 1.8252, + "mean_token_accuracy": 0.5920457243919373, + "num_tokens": 5155243547.0, + "step": 10084 + }, + { + "epoch": 2.7271498107084913, + "grad_norm": 1.2065787315368652, + "learning_rate": 1.0116227825395486e-05, + "loss": 1.8464, + "mean_token_accuracy": 0.5832116603851318, + "num_tokens": 5155767806.0, + "step": 10085 + }, + { + "epoch": 2.7274202271498105, + "grad_norm": 1.3087013959884644, + "learning_rate": 1.0114658968922347e-05, + "loss": 1.9351, + "mean_token_accuracy": 0.5794289112091064, + "num_tokens": 5156252799.0, + "step": 10086 + }, + { + "epoch": 2.7276906435911306, + "grad_norm": 1.2971174716949463, + "learning_rate": 1.011309013961407e-05, + "loss": 2.0068, + "mean_token_accuracy": 0.5533219575881958, + "num_tokens": 5156776988.0, + "step": 10087 + }, + { + "epoch": 2.7279610600324498, + "grad_norm": 1.1469327211380005, + "learning_rate": 1.0111521337518798e-05, + "loss": 1.8136, + "mean_token_accuracy": 0.5829697847366333, + "num_tokens": 5157301268.0, + "step": 10088 + }, + { + "epoch": 2.72823147647377, + "grad_norm": 1.040982723236084, + "learning_rate": 1.0109952562684665e-05, + "loss": 1.797, + "mean_token_accuracy": 0.5892025232315063, + "num_tokens": 5157782158.0, + "step": 10089 + }, + { + "epoch": 2.728501892915089, + "grad_norm": 1.1095932722091675, + "learning_rate": 1.0108383815159799e-05, + "loss": 1.8748, + "mean_token_accuracy": 0.5749617218971252, + "num_tokens": 5158306348.0, + "step": 10090 + }, + { + "epoch": 2.728772309356409, + "grad_norm": 1.2431530952453613, + "learning_rate": 1.0106815094992347e-05, + "loss": 1.8961, + "mean_token_accuracy": 0.5667270421981812, + "num_tokens": 5158830627.0, + "step": 10091 + }, + { + "epoch": 2.7290427257977283, + "grad_norm": 1.2772306203842163, + "learning_rate": 1.0105246402230433e-05, + "loss": 1.7888, + "mean_token_accuracy": 0.5918262004852295, + "num_tokens": 5159349397.0, + "step": 10092 + }, + { + "epoch": 2.729313142239048, + "grad_norm": 0.9524754881858826, + "learning_rate": 1.0103677736922186e-05, + "loss": 1.7684, + "mean_token_accuracy": 0.5961155891418457, + "num_tokens": 5159873635.0, + "step": 10093 + }, + { + "epoch": 2.7295835586803676, + "grad_norm": 1.150327205657959, + "learning_rate": 1.0102109099115747e-05, + "loss": 1.9574, + "mean_token_accuracy": 0.5634353160858154, + "num_tokens": 5160397776.0, + "step": 10094 + }, + { + "epoch": 2.7298539751216873, + "grad_norm": 1.0597513914108276, + "learning_rate": 1.010054048885924e-05, + "loss": 1.9899, + "mean_token_accuracy": 0.5368543267250061, + "num_tokens": 5160921979.0, + "step": 10095 + }, + { + "epoch": 2.730124391563007, + "grad_norm": 1.0452206134796143, + "learning_rate": 1.0098971906200794e-05, + "loss": 1.8443, + "mean_token_accuracy": 0.5852787494659424, + "num_tokens": 5161446260.0, + "step": 10096 + }, + { + "epoch": 2.7303948080043265, + "grad_norm": 1.3857980966567993, + "learning_rate": 1.0097403351188541e-05, + "loss": 2.0134, + "mean_token_accuracy": 0.557182788848877, + "num_tokens": 5161970408.0, + "step": 10097 + }, + { + "epoch": 2.730665224445646, + "grad_norm": 1.2296792268753052, + "learning_rate": 1.009583482387061e-05, + "loss": 1.9422, + "mean_token_accuracy": 0.5452035665512085, + "num_tokens": 5162494654.0, + "step": 10098 + }, + { + "epoch": 2.730935640886966, + "grad_norm": 1.2980445623397827, + "learning_rate": 1.0094266324295122e-05, + "loss": 1.8425, + "mean_token_accuracy": 0.5747062563896179, + "num_tokens": 5163018729.0, + "step": 10099 + }, + { + "epoch": 2.7312060573282855, + "grad_norm": 1.191733956336975, + "learning_rate": 1.0092697852510209e-05, + "loss": 1.887, + "mean_token_accuracy": 0.581280529499054, + "num_tokens": 5163484856.0, + "step": 10100 + }, + { + "epoch": 2.731476473769605, + "grad_norm": 0.4766193926334381, + "learning_rate": 1.0091129408563996e-05, + "loss": 1.0867, + "mean_token_accuracy": 0.7206804156303406, + "num_tokens": 5164009086.0, + "step": 10101 + }, + { + "epoch": 2.7317468902109248, + "grad_norm": 1.5557583570480347, + "learning_rate": 1.00895609925046e-05, + "loss": 1.8416, + "mean_token_accuracy": 0.5755020380020142, + "num_tokens": 5164533356.0, + "step": 10102 + }, + { + "epoch": 2.7320173066522444, + "grad_norm": 1.6189260482788086, + "learning_rate": 1.0087992604380153e-05, + "loss": 1.8838, + "mean_token_accuracy": 0.5480860471725464, + "num_tokens": 5165046683.0, + "step": 10103 + }, + { + "epoch": 2.732287723093564, + "grad_norm": 1.2006927728652954, + "learning_rate": 1.0086424244238775e-05, + "loss": 1.8287, + "mean_token_accuracy": 0.5897359848022461, + "num_tokens": 5165569152.0, + "step": 10104 + }, + { + "epoch": 2.7325581395348837, + "grad_norm": 1.011960506439209, + "learning_rate": 1.0084855912128585e-05, + "loss": 1.7945, + "mean_token_accuracy": 0.5844211578369141, + "num_tokens": 5166093118.0, + "step": 10105 + }, + { + "epoch": 2.7328285559762033, + "grad_norm": 1.2770092487335205, + "learning_rate": 1.008328760809771e-05, + "loss": 1.9709, + "mean_token_accuracy": 0.5629479289054871, + "num_tokens": 5166583765.0, + "step": 10106 + }, + { + "epoch": 2.733098972417523, + "grad_norm": 1.1407854557037354, + "learning_rate": 1.0081719332194271e-05, + "loss": 1.8101, + "mean_token_accuracy": 0.5732182264328003, + "num_tokens": 5167108042.0, + "step": 10107 + }, + { + "epoch": 2.7333693888588426, + "grad_norm": 0.9843636155128479, + "learning_rate": 1.0080151084466376e-05, + "loss": 1.8564, + "mean_token_accuracy": 0.5761699676513672, + "num_tokens": 5167632213.0, + "step": 10108 + }, + { + "epoch": 2.7336398053001623, + "grad_norm": 1.1687678098678589, + "learning_rate": 1.0078582864962154e-05, + "loss": 1.8248, + "mean_token_accuracy": 0.5881308317184448, + "num_tokens": 5168149166.0, + "step": 10109 + }, + { + "epoch": 2.733910221741482, + "grad_norm": 1.295042634010315, + "learning_rate": 1.0077014673729722e-05, + "loss": 1.9225, + "mean_token_accuracy": 0.5603296160697937, + "num_tokens": 5168673386.0, + "step": 10110 + }, + { + "epoch": 2.7341806381828015, + "grad_norm": 1.076112151145935, + "learning_rate": 1.0075446510817189e-05, + "loss": 1.806, + "mean_token_accuracy": 0.5841096639633179, + "num_tokens": 5169197439.0, + "step": 10111 + }, + { + "epoch": 2.734451054624121, + "grad_norm": 1.10356605052948, + "learning_rate": 1.0073878376272679e-05, + "loss": 1.6373, + "mean_token_accuracy": 0.6095898151397705, + "num_tokens": 5169664851.0, + "step": 10112 + }, + { + "epoch": 2.734721471065441, + "grad_norm": 1.3193975687026978, + "learning_rate": 1.0072310270144302e-05, + "loss": 1.8485, + "mean_token_accuracy": 0.5871982574462891, + "num_tokens": 5170189055.0, + "step": 10113 + }, + { + "epoch": 2.7349918875067605, + "grad_norm": 1.015630841255188, + "learning_rate": 1.0070742192480177e-05, + "loss": 1.8925, + "mean_token_accuracy": 0.5761339664459229, + "num_tokens": 5170651604.0, + "step": 10114 + }, + { + "epoch": 2.73526230394808, + "grad_norm": 1.2068463563919067, + "learning_rate": 1.0069174143328412e-05, + "loss": 1.9689, + "mean_token_accuracy": 0.5496945381164551, + "num_tokens": 5171171658.0, + "step": 10115 + }, + { + "epoch": 2.7355327203893998, + "grad_norm": 1.0651720762252808, + "learning_rate": 1.0067606122737122e-05, + "loss": 1.8665, + "mean_token_accuracy": 0.5933778285980225, + "num_tokens": 5171695927.0, + "step": 10116 + }, + { + "epoch": 2.7358031368307194, + "grad_norm": 1.0517678260803223, + "learning_rate": 1.0066038130754417e-05, + "loss": 1.8831, + "mean_token_accuracy": 0.5565046668052673, + "num_tokens": 5172220192.0, + "step": 10117 + }, + { + "epoch": 2.736073553272039, + "grad_norm": 1.2702049016952515, + "learning_rate": 1.0064470167428413e-05, + "loss": 1.8156, + "mean_token_accuracy": 0.5841902494430542, + "num_tokens": 5172706436.0, + "step": 10118 + }, + { + "epoch": 2.7363439697133587, + "grad_norm": 1.3019750118255615, + "learning_rate": 1.006290223280721e-05, + "loss": 2.0007, + "mean_token_accuracy": 0.5422885417938232, + "num_tokens": 5173230562.0, + "step": 10119 + }, + { + "epoch": 2.7366143861546783, + "grad_norm": 1.1572091579437256, + "learning_rate": 1.0061334326938927e-05, + "loss": 1.9563, + "mean_token_accuracy": 0.5406264066696167, + "num_tokens": 5173747243.0, + "step": 10120 + }, + { + "epoch": 2.736884802595998, + "grad_norm": 0.44917118549346924, + "learning_rate": 1.0059766449871667e-05, + "loss": 1.1203, + "mean_token_accuracy": 0.7000402212142944, + "num_tokens": 5174271492.0, + "step": 10121 + }, + { + "epoch": 2.7371552190373176, + "grad_norm": 1.737410068511963, + "learning_rate": 1.0058198601653533e-05, + "loss": 1.8201, + "mean_token_accuracy": 0.5782390832901001, + "num_tokens": 5174772033.0, + "step": 10122 + }, + { + "epoch": 2.7374256354786373, + "grad_norm": 1.7439186573028564, + "learning_rate": 1.0056630782332642e-05, + "loss": 1.9617, + "mean_token_accuracy": 0.5597926378250122, + "num_tokens": 5175296108.0, + "step": 10123 + }, + { + "epoch": 2.737696051919957, + "grad_norm": 1.1443185806274414, + "learning_rate": 1.0055062991957087e-05, + "loss": 1.929, + "mean_token_accuracy": 0.5471477508544922, + "num_tokens": 5175820281.0, + "step": 10124 + }, + { + "epoch": 2.737966468361276, + "grad_norm": 1.2332730293273926, + "learning_rate": 1.0053495230574978e-05, + "loss": 1.9371, + "mean_token_accuracy": 0.5681170225143433, + "num_tokens": 5176303311.0, + "step": 10125 + }, + { + "epoch": 2.738236884802596, + "grad_norm": 1.592455506324768, + "learning_rate": 1.0051927498234424e-05, + "loss": 1.872, + "mean_token_accuracy": 0.5974915623664856, + "num_tokens": 5176762225.0, + "step": 10126 + }, + { + "epoch": 2.7385073012439154, + "grad_norm": 1.5849117040634155, + "learning_rate": 1.0050359794983522e-05, + "loss": 1.8995, + "mean_token_accuracy": 0.5725727677345276, + "num_tokens": 5177286414.0, + "step": 10127 + }, + { + "epoch": 2.7387777176852355, + "grad_norm": 1.0477335453033447, + "learning_rate": 1.0048792120870367e-05, + "loss": 1.8056, + "mean_token_accuracy": 0.5751347541809082, + "num_tokens": 5177810601.0, + "step": 10128 + }, + { + "epoch": 2.7390481341265547, + "grad_norm": 1.0211879014968872, + "learning_rate": 1.0047224475943073e-05, + "loss": 1.7768, + "mean_token_accuracy": 0.5783401727676392, + "num_tokens": 5178334800.0, + "step": 10129 + }, + { + "epoch": 2.7393185505678748, + "grad_norm": 1.5503312349319458, + "learning_rate": 1.0045656860249735e-05, + "loss": 1.9388, + "mean_token_accuracy": 0.563265860080719, + "num_tokens": 5178859069.0, + "step": 10130 + }, + { + "epoch": 2.739588967009194, + "grad_norm": 1.5194157361984253, + "learning_rate": 1.004408927383845e-05, + "loss": 1.9036, + "mean_token_accuracy": 0.56374591588974, + "num_tokens": 5179374201.0, + "step": 10131 + }, + { + "epoch": 2.739859383450514, + "grad_norm": 1.0555675029754639, + "learning_rate": 1.0042521716757315e-05, + "loss": 1.9952, + "mean_token_accuracy": 0.5484268069267273, + "num_tokens": 5179898385.0, + "step": 10132 + }, + { + "epoch": 2.7401297998918333, + "grad_norm": 1.2657657861709595, + "learning_rate": 1.004095418905443e-05, + "loss": 1.9533, + "mean_token_accuracy": 0.5572888851165771, + "num_tokens": 5180422660.0, + "step": 10133 + }, + { + "epoch": 2.740400216333153, + "grad_norm": 1.3228240013122559, + "learning_rate": 1.003938669077789e-05, + "loss": 1.8417, + "mean_token_accuracy": 0.5779587626457214, + "num_tokens": 5180946878.0, + "step": 10134 + }, + { + "epoch": 2.7406706327744725, + "grad_norm": 1.132088541984558, + "learning_rate": 1.0037819221975795e-05, + "loss": 1.7985, + "mean_token_accuracy": 0.5824549794197083, + "num_tokens": 5181471108.0, + "step": 10135 + }, + { + "epoch": 2.740941049215792, + "grad_norm": 1.3254053592681885, + "learning_rate": 1.0036251782696231e-05, + "loss": 1.8218, + "mean_token_accuracy": 0.579110324382782, + "num_tokens": 5181995336.0, + "step": 10136 + }, + { + "epoch": 2.741211465657112, + "grad_norm": 1.3480432033538818, + "learning_rate": 1.0034684372987303e-05, + "loss": 1.7135, + "mean_token_accuracy": 0.5989536046981812, + "num_tokens": 5182519604.0, + "step": 10137 + }, + { + "epoch": 2.7414818820984315, + "grad_norm": 1.3596391677856445, + "learning_rate": 1.0033116992897092e-05, + "loss": 1.9653, + "mean_token_accuracy": 0.566521406173706, + "num_tokens": 5183018848.0, + "step": 10138 + }, + { + "epoch": 2.741752298539751, + "grad_norm": 1.1133874654769897, + "learning_rate": 1.0031549642473697e-05, + "loss": 1.8751, + "mean_token_accuracy": 0.5671908259391785, + "num_tokens": 5183543018.0, + "step": 10139 + }, + { + "epoch": 2.7420227149810708, + "grad_norm": 1.3774548768997192, + "learning_rate": 1.0029982321765206e-05, + "loss": 1.8655, + "mean_token_accuracy": 0.5859021544456482, + "num_tokens": 5183985994.0, + "step": 10140 + }, + { + "epoch": 2.7422931314223904, + "grad_norm": 0.5596588850021362, + "learning_rate": 1.0028415030819712e-05, + "loss": 0.9763, + "mean_token_accuracy": 0.7285635471343994, + "num_tokens": 5184471599.0, + "step": 10141 + }, + { + "epoch": 2.74256354786371, + "grad_norm": 1.7041469812393188, + "learning_rate": 1.0026847769685297e-05, + "loss": 1.9654, + "mean_token_accuracy": 0.5455199480056763, + "num_tokens": 5184963981.0, + "step": 10142 + }, + { + "epoch": 2.7428339643050297, + "grad_norm": 1.4540778398513794, + "learning_rate": 1.0025280538410061e-05, + "loss": 1.9158, + "mean_token_accuracy": 0.5517600774765015, + "num_tokens": 5185488187.0, + "step": 10143 + }, + { + "epoch": 2.7431043807463493, + "grad_norm": 1.1727060079574585, + "learning_rate": 1.002371333704208e-05, + "loss": 1.9584, + "mean_token_accuracy": 0.5522702932357788, + "num_tokens": 5185988297.0, + "step": 10144 + }, + { + "epoch": 2.743374797187669, + "grad_norm": 1.31234872341156, + "learning_rate": 1.0022146165629447e-05, + "loss": 1.8635, + "mean_token_accuracy": 0.5658570528030396, + "num_tokens": 5186512458.0, + "step": 10145 + }, + { + "epoch": 2.7436452136289886, + "grad_norm": 1.556692123413086, + "learning_rate": 1.0020579024220244e-05, + "loss": 1.9672, + "mean_token_accuracy": 0.539601743221283, + "num_tokens": 5187002771.0, + "step": 10146 + }, + { + "epoch": 2.7439156300703083, + "grad_norm": 1.132102608680725, + "learning_rate": 1.001901191286256e-05, + "loss": 1.9752, + "mean_token_accuracy": 0.5628406405448914, + "num_tokens": 5187486100.0, + "step": 10147 + }, + { + "epoch": 2.744186046511628, + "grad_norm": 1.0838820934295654, + "learning_rate": 1.001744483160447e-05, + "loss": 1.9079, + "mean_token_accuracy": 0.5700157880783081, + "num_tokens": 5188010346.0, + "step": 10148 + }, + { + "epoch": 2.7444564629529475, + "grad_norm": 1.3293745517730713, + "learning_rate": 1.0015877780494068e-05, + "loss": 1.9171, + "mean_token_accuracy": 0.5472032427787781, + "num_tokens": 5188534545.0, + "step": 10149 + }, + { + "epoch": 2.744726879394267, + "grad_norm": 1.2597113847732544, + "learning_rate": 1.0014310759579429e-05, + "loss": 2.015, + "mean_token_accuracy": 0.5520667433738708, + "num_tokens": 5189052687.0, + "step": 10150 + }, + { + "epoch": 2.744997295835587, + "grad_norm": 1.0261448621749878, + "learning_rate": 1.0012743768908631e-05, + "loss": 2.0067, + "mean_token_accuracy": 0.5507030487060547, + "num_tokens": 5189576929.0, + "step": 10151 + }, + { + "epoch": 2.7452677122769065, + "grad_norm": 1.3639341592788696, + "learning_rate": 1.0011176808529756e-05, + "loss": 1.8233, + "mean_token_accuracy": 0.5899218320846558, + "num_tokens": 5190054460.0, + "step": 10152 + }, + { + "epoch": 2.745538128718226, + "grad_norm": 1.0354257822036743, + "learning_rate": 1.000960987849089e-05, + "loss": 1.9651, + "mean_token_accuracy": 0.5661290287971497, + "num_tokens": 5190560828.0, + "step": 10153 + }, + { + "epoch": 2.7458085451595458, + "grad_norm": 0.8455836772918701, + "learning_rate": 1.00080429788401e-05, + "loss": 1.8022, + "mean_token_accuracy": 0.5836173892021179, + "num_tokens": 5191059889.0, + "step": 10154 + }, + { + "epoch": 2.7460789616008654, + "grad_norm": 1.484438419342041, + "learning_rate": 1.0006476109625473e-05, + "loss": 1.9233, + "mean_token_accuracy": 0.5549037456512451, + "num_tokens": 5191584031.0, + "step": 10155 + }, + { + "epoch": 2.746349378042185, + "grad_norm": 1.070309042930603, + "learning_rate": 1.0004909270895083e-05, + "loss": 1.9357, + "mean_token_accuracy": 0.5706734657287598, + "num_tokens": 5192102001.0, + "step": 10156 + }, + { + "epoch": 2.7466197944835047, + "grad_norm": 1.1660820245742798, + "learning_rate": 1.0003342462696995e-05, + "loss": 1.9057, + "mean_token_accuracy": 0.5702061653137207, + "num_tokens": 5192568309.0, + "step": 10157 + }, + { + "epoch": 2.7468902109248243, + "grad_norm": 0.869835615158081, + "learning_rate": 1.0001775685079299e-05, + "loss": 1.6913, + "mean_token_accuracy": 0.5939555168151855, + "num_tokens": 5193092586.0, + "step": 10158 + }, + { + "epoch": 2.747160627366144, + "grad_norm": 1.335504174232483, + "learning_rate": 1.0000208938090054e-05, + "loss": 1.9421, + "mean_token_accuracy": 0.5810805559158325, + "num_tokens": 5193569813.0, + "step": 10159 + }, + { + "epoch": 2.7474310438074636, + "grad_norm": 1.082481026649475, + "learning_rate": 9.99864222177734e-06, + "loss": 1.9117, + "mean_token_accuracy": 0.559151291847229, + "num_tokens": 5194032640.0, + "step": 10160 + }, + { + "epoch": 2.7477014602487833, + "grad_norm": 0.4326733648777008, + "learning_rate": 9.997075536189231e-06, + "loss": 1.0466, + "mean_token_accuracy": 0.7069246172904968, + "num_tokens": 5194525662.0, + "step": 10161 + }, + { + "epoch": 2.747971876690103, + "grad_norm": 1.1983261108398438, + "learning_rate": 9.99550888137379e-06, + "loss": 1.8081, + "mean_token_accuracy": 0.5825182199478149, + "num_tokens": 5195049829.0, + "step": 10162 + }, + { + "epoch": 2.7482422931314225, + "grad_norm": 1.454618215560913, + "learning_rate": 9.993942257379095e-06, + "loss": 1.8332, + "mean_token_accuracy": 0.5799157619476318, + "num_tokens": 5195519746.0, + "step": 10163 + }, + { + "epoch": 2.748512709572742, + "grad_norm": 1.1531686782836914, + "learning_rate": 9.99237566425321e-06, + "loss": 1.8326, + "mean_token_accuracy": 0.5806315541267395, + "num_tokens": 5196026805.0, + "step": 10164 + }, + { + "epoch": 2.748783126014062, + "grad_norm": 1.3558378219604492, + "learning_rate": 9.990809102044196e-06, + "loss": 1.9522, + "mean_token_accuracy": 0.5560620427131653, + "num_tokens": 5196550972.0, + "step": 10165 + }, + { + "epoch": 2.749053542455381, + "grad_norm": 1.3739302158355713, + "learning_rate": 9.989242570800135e-06, + "loss": 2.006, + "mean_token_accuracy": 0.5550917983055115, + "num_tokens": 5197075113.0, + "step": 10166 + }, + { + "epoch": 2.749323958896701, + "grad_norm": 1.2155370712280273, + "learning_rate": 9.987676070569082e-06, + "loss": 1.9289, + "mean_token_accuracy": 0.5701001286506653, + "num_tokens": 5197586713.0, + "step": 10167 + }, + { + "epoch": 2.7495943753380203, + "grad_norm": 1.344771385192871, + "learning_rate": 9.986109601399099e-06, + "loss": 1.9221, + "mean_token_accuracy": 0.552122950553894, + "num_tokens": 5198110906.0, + "step": 10168 + }, + { + "epoch": 2.7498647917793404, + "grad_norm": 1.3161625862121582, + "learning_rate": 9.98454316333826e-06, + "loss": 1.9542, + "mean_token_accuracy": 0.552639365196228, + "num_tokens": 5198635032.0, + "step": 10169 + }, + { + "epoch": 2.7501352082206596, + "grad_norm": 1.1067532300949097, + "learning_rate": 9.982976756434625e-06, + "loss": 1.8391, + "mean_token_accuracy": 0.5723676681518555, + "num_tokens": 5199117015.0, + "step": 10170 + }, + { + "epoch": 2.7504056246619797, + "grad_norm": 1.5611995458602905, + "learning_rate": 9.981410380736248e-06, + "loss": 1.8917, + "mean_token_accuracy": 0.5671378374099731, + "num_tokens": 5199641268.0, + "step": 10171 + }, + { + "epoch": 2.750676041103299, + "grad_norm": 0.9922888278961182, + "learning_rate": 9.979844036291202e-06, + "loss": 1.8686, + "mean_token_accuracy": 0.5644541382789612, + "num_tokens": 5200165397.0, + "step": 10172 + }, + { + "epoch": 2.750946457544619, + "grad_norm": 1.139131784439087, + "learning_rate": 9.978277723147537e-06, + "loss": 1.8967, + "mean_token_accuracy": 0.5519345998764038, + "num_tokens": 5200689662.0, + "step": 10173 + }, + { + "epoch": 2.751216873985938, + "grad_norm": 1.0497119426727295, + "learning_rate": 9.976711441353318e-06, + "loss": 1.993, + "mean_token_accuracy": 0.5806175470352173, + "num_tokens": 5201150106.0, + "step": 10174 + }, + { + "epoch": 2.751487290427258, + "grad_norm": 0.8942026495933533, + "learning_rate": 9.975145190956599e-06, + "loss": 1.9004, + "mean_token_accuracy": 0.5651609897613525, + "num_tokens": 5201674382.0, + "step": 10175 + }, + { + "epoch": 2.7517577068685775, + "grad_norm": 1.0012909173965454, + "learning_rate": 9.973578972005442e-06, + "loss": 1.8024, + "mean_token_accuracy": 0.5872416496276855, + "num_tokens": 5202196570.0, + "step": 10176 + }, + { + "epoch": 2.752028123309897, + "grad_norm": 1.189164161682129, + "learning_rate": 9.972012784547898e-06, + "loss": 2.0104, + "mean_token_accuracy": 0.5722017884254456, + "num_tokens": 5202710365.0, + "step": 10177 + }, + { + "epoch": 2.7522985397512167, + "grad_norm": 1.1877495050430298, + "learning_rate": 9.970446628632028e-06, + "loss": 1.8683, + "mean_token_accuracy": 0.5761555433273315, + "num_tokens": 5203178904.0, + "step": 10178 + }, + { + "epoch": 2.7525689561925364, + "grad_norm": 1.3466428518295288, + "learning_rate": 9.968880504305882e-06, + "loss": 1.9449, + "mean_token_accuracy": 0.5580462217330933, + "num_tokens": 5203702997.0, + "step": 10179 + }, + { + "epoch": 2.752839372633856, + "grad_norm": 1.3860535621643066, + "learning_rate": 9.967314411617514e-06, + "loss": 1.822, + "mean_token_accuracy": 0.5810652375221252, + "num_tokens": 5204203386.0, + "step": 10180 + }, + { + "epoch": 2.7531097890751757, + "grad_norm": 0.4513387680053711, + "learning_rate": 9.965748350614977e-06, + "loss": 1.1107, + "mean_token_accuracy": 0.7001631259918213, + "num_tokens": 5204727544.0, + "step": 10181 + }, + { + "epoch": 2.7533802055164953, + "grad_norm": 1.4155062437057495, + "learning_rate": 9.964182321346322e-06, + "loss": 1.8901, + "mean_token_accuracy": 0.5624297857284546, + "num_tokens": 5205226899.0, + "step": 10182 + }, + { + "epoch": 2.753650621957815, + "grad_norm": 1.3187196254730225, + "learning_rate": 9.962616323859597e-06, + "loss": 1.7312, + "mean_token_accuracy": 0.5873713493347168, + "num_tokens": 5205742332.0, + "step": 10183 + }, + { + "epoch": 2.7539210383991346, + "grad_norm": 1.1981171369552612, + "learning_rate": 9.96105035820286e-06, + "loss": 1.8412, + "mean_token_accuracy": 0.5645910501480103, + "num_tokens": 5206266356.0, + "step": 10184 + }, + { + "epoch": 2.7541914548404542, + "grad_norm": 1.0710376501083374, + "learning_rate": 9.959484424424147e-06, + "loss": 1.8839, + "mean_token_accuracy": 0.5730176568031311, + "num_tokens": 5206790481.0, + "step": 10185 + }, + { + "epoch": 2.754461871281774, + "grad_norm": 1.5222419500350952, + "learning_rate": 9.957918522571518e-06, + "loss": 1.7321, + "mean_token_accuracy": 0.5897373557090759, + "num_tokens": 5207294485.0, + "step": 10186 + }, + { + "epoch": 2.7547322877230935, + "grad_norm": 1.4609453678131104, + "learning_rate": 9.956352652693008e-06, + "loss": 1.8197, + "mean_token_accuracy": 0.5676918029785156, + "num_tokens": 5207799164.0, + "step": 10187 + }, + { + "epoch": 2.755002704164413, + "grad_norm": 1.1238744258880615, + "learning_rate": 9.95478681483667e-06, + "loss": 1.9783, + "mean_token_accuracy": 0.5590828657150269, + "num_tokens": 5208271556.0, + "step": 10188 + }, + { + "epoch": 2.755273120605733, + "grad_norm": 1.229563593864441, + "learning_rate": 9.953221009050544e-06, + "loss": 1.8114, + "mean_token_accuracy": 0.5769720077514648, + "num_tokens": 5208795730.0, + "step": 10189 + }, + { + "epoch": 2.7555435370470525, + "grad_norm": 1.538590908050537, + "learning_rate": 9.95165523538268e-06, + "loss": 1.8845, + "mean_token_accuracy": 0.5714141130447388, + "num_tokens": 5209319972.0, + "step": 10190 + }, + { + "epoch": 2.755813953488372, + "grad_norm": 0.9063197374343872, + "learning_rate": 9.950089493881111e-06, + "loss": 1.7708, + "mean_token_accuracy": 0.5748646259307861, + "num_tokens": 5209844119.0, + "step": 10191 + }, + { + "epoch": 2.7560843699296917, + "grad_norm": 1.050449013710022, + "learning_rate": 9.94852378459389e-06, + "loss": 1.8296, + "mean_token_accuracy": 0.5661283731460571, + "num_tokens": 5210368356.0, + "step": 10192 + }, + { + "epoch": 2.7563547863710114, + "grad_norm": 1.3615243434906006, + "learning_rate": 9.94695810756905e-06, + "loss": 1.8595, + "mean_token_accuracy": 0.5639320015907288, + "num_tokens": 5210892601.0, + "step": 10193 + }, + { + "epoch": 2.756625202812331, + "grad_norm": 1.0010830163955688, + "learning_rate": 9.945392462854628e-06, + "loss": 1.6868, + "mean_token_accuracy": 0.6223588585853577, + "num_tokens": 5211415809.0, + "step": 10194 + }, + { + "epoch": 2.7568956192536507, + "grad_norm": 1.318261742591858, + "learning_rate": 9.94382685049867e-06, + "loss": 1.9878, + "mean_token_accuracy": 0.5484504699707031, + "num_tokens": 5211939962.0, + "step": 10195 + }, + { + "epoch": 2.7571660356949703, + "grad_norm": 1.3815507888793945, + "learning_rate": 9.942261270549211e-06, + "loss": 1.8943, + "mean_token_accuracy": 0.5604000687599182, + "num_tokens": 5212464238.0, + "step": 10196 + }, + { + "epoch": 2.75743645213629, + "grad_norm": 1.0790129899978638, + "learning_rate": 9.940695723054284e-06, + "loss": 1.8155, + "mean_token_accuracy": 0.570008397102356, + "num_tokens": 5212988373.0, + "step": 10197 + }, + { + "epoch": 2.7577068685776096, + "grad_norm": 1.1601293087005615, + "learning_rate": 9.93913020806193e-06, + "loss": 1.9808, + "mean_token_accuracy": 0.54177325963974, + "num_tokens": 5213497516.0, + "step": 10198 + }, + { + "epoch": 2.7579772850189292, + "grad_norm": 1.2037941217422485, + "learning_rate": 9.93756472562018e-06, + "loss": 1.8947, + "mean_token_accuracy": 0.5685034990310669, + "num_tokens": 5214021790.0, + "step": 10199 + }, + { + "epoch": 2.758247701460249, + "grad_norm": 1.0748423337936401, + "learning_rate": 9.935999275777068e-06, + "loss": 1.8784, + "mean_token_accuracy": 0.5841564536094666, + "num_tokens": 5214506727.0, + "step": 10200 + }, + { + "epoch": 2.7585181179015685, + "grad_norm": 0.4653514325618744, + "learning_rate": 9.934433858580628e-06, + "loss": 1.1285, + "mean_token_accuracy": 0.6965783834457397, + "num_tokens": 5215030934.0, + "step": 10201 + }, + { + "epoch": 2.758788534342888, + "grad_norm": 1.9954686164855957, + "learning_rate": 9.932868474078891e-06, + "loss": 1.9295, + "mean_token_accuracy": 0.5693224668502808, + "num_tokens": 5215555192.0, + "step": 10202 + }, + { + "epoch": 2.759058950784208, + "grad_norm": 1.92921781539917, + "learning_rate": 9.931303122319885e-06, + "loss": 1.8072, + "mean_token_accuracy": 0.5940327644348145, + "num_tokens": 5216079357.0, + "step": 10203 + }, + { + "epoch": 2.7593293672255275, + "grad_norm": 1.0271533727645874, + "learning_rate": 9.929737803351647e-06, + "loss": 1.8496, + "mean_token_accuracy": 0.5796927213668823, + "num_tokens": 5216603636.0, + "step": 10204 + }, + { + "epoch": 2.759599783666847, + "grad_norm": 1.6163365840911865, + "learning_rate": 9.928172517222198e-06, + "loss": 1.8941, + "mean_token_accuracy": 0.5761122107505798, + "num_tokens": 5217102837.0, + "step": 10205 + }, + { + "epoch": 2.7598702001081667, + "grad_norm": 1.4125462770462036, + "learning_rate": 9.926607263979566e-06, + "loss": 2.0134, + "mean_token_accuracy": 0.5307372808456421, + "num_tokens": 5217627087.0, + "step": 10206 + }, + { + "epoch": 2.760140616549486, + "grad_norm": 1.1169127225875854, + "learning_rate": 9.925042043671783e-06, + "loss": 1.8512, + "mean_token_accuracy": 0.5770447254180908, + "num_tokens": 5218151367.0, + "step": 10207 + }, + { + "epoch": 2.760411032990806, + "grad_norm": 1.3227927684783936, + "learning_rate": 9.923476856346867e-06, + "loss": 1.8977, + "mean_token_accuracy": 0.572705864906311, + "num_tokens": 5218675614.0, + "step": 10208 + }, + { + "epoch": 2.7606814494321252, + "grad_norm": 1.372928261756897, + "learning_rate": 9.921911702052852e-06, + "loss": 1.8916, + "mean_token_accuracy": 0.5745424032211304, + "num_tokens": 5219178023.0, + "step": 10209 + }, + { + "epoch": 2.7609518658734453, + "grad_norm": 1.1154792308807373, + "learning_rate": 9.920346580837754e-06, + "loss": 1.9344, + "mean_token_accuracy": 0.5624598264694214, + "num_tokens": 5219702165.0, + "step": 10210 + }, + { + "epoch": 2.7612222823147645, + "grad_norm": 1.2470484972000122, + "learning_rate": 9.918781492749596e-06, + "loss": 1.9071, + "mean_token_accuracy": 0.566316545009613, + "num_tokens": 5220226409.0, + "step": 10211 + }, + { + "epoch": 2.7614926987560846, + "grad_norm": 1.053774356842041, + "learning_rate": 9.917216437836405e-06, + "loss": 1.9451, + "mean_token_accuracy": 0.5801759958267212, + "num_tokens": 5220687257.0, + "step": 10212 + }, + { + "epoch": 2.761763115197404, + "grad_norm": 1.0063976049423218, + "learning_rate": 9.9156514161462e-06, + "loss": 1.7681, + "mean_token_accuracy": 0.5817097425460815, + "num_tokens": 5221207116.0, + "step": 10213 + }, + { + "epoch": 2.762033531638724, + "grad_norm": 1.5809849500656128, + "learning_rate": 9.914086427726992e-06, + "loss": 1.9629, + "mean_token_accuracy": 0.5552070140838623, + "num_tokens": 5221731262.0, + "step": 10214 + }, + { + "epoch": 2.762303948080043, + "grad_norm": 1.1232393980026245, + "learning_rate": 9.91252147262681e-06, + "loss": 1.9417, + "mean_token_accuracy": 0.5568988919258118, + "num_tokens": 5222199896.0, + "step": 10215 + }, + { + "epoch": 2.7625743645213627, + "grad_norm": 1.1192415952682495, + "learning_rate": 9.910956550893666e-06, + "loss": 1.7878, + "mean_token_accuracy": 0.5904816389083862, + "num_tokens": 5222711340.0, + "step": 10216 + }, + { + "epoch": 2.7628447809626824, + "grad_norm": 1.3134572505950928, + "learning_rate": 9.909391662575576e-06, + "loss": 1.9164, + "mean_token_accuracy": 0.556402862071991, + "num_tokens": 5223235502.0, + "step": 10217 + }, + { + "epoch": 2.763115197404002, + "grad_norm": 1.1143214702606201, + "learning_rate": 9.90782680772056e-06, + "loss": 1.8557, + "mean_token_accuracy": 0.5843122005462646, + "num_tokens": 5223759786.0, + "step": 10218 + }, + { + "epoch": 2.7633856138453217, + "grad_norm": 1.0632834434509277, + "learning_rate": 9.90626198637663e-06, + "loss": 1.782, + "mean_token_accuracy": 0.5925049781799316, + "num_tokens": 5224264207.0, + "step": 10219 + }, + { + "epoch": 2.7636560302866413, + "grad_norm": 1.1944060325622559, + "learning_rate": 9.904697198591794e-06, + "loss": 1.8678, + "mean_token_accuracy": 0.5868676900863647, + "num_tokens": 5224762333.0, + "step": 10220 + }, + { + "epoch": 2.763926446727961, + "grad_norm": 0.4413280189037323, + "learning_rate": 9.903132444414077e-06, + "loss": 1.1325, + "mean_token_accuracy": 0.6918028593063354, + "num_tokens": 5225286477.0, + "step": 10221 + }, + { + "epoch": 2.7641968631692806, + "grad_norm": 1.3223010301589966, + "learning_rate": 9.901567723891477e-06, + "loss": 1.8559, + "mean_token_accuracy": 0.5637112855911255, + "num_tokens": 5225810715.0, + "step": 10222 + }, + { + "epoch": 2.7644672796106002, + "grad_norm": 1.289085030555725, + "learning_rate": 9.900003037072008e-06, + "loss": 1.79, + "mean_token_accuracy": 0.5859658122062683, + "num_tokens": 5226268180.0, + "step": 10223 + }, + { + "epoch": 2.76473769605192, + "grad_norm": 0.9782171249389648, + "learning_rate": 9.898438384003681e-06, + "loss": 1.8285, + "mean_token_accuracy": 0.5861940383911133, + "num_tokens": 5226792457.0, + "step": 10224 + }, + { + "epoch": 2.7650081124932395, + "grad_norm": 1.083433747291565, + "learning_rate": 9.896873764734509e-06, + "loss": 1.6874, + "mean_token_accuracy": 0.6142687797546387, + "num_tokens": 5227316726.0, + "step": 10225 + }, + { + "epoch": 2.765278528934559, + "grad_norm": 1.3259519338607788, + "learning_rate": 9.895309179312489e-06, + "loss": 1.9264, + "mean_token_accuracy": 0.5641668438911438, + "num_tokens": 5227810757.0, + "step": 10226 + }, + { + "epoch": 2.765548945375879, + "grad_norm": 1.3316696882247925, + "learning_rate": 9.893744627785635e-06, + "loss": 2.0209, + "mean_token_accuracy": 0.5557233095169067, + "num_tokens": 5228335028.0, + "step": 10227 + }, + { + "epoch": 2.7658193618171985, + "grad_norm": 0.873411238193512, + "learning_rate": 9.892180110201948e-06, + "loss": 1.8805, + "mean_token_accuracy": 0.5646084547042847, + "num_tokens": 5228859306.0, + "step": 10228 + }, + { + "epoch": 2.766089778258518, + "grad_norm": 1.1713790893554688, + "learning_rate": 9.89061562660943e-06, + "loss": 1.8826, + "mean_token_accuracy": 0.5744049549102783, + "num_tokens": 5229380057.0, + "step": 10229 + }, + { + "epoch": 2.7663601946998377, + "grad_norm": 1.125638723373413, + "learning_rate": 9.88905117705609e-06, + "loss": 1.946, + "mean_token_accuracy": 0.5470950603485107, + "num_tokens": 5229904178.0, + "step": 10230 + }, + { + "epoch": 2.7666306111411574, + "grad_norm": 1.0717030763626099, + "learning_rate": 9.887486761589927e-06, + "loss": 1.8789, + "mean_token_accuracy": 0.5641351342201233, + "num_tokens": 5230428432.0, + "step": 10231 + }, + { + "epoch": 2.766901027582477, + "grad_norm": 1.069867730140686, + "learning_rate": 9.885922380258941e-06, + "loss": 1.905, + "mean_token_accuracy": 0.5481895208358765, + "num_tokens": 5230952509.0, + "step": 10232 + }, + { + "epoch": 2.7671714440237967, + "grad_norm": 0.9839425086975098, + "learning_rate": 9.884358033111136e-06, + "loss": 1.8248, + "mean_token_accuracy": 0.5742335319519043, + "num_tokens": 5231476794.0, + "step": 10233 + }, + { + "epoch": 2.7674418604651163, + "grad_norm": 1.1001964807510376, + "learning_rate": 9.8827937201945e-06, + "loss": 1.8971, + "mean_token_accuracy": 0.5756255388259888, + "num_tokens": 5231975809.0, + "step": 10234 + }, + { + "epoch": 2.767712276906436, + "grad_norm": 1.0255019664764404, + "learning_rate": 9.881229441557046e-06, + "loss": 1.9603, + "mean_token_accuracy": 0.5450694561004639, + "num_tokens": 5232500040.0, + "step": 10235 + }, + { + "epoch": 2.7679826933477556, + "grad_norm": 1.0025792121887207, + "learning_rate": 9.879665197246762e-06, + "loss": 1.708, + "mean_token_accuracy": 0.5825685262680054, + "num_tokens": 5233024192.0, + "step": 10236 + }, + { + "epoch": 2.7682531097890752, + "grad_norm": 1.2544063329696655, + "learning_rate": 9.87810098731164e-06, + "loss": 1.9389, + "mean_token_accuracy": 0.5543735027313232, + "num_tokens": 5233548368.0, + "step": 10237 + }, + { + "epoch": 2.768523526230395, + "grad_norm": 1.0309529304504395, + "learning_rate": 9.87653681179968e-06, + "loss": 1.8768, + "mean_token_accuracy": 0.5677508115768433, + "num_tokens": 5234072649.0, + "step": 10238 + }, + { + "epoch": 2.7687939426717145, + "grad_norm": 1.1890335083007812, + "learning_rate": 9.874972670758878e-06, + "loss": 1.8621, + "mean_token_accuracy": 0.5627553462982178, + "num_tokens": 5234595900.0, + "step": 10239 + }, + { + "epoch": 2.769064359113034, + "grad_norm": 1.194354772567749, + "learning_rate": 9.87340856423722e-06, + "loss": 1.9063, + "mean_token_accuracy": 0.5725187659263611, + "num_tokens": 5235114681.0, + "step": 10240 + }, + { + "epoch": 2.769334775554354, + "grad_norm": 0.5530403852462769, + "learning_rate": 9.8718444922827e-06, + "loss": 1.1643, + "mean_token_accuracy": 0.684539794921875, + "num_tokens": 5235638964.0, + "step": 10241 + }, + { + "epoch": 2.7696051919956735, + "grad_norm": 1.2667694091796875, + "learning_rate": 9.870280454943313e-06, + "loss": 1.8039, + "mean_token_accuracy": 0.5933828949928284, + "num_tokens": 5236163126.0, + "step": 10242 + }, + { + "epoch": 2.769875608436993, + "grad_norm": 1.2552058696746826, + "learning_rate": 9.868716452267038e-06, + "loss": 1.8827, + "mean_token_accuracy": 0.5789755582809448, + "num_tokens": 5236653222.0, + "step": 10243 + }, + { + "epoch": 2.7701460248783127, + "grad_norm": 1.0154101848602295, + "learning_rate": 9.867152484301872e-06, + "loss": 1.9279, + "mean_token_accuracy": 0.5659030079841614, + "num_tokens": 5237177397.0, + "step": 10244 + }, + { + "epoch": 2.7704164413196324, + "grad_norm": 1.2221193313598633, + "learning_rate": 9.8655885510958e-06, + "loss": 1.9536, + "mean_token_accuracy": 0.5402791500091553, + "num_tokens": 5237701676.0, + "step": 10245 + }, + { + "epoch": 2.770686857760952, + "grad_norm": 1.078748106956482, + "learning_rate": 9.864024652696802e-06, + "loss": 1.8432, + "mean_token_accuracy": 0.5670191049575806, + "num_tokens": 5238225961.0, + "step": 10246 + }, + { + "epoch": 2.7709572742022717, + "grad_norm": 1.19475519657135, + "learning_rate": 9.862460789152877e-06, + "loss": 1.9506, + "mean_token_accuracy": 0.5457367300987244, + "num_tokens": 5238750056.0, + "step": 10247 + }, + { + "epoch": 2.771227690643591, + "grad_norm": 1.098604440689087, + "learning_rate": 9.860896960511996e-06, + "loss": 1.8881, + "mean_token_accuracy": 0.5664734244346619, + "num_tokens": 5239214185.0, + "step": 10248 + }, + { + "epoch": 2.771498107084911, + "grad_norm": 1.2294069528579712, + "learning_rate": 9.859333166822145e-06, + "loss": 1.8617, + "mean_token_accuracy": 0.5838532447814941, + "num_tokens": 5239738371.0, + "step": 10249 + }, + { + "epoch": 2.77176852352623, + "grad_norm": 1.1513094902038574, + "learning_rate": 9.857769408131309e-06, + "loss": 1.8055, + "mean_token_accuracy": 0.5827575922012329, + "num_tokens": 5240262594.0, + "step": 10250 + }, + { + "epoch": 2.7720389399675502, + "grad_norm": 1.1018438339233398, + "learning_rate": 9.856205684487466e-06, + "loss": 1.8054, + "mean_token_accuracy": 0.5961217284202576, + "num_tokens": 5240783054.0, + "step": 10251 + }, + { + "epoch": 2.7723093564088694, + "grad_norm": 1.1326457262039185, + "learning_rate": 9.854641995938593e-06, + "loss": 1.9364, + "mean_token_accuracy": 0.5662005543708801, + "num_tokens": 5241248248.0, + "step": 10252 + }, + { + "epoch": 2.7725797728501895, + "grad_norm": 0.997343897819519, + "learning_rate": 9.853078342532675e-06, + "loss": 1.8382, + "mean_token_accuracy": 0.5794060826301575, + "num_tokens": 5241772438.0, + "step": 10253 + }, + { + "epoch": 2.7728501892915087, + "grad_norm": 1.0140546560287476, + "learning_rate": 9.851514724317684e-06, + "loss": 1.8733, + "mean_token_accuracy": 0.56809002161026, + "num_tokens": 5242296650.0, + "step": 10254 + }, + { + "epoch": 2.773120605732829, + "grad_norm": 1.0510390996932983, + "learning_rate": 9.849951141341603e-06, + "loss": 1.9299, + "mean_token_accuracy": 0.5611165761947632, + "num_tokens": 5242820913.0, + "step": 10255 + }, + { + "epoch": 2.773391022174148, + "grad_norm": 1.0801032781600952, + "learning_rate": 9.848387593652402e-06, + "loss": 1.7913, + "mean_token_accuracy": 0.5774002075195312, + "num_tokens": 5243345113.0, + "step": 10256 + }, + { + "epoch": 2.7736614386154677, + "grad_norm": 1.1097748279571533, + "learning_rate": 9.846824081298054e-06, + "loss": 1.8655, + "mean_token_accuracy": 0.5773792266845703, + "num_tokens": 5243869365.0, + "step": 10257 + }, + { + "epoch": 2.7739318550567873, + "grad_norm": 1.0488955974578857, + "learning_rate": 9.845260604326537e-06, + "loss": 1.8536, + "mean_token_accuracy": 0.5670477151870728, + "num_tokens": 5244393586.0, + "step": 10258 + }, + { + "epoch": 2.774202271498107, + "grad_norm": 1.298087239265442, + "learning_rate": 9.84369716278582e-06, + "loss": 2.0126, + "mean_token_accuracy": 0.5474071502685547, + "num_tokens": 5244917777.0, + "step": 10259 + }, + { + "epoch": 2.7744726879394266, + "grad_norm": 1.332668662071228, + "learning_rate": 9.84213375672387e-06, + "loss": 1.8262, + "mean_token_accuracy": 0.5804107189178467, + "num_tokens": 5245441996.0, + "step": 10260 + }, + { + "epoch": 2.7747431043807462, + "grad_norm": 0.4912553131580353, + "learning_rate": 9.840570386188667e-06, + "loss": 1.1591, + "mean_token_accuracy": 0.6849253177642822, + "num_tokens": 5245966057.0, + "step": 10261 + }, + { + "epoch": 2.775013520822066, + "grad_norm": 1.672560453414917, + "learning_rate": 9.839007051228172e-06, + "loss": 1.9934, + "mean_token_accuracy": 0.5554161071777344, + "num_tokens": 5246458916.0, + "step": 10262 + }, + { + "epoch": 2.7752839372633855, + "grad_norm": 1.2247086763381958, + "learning_rate": 9.837443751890354e-06, + "loss": 1.8753, + "mean_token_accuracy": 0.5717501640319824, + "num_tokens": 5246983094.0, + "step": 10263 + }, + { + "epoch": 2.775554353704705, + "grad_norm": 1.056445598602295, + "learning_rate": 9.835880488223184e-06, + "loss": 1.9809, + "mean_token_accuracy": 0.5580496788024902, + "num_tokens": 5247507335.0, + "step": 10264 + }, + { + "epoch": 2.775824770146025, + "grad_norm": 1.2981278896331787, + "learning_rate": 9.834317260274622e-06, + "loss": 1.9299, + "mean_token_accuracy": 0.5596095323562622, + "num_tokens": 5248031497.0, + "step": 10265 + }, + { + "epoch": 2.7760951865873444, + "grad_norm": 1.1262333393096924, + "learning_rate": 9.832754068092634e-06, + "loss": 1.8045, + "mean_token_accuracy": 0.5781542658805847, + "num_tokens": 5248555745.0, + "step": 10266 + }, + { + "epoch": 2.776365603028664, + "grad_norm": 1.1464020013809204, + "learning_rate": 9.831190911725183e-06, + "loss": 1.8926, + "mean_token_accuracy": 0.56620854139328, + "num_tokens": 5249079947.0, + "step": 10267 + }, + { + "epoch": 2.7766360194699837, + "grad_norm": 1.0158967971801758, + "learning_rate": 9.829627791220236e-06, + "loss": 1.8977, + "mean_token_accuracy": 0.5563598871231079, + "num_tokens": 5249604190.0, + "step": 10268 + }, + { + "epoch": 2.7769064359113034, + "grad_norm": 0.9716739058494568, + "learning_rate": 9.828064706625744e-06, + "loss": 1.8476, + "mean_token_accuracy": 0.5764837265014648, + "num_tokens": 5250128422.0, + "step": 10269 + }, + { + "epoch": 2.777176852352623, + "grad_norm": 1.015226125717163, + "learning_rate": 9.826501657989677e-06, + "loss": 1.7982, + "mean_token_accuracy": 0.5999569296836853, + "num_tokens": 5250567947.0, + "step": 10270 + }, + { + "epoch": 2.7774472687939427, + "grad_norm": 1.0909814834594727, + "learning_rate": 9.824938645359993e-06, + "loss": 1.9016, + "mean_token_accuracy": 0.5862947702407837, + "num_tokens": 5251053853.0, + "step": 10271 + }, + { + "epoch": 2.7777176852352623, + "grad_norm": 1.0362745523452759, + "learning_rate": 9.82337566878464e-06, + "loss": 1.9512, + "mean_token_accuracy": 0.5702204704284668, + "num_tokens": 5251543179.0, + "step": 10272 + }, + { + "epoch": 2.777988101676582, + "grad_norm": 1.1108967065811157, + "learning_rate": 9.821812728311585e-06, + "loss": 1.9023, + "mean_token_accuracy": 0.5802688002586365, + "num_tokens": 5252005645.0, + "step": 10273 + }, + { + "epoch": 2.7782585181179016, + "grad_norm": 1.0963199138641357, + "learning_rate": 9.820249823988786e-06, + "loss": 1.913, + "mean_token_accuracy": 0.5754001140594482, + "num_tokens": 5252513077.0, + "step": 10274 + }, + { + "epoch": 2.7785289345592212, + "grad_norm": 1.1164093017578125, + "learning_rate": 9.818686955864184e-06, + "loss": 1.8415, + "mean_token_accuracy": 0.5702887773513794, + "num_tokens": 5253037064.0, + "step": 10275 + }, + { + "epoch": 2.778799351000541, + "grad_norm": 1.0354331731796265, + "learning_rate": 9.817124123985742e-06, + "loss": 1.8814, + "mean_token_accuracy": 0.5783972144126892, + "num_tokens": 5253561236.0, + "step": 10276 + }, + { + "epoch": 2.7790697674418605, + "grad_norm": 1.1929222345352173, + "learning_rate": 9.81556132840141e-06, + "loss": 1.9439, + "mean_token_accuracy": 0.5681688785552979, + "num_tokens": 5254085342.0, + "step": 10277 + }, + { + "epoch": 2.77934018388318, + "grad_norm": 1.0583019256591797, + "learning_rate": 9.813998569159145e-06, + "loss": 1.8478, + "mean_token_accuracy": 0.593253493309021, + "num_tokens": 5254569709.0, + "step": 10278 + }, + { + "epoch": 2.7796106003245, + "grad_norm": 1.0385613441467285, + "learning_rate": 9.812435846306884e-06, + "loss": 1.9495, + "mean_token_accuracy": 0.5598019361495972, + "num_tokens": 5255059827.0, + "step": 10279 + }, + { + "epoch": 2.7798810167658194, + "grad_norm": 1.112192153930664, + "learning_rate": 9.810873159892589e-06, + "loss": 1.8455, + "mean_token_accuracy": 0.5801919102668762, + "num_tokens": 5255584065.0, + "step": 10280 + }, + { + "epoch": 2.780151433207139, + "grad_norm": 0.4643997251987457, + "learning_rate": 9.809310509964199e-06, + "loss": 1.0496, + "mean_token_accuracy": 0.7117511034011841, + "num_tokens": 5256108245.0, + "step": 10281 + }, + { + "epoch": 2.7804218496484587, + "grad_norm": 1.2917957305908203, + "learning_rate": 9.807747896569668e-06, + "loss": 1.9048, + "mean_token_accuracy": 0.5673900246620178, + "num_tokens": 5256632478.0, + "step": 10282 + }, + { + "epoch": 2.7806922660897784, + "grad_norm": 1.2580034732818604, + "learning_rate": 9.806185319756933e-06, + "loss": 1.8022, + "mean_token_accuracy": 0.5827484726905823, + "num_tokens": 5257156683.0, + "step": 10283 + }, + { + "epoch": 2.780962682531098, + "grad_norm": 1.1123298406600952, + "learning_rate": 9.804622779573949e-06, + "loss": 1.8744, + "mean_token_accuracy": 0.5718494653701782, + "num_tokens": 5257670755.0, + "step": 10284 + }, + { + "epoch": 2.7812330989724177, + "grad_norm": 1.0003266334533691, + "learning_rate": 9.803060276068653e-06, + "loss": 1.9531, + "mean_token_accuracy": 0.5482146143913269, + "num_tokens": 5258154868.0, + "step": 10285 + }, + { + "epoch": 2.7815035154137373, + "grad_norm": 1.052691102027893, + "learning_rate": 9.80149780928898e-06, + "loss": 1.9094, + "mean_token_accuracy": 0.5733944177627563, + "num_tokens": 5258655422.0, + "step": 10286 + }, + { + "epoch": 2.781773931855057, + "grad_norm": 1.3226672410964966, + "learning_rate": 9.799935379282883e-06, + "loss": 1.9316, + "mean_token_accuracy": 0.5580052137374878, + "num_tokens": 5259179675.0, + "step": 10287 + }, + { + "epoch": 2.7820443482963766, + "grad_norm": 1.0850285291671753, + "learning_rate": 9.7983729860983e-06, + "loss": 1.9819, + "mean_token_accuracy": 0.545964241027832, + "num_tokens": 5259703751.0, + "step": 10288 + }, + { + "epoch": 2.782314764737696, + "grad_norm": 1.5111751556396484, + "learning_rate": 9.796810629783167e-06, + "loss": 1.9217, + "mean_token_accuracy": 0.5462905764579773, + "num_tokens": 5260186868.0, + "step": 10289 + }, + { + "epoch": 2.782585181179016, + "grad_norm": 1.1589843034744263, + "learning_rate": 9.795248310385422e-06, + "loss": 1.8647, + "mean_token_accuracy": 0.5786193609237671, + "num_tokens": 5260711124.0, + "step": 10290 + }, + { + "epoch": 2.782855597620335, + "grad_norm": 1.130595326423645, + "learning_rate": 9.793686027953003e-06, + "loss": 1.8919, + "mean_token_accuracy": 0.566411018371582, + "num_tokens": 5261235400.0, + "step": 10291 + }, + { + "epoch": 2.783126014061655, + "grad_norm": 1.0433893203735352, + "learning_rate": 9.792123782533838e-06, + "loss": 1.8655, + "mean_token_accuracy": 0.5567278861999512, + "num_tokens": 5261759565.0, + "step": 10292 + }, + { + "epoch": 2.7833964305029744, + "grad_norm": 1.1288340091705322, + "learning_rate": 9.790561574175873e-06, + "loss": 1.7584, + "mean_token_accuracy": 0.5755770206451416, + "num_tokens": 5262279541.0, + "step": 10293 + }, + { + "epoch": 2.7836668469442944, + "grad_norm": 1.3378008604049683, + "learning_rate": 9.788999402927035e-06, + "loss": 1.8995, + "mean_token_accuracy": 0.563707709312439, + "num_tokens": 5262803725.0, + "step": 10294 + }, + { + "epoch": 2.7839372633856136, + "grad_norm": 1.6363859176635742, + "learning_rate": 9.787437268835254e-06, + "loss": 1.5322, + "mean_token_accuracy": 0.6623499393463135, + "num_tokens": 5263327989.0, + "step": 10295 + }, + { + "epoch": 2.7842076798269337, + "grad_norm": 1.156850814819336, + "learning_rate": 9.78587517194847e-06, + "loss": 1.8918, + "mean_token_accuracy": 0.5589255094528198, + "num_tokens": 5263852157.0, + "step": 10296 + }, + { + "epoch": 2.784478096268253, + "grad_norm": 1.260687232017517, + "learning_rate": 9.784313112314601e-06, + "loss": 1.8505, + "mean_token_accuracy": 0.5625017881393433, + "num_tokens": 5264376380.0, + "step": 10297 + }, + { + "epoch": 2.7847485127095726, + "grad_norm": 1.0232789516448975, + "learning_rate": 9.782751089981579e-06, + "loss": 1.8987, + "mean_token_accuracy": 0.5559813976287842, + "num_tokens": 5264900596.0, + "step": 10298 + }, + { + "epoch": 2.785018929150892, + "grad_norm": 1.1587958335876465, + "learning_rate": 9.781189104997337e-06, + "loss": 1.9328, + "mean_token_accuracy": 0.5568062663078308, + "num_tokens": 5265382158.0, + "step": 10299 + }, + { + "epoch": 2.785289345592212, + "grad_norm": 1.0952703952789307, + "learning_rate": 9.779627157409795e-06, + "loss": 1.8659, + "mean_token_accuracy": 0.5766270160675049, + "num_tokens": 5265906254.0, + "step": 10300 + }, + { + "epoch": 2.7855597620335315, + "grad_norm": 0.4522072672843933, + "learning_rate": 9.778065247266884e-06, + "loss": 1.1486, + "mean_token_accuracy": 0.6917673945426941, + "num_tokens": 5266418117.0, + "step": 10301 + }, + { + "epoch": 2.785830178474851, + "grad_norm": 1.4798693656921387, + "learning_rate": 9.776503374616521e-06, + "loss": 1.8937, + "mean_token_accuracy": 0.5792044997215271, + "num_tokens": 5266877468.0, + "step": 10302 + }, + { + "epoch": 2.786100594916171, + "grad_norm": 1.1297693252563477, + "learning_rate": 9.774941539506634e-06, + "loss": 1.7879, + "mean_token_accuracy": 0.5778124332427979, + "num_tokens": 5267401720.0, + "step": 10303 + }, + { + "epoch": 2.7863710113574904, + "grad_norm": 0.967635989189148, + "learning_rate": 9.773379741985145e-06, + "loss": 1.8068, + "mean_token_accuracy": 0.5793746113777161, + "num_tokens": 5267925966.0, + "step": 10304 + }, + { + "epoch": 2.78664142779881, + "grad_norm": 1.1772634983062744, + "learning_rate": 9.77181798209997e-06, + "loss": 1.8215, + "mean_token_accuracy": 0.5767815113067627, + "num_tokens": 5268450037.0, + "step": 10305 + }, + { + "epoch": 2.7869118442401297, + "grad_norm": 1.3128489255905151, + "learning_rate": 9.77025625989903e-06, + "loss": 1.9811, + "mean_token_accuracy": 0.5400331020355225, + "num_tokens": 5268974254.0, + "step": 10306 + }, + { + "epoch": 2.7871822606814494, + "grad_norm": 1.1213375329971313, + "learning_rate": 9.768694575430251e-06, + "loss": 1.7795, + "mean_token_accuracy": 0.5907069444656372, + "num_tokens": 5269450586.0, + "step": 10307 + }, + { + "epoch": 2.787452677122769, + "grad_norm": 1.5577894449234009, + "learning_rate": 9.767132928741538e-06, + "loss": 2.0744, + "mean_token_accuracy": 0.5486934185028076, + "num_tokens": 5269958845.0, + "step": 10308 + }, + { + "epoch": 2.7877230935640886, + "grad_norm": 4.935154914855957, + "learning_rate": 9.765571319880813e-06, + "loss": 1.6827, + "mean_token_accuracy": 0.6206418871879578, + "num_tokens": 5270483018.0, + "step": 10309 + }, + { + "epoch": 2.7879935100054083, + "grad_norm": 1.278552532196045, + "learning_rate": 9.764009748895993e-06, + "loss": 1.7904, + "mean_token_accuracy": 0.5908388495445251, + "num_tokens": 5270962837.0, + "step": 10310 + }, + { + "epoch": 2.788263926446728, + "grad_norm": 1.1649887561798096, + "learning_rate": 9.762448215834987e-06, + "loss": 1.8268, + "mean_token_accuracy": 0.5704101324081421, + "num_tokens": 5271487039.0, + "step": 10311 + }, + { + "epoch": 2.7885343428880476, + "grad_norm": 1.0427830219268799, + "learning_rate": 9.760886720745705e-06, + "loss": 1.8564, + "mean_token_accuracy": 0.5450155138969421, + "num_tokens": 5272011303.0, + "step": 10312 + }, + { + "epoch": 2.7888047593293672, + "grad_norm": 1.4423751831054688, + "learning_rate": 9.75932526367607e-06, + "loss": 1.8599, + "mean_token_accuracy": 0.5727324485778809, + "num_tokens": 5272535414.0, + "step": 10313 + }, + { + "epoch": 2.789075175770687, + "grad_norm": 1.1513255834579468, + "learning_rate": 9.75776384467398e-06, + "loss": 1.8813, + "mean_token_accuracy": 0.5557050704956055, + "num_tokens": 5273034702.0, + "step": 10314 + }, + { + "epoch": 2.7893455922120065, + "grad_norm": 1.136412262916565, + "learning_rate": 9.756202463787348e-06, + "loss": 1.8382, + "mean_token_accuracy": 0.5887148380279541, + "num_tokens": 5273549815.0, + "step": 10315 + }, + { + "epoch": 2.789616008653326, + "grad_norm": 1.2855409383773804, + "learning_rate": 9.754641121064083e-06, + "loss": 1.8448, + "mean_token_accuracy": 0.5864697694778442, + "num_tokens": 5274034536.0, + "step": 10316 + }, + { + "epoch": 2.789886425094646, + "grad_norm": 0.9520747065544128, + "learning_rate": 9.753079816552093e-06, + "loss": 1.834, + "mean_token_accuracy": 0.5713016986846924, + "num_tokens": 5274558767.0, + "step": 10317 + }, + { + "epoch": 2.7901568415359654, + "grad_norm": 1.0522915124893188, + "learning_rate": 9.751518550299277e-06, + "loss": 2.04, + "mean_token_accuracy": 0.5340543985366821, + "num_tokens": 5275082927.0, + "step": 10318 + }, + { + "epoch": 2.790427257977285, + "grad_norm": 1.050158143043518, + "learning_rate": 9.749957322353547e-06, + "loss": 1.9071, + "mean_token_accuracy": 0.5647430419921875, + "num_tokens": 5275607194.0, + "step": 10319 + }, + { + "epoch": 2.7906976744186047, + "grad_norm": 1.0549331903457642, + "learning_rate": 9.748396132762803e-06, + "loss": 1.7627, + "mean_token_accuracy": 0.5791631937026978, + "num_tokens": 5276131468.0, + "step": 10320 + }, + { + "epoch": 2.7909680908599244, + "grad_norm": 0.44931304454803467, + "learning_rate": 9.746834981574942e-06, + "loss": 1.0405, + "mean_token_accuracy": 0.7301291227340698, + "num_tokens": 5276538066.0, + "step": 10321 + }, + { + "epoch": 2.791238507301244, + "grad_norm": 1.6543742418289185, + "learning_rate": 9.74527386883787e-06, + "loss": 1.9279, + "mean_token_accuracy": 0.556358277797699, + "num_tokens": 5277062289.0, + "step": 10322 + }, + { + "epoch": 2.7915089237425637, + "grad_norm": 1.2721138000488281, + "learning_rate": 9.74371279459949e-06, + "loss": 1.8707, + "mean_token_accuracy": 0.5612094402313232, + "num_tokens": 5277586567.0, + "step": 10323 + }, + { + "epoch": 2.7917793401838833, + "grad_norm": 1.0944783687591553, + "learning_rate": 9.74215175890769e-06, + "loss": 1.8627, + "mean_token_accuracy": 0.5760208964347839, + "num_tokens": 5278110841.0, + "step": 10324 + }, + { + "epoch": 2.792049756625203, + "grad_norm": 1.1677279472351074, + "learning_rate": 9.740590761810378e-06, + "loss": 1.6897, + "mean_token_accuracy": 0.6069810390472412, + "num_tokens": 5278635016.0, + "step": 10325 + }, + { + "epoch": 2.7923201730665226, + "grad_norm": 0.9274718165397644, + "learning_rate": 9.739029803355441e-06, + "loss": 1.9906, + "mean_token_accuracy": 0.545681357383728, + "num_tokens": 5279159234.0, + "step": 10326 + }, + { + "epoch": 2.7925905895078422, + "grad_norm": 1.0844054222106934, + "learning_rate": 9.737468883590781e-06, + "loss": 1.7787, + "mean_token_accuracy": 0.5988547801971436, + "num_tokens": 5279620489.0, + "step": 10327 + }, + { + "epoch": 2.792861005949162, + "grad_norm": 1.0217951536178589, + "learning_rate": 9.735908002564289e-06, + "loss": 1.8139, + "mean_token_accuracy": 0.592975914478302, + "num_tokens": 5280144504.0, + "step": 10328 + }, + { + "epoch": 2.7931314223904815, + "grad_norm": 0.901214063167572, + "learning_rate": 9.734347160323853e-06, + "loss": 1.8025, + "mean_token_accuracy": 0.5853272676467896, + "num_tokens": 5280657170.0, + "step": 10329 + }, + { + "epoch": 2.7934018388318007, + "grad_norm": 1.01869797706604, + "learning_rate": 9.732786356917373e-06, + "loss": 1.8301, + "mean_token_accuracy": 0.5722153782844543, + "num_tokens": 5281181397.0, + "step": 10330 + }, + { + "epoch": 2.793672255273121, + "grad_norm": 1.030289888381958, + "learning_rate": 9.731225592392732e-06, + "loss": 1.7639, + "mean_token_accuracy": 0.5885409116744995, + "num_tokens": 5281705540.0, + "step": 10331 + }, + { + "epoch": 2.79394267171444, + "grad_norm": 1.1317131519317627, + "learning_rate": 9.729664866797818e-06, + "loss": 1.8615, + "mean_token_accuracy": 0.557589590549469, + "num_tokens": 5282229817.0, + "step": 10332 + }, + { + "epoch": 2.79421308815576, + "grad_norm": 0.969333827495575, + "learning_rate": 9.728104180180529e-06, + "loss": 1.7217, + "mean_token_accuracy": 0.5946340560913086, + "num_tokens": 5282753925.0, + "step": 10333 + }, + { + "epoch": 2.7944835045970793, + "grad_norm": 1.0051779747009277, + "learning_rate": 9.726543532588742e-06, + "loss": 1.9654, + "mean_token_accuracy": 0.5658718347549438, + "num_tokens": 5283278201.0, + "step": 10334 + }, + { + "epoch": 2.7947539210383994, + "grad_norm": 1.015352487564087, + "learning_rate": 9.724982924070338e-06, + "loss": 1.8936, + "mean_token_accuracy": 0.553213357925415, + "num_tokens": 5283802362.0, + "step": 10335 + }, + { + "epoch": 2.7950243374797186, + "grad_norm": 1.0452446937561035, + "learning_rate": 9.723422354673216e-06, + "loss": 1.9916, + "mean_token_accuracy": 0.5413385629653931, + "num_tokens": 5284326530.0, + "step": 10336 + }, + { + "epoch": 2.7952947539210387, + "grad_norm": 2.3592216968536377, + "learning_rate": 9.721861824445244e-06, + "loss": 1.5749, + "mean_token_accuracy": 0.603458046913147, + "num_tokens": 5284850776.0, + "step": 10337 + }, + { + "epoch": 2.795565170362358, + "grad_norm": 1.4809792041778564, + "learning_rate": 9.72030133343431e-06, + "loss": 1.9569, + "mean_token_accuracy": 0.5625859498977661, + "num_tokens": 5285374929.0, + "step": 10338 + }, + { + "epoch": 2.7958355868036775, + "grad_norm": 1.0120741128921509, + "learning_rate": 9.718740881688297e-06, + "loss": 1.8188, + "mean_token_accuracy": 0.5901061296463013, + "num_tokens": 5285849830.0, + "step": 10339 + }, + { + "epoch": 2.796106003244997, + "grad_norm": 0.9788710474967957, + "learning_rate": 9.717180469255083e-06, + "loss": 1.8303, + "mean_token_accuracy": 0.5919478535652161, + "num_tokens": 5286351606.0, + "step": 10340 + }, + { + "epoch": 2.796376419686317, + "grad_norm": 0.47071194648742676, + "learning_rate": 9.715620096182543e-06, + "loss": 1.1012, + "mean_token_accuracy": 0.7065777778625488, + "num_tokens": 5286875839.0, + "step": 10341 + }, + { + "epoch": 2.7966468361276364, + "grad_norm": 1.502936601638794, + "learning_rate": 9.714059762518555e-06, + "loss": 1.8847, + "mean_token_accuracy": 0.5792863368988037, + "num_tokens": 5287400016.0, + "step": 10342 + }, + { + "epoch": 2.796917252568956, + "grad_norm": 1.557481050491333, + "learning_rate": 9.712499468310996e-06, + "loss": 1.8946, + "mean_token_accuracy": 0.5717417001724243, + "num_tokens": 5287924255.0, + "step": 10343 + }, + { + "epoch": 2.7971876690102757, + "grad_norm": 1.3555454015731812, + "learning_rate": 9.710939213607739e-06, + "loss": 2.0465, + "mean_token_accuracy": 0.5285653471946716, + "num_tokens": 5288387924.0, + "step": 10344 + }, + { + "epoch": 2.7974580854515954, + "grad_norm": 1.228171467781067, + "learning_rate": 9.709378998456658e-06, + "loss": 1.9011, + "mean_token_accuracy": 0.5624876618385315, + "num_tokens": 5288911710.0, + "step": 10345 + }, + { + "epoch": 2.797728501892915, + "grad_norm": 1.1621516942977905, + "learning_rate": 9.707818822905623e-06, + "loss": 1.9605, + "mean_token_accuracy": 0.5653798580169678, + "num_tokens": 5289435799.0, + "step": 10346 + }, + { + "epoch": 2.7979989183342346, + "grad_norm": 1.0456022024154663, + "learning_rate": 9.70625868700251e-06, + "loss": 1.9383, + "mean_token_accuracy": 0.5435235500335693, + "num_tokens": 5289955420.0, + "step": 10347 + }, + { + "epoch": 2.7982693347755543, + "grad_norm": 1.220834732055664, + "learning_rate": 9.704698590795186e-06, + "loss": 1.8266, + "mean_token_accuracy": 0.5818995237350464, + "num_tokens": 5290479648.0, + "step": 10348 + }, + { + "epoch": 2.798539751216874, + "grad_norm": 1.2824864387512207, + "learning_rate": 9.703138534331513e-06, + "loss": 1.7716, + "mean_token_accuracy": 0.6067848801612854, + "num_tokens": 5290946105.0, + "step": 10349 + }, + { + "epoch": 2.7988101676581936, + "grad_norm": 1.0580112934112549, + "learning_rate": 9.70157851765937e-06, + "loss": 1.8953, + "mean_token_accuracy": 0.588698148727417, + "num_tokens": 5291371224.0, + "step": 10350 + }, + { + "epoch": 2.799080584099513, + "grad_norm": 1.279344916343689, + "learning_rate": 9.700018540826613e-06, + "loss": 1.8947, + "mean_token_accuracy": 0.5744968056678772, + "num_tokens": 5291895429.0, + "step": 10351 + }, + { + "epoch": 2.799351000540833, + "grad_norm": 1.1404385566711426, + "learning_rate": 9.698458603881107e-06, + "loss": 1.8575, + "mean_token_accuracy": 0.5575302243232727, + "num_tokens": 5292419615.0, + "step": 10352 + }, + { + "epoch": 2.7996214169821525, + "grad_norm": 1.1909937858581543, + "learning_rate": 9.696898706870722e-06, + "loss": 1.8721, + "mean_token_accuracy": 0.5568461418151855, + "num_tokens": 5292943651.0, + "step": 10353 + }, + { + "epoch": 2.799891833423472, + "grad_norm": 1.0027483701705933, + "learning_rate": 9.695338849843319e-06, + "loss": 1.7928, + "mean_token_accuracy": 0.5771230459213257, + "num_tokens": 5293467854.0, + "step": 10354 + }, + { + "epoch": 2.800162249864792, + "grad_norm": 1.2094855308532715, + "learning_rate": 9.693779032846753e-06, + "loss": 2.0176, + "mean_token_accuracy": 0.5772398710250854, + "num_tokens": 5293892538.0, + "step": 10355 + }, + { + "epoch": 2.8004326663061114, + "grad_norm": 1.0286962985992432, + "learning_rate": 9.692219255928891e-06, + "loss": 1.8868, + "mean_token_accuracy": 0.5737333297729492, + "num_tokens": 5294416697.0, + "step": 10356 + }, + { + "epoch": 2.800703082747431, + "grad_norm": 1.0706660747528076, + "learning_rate": 9.690659519137586e-06, + "loss": 1.8817, + "mean_token_accuracy": 0.5783510804176331, + "num_tokens": 5294908140.0, + "step": 10357 + }, + { + "epoch": 2.8009734991887507, + "grad_norm": 1.0586527585983276, + "learning_rate": 9.689099822520697e-06, + "loss": 1.9193, + "mean_token_accuracy": 0.5527554750442505, + "num_tokens": 5295432355.0, + "step": 10358 + }, + { + "epoch": 2.8012439156300704, + "grad_norm": 1.1061944961547852, + "learning_rate": 9.687540166126081e-06, + "loss": 1.8196, + "mean_token_accuracy": 0.5926135182380676, + "num_tokens": 5295956554.0, + "step": 10359 + }, + { + "epoch": 2.80151433207139, + "grad_norm": 1.340054988861084, + "learning_rate": 9.685980550001595e-06, + "loss": 1.9073, + "mean_token_accuracy": 0.5826447606086731, + "num_tokens": 5296408680.0, + "step": 10360 + }, + { + "epoch": 2.8017847485127096, + "grad_norm": 0.4801499843597412, + "learning_rate": 9.684420974195086e-06, + "loss": 1.1878, + "mean_token_accuracy": 0.6763081550598145, + "num_tokens": 5296932874.0, + "step": 10361 + }, + { + "epoch": 2.8020551649540293, + "grad_norm": 1.0587825775146484, + "learning_rate": 9.682861438754413e-06, + "loss": 1.9881, + "mean_token_accuracy": 0.550514817237854, + "num_tokens": 5297457151.0, + "step": 10362 + }, + { + "epoch": 2.802325581395349, + "grad_norm": 0.979323148727417, + "learning_rate": 9.681301943727426e-06, + "loss": 1.8145, + "mean_token_accuracy": 0.5729667544364929, + "num_tokens": 5297957822.0, + "step": 10363 + }, + { + "epoch": 2.8025959978366686, + "grad_norm": 0.9009562730789185, + "learning_rate": 9.679742489161969e-06, + "loss": 1.8929, + "mean_token_accuracy": 0.5570034980773926, + "num_tokens": 5298452561.0, + "step": 10364 + }, + { + "epoch": 2.802866414277988, + "grad_norm": 1.1817413568496704, + "learning_rate": 9.678183075105897e-06, + "loss": 1.8248, + "mean_token_accuracy": 0.6077792644500732, + "num_tokens": 5298976828.0, + "step": 10365 + }, + { + "epoch": 2.803136830719308, + "grad_norm": 1.2237828969955444, + "learning_rate": 9.676623701607058e-06, + "loss": 1.7987, + "mean_token_accuracy": 0.5895024538040161, + "num_tokens": 5299501091.0, + "step": 10366 + }, + { + "epoch": 2.8034072471606275, + "grad_norm": 1.02301824092865, + "learning_rate": 9.67506436871329e-06, + "loss": 1.8828, + "mean_token_accuracy": 0.5799615383148193, + "num_tokens": 5299962602.0, + "step": 10367 + }, + { + "epoch": 2.803677663601947, + "grad_norm": 1.1840019226074219, + "learning_rate": 9.673505076472449e-06, + "loss": 1.7798, + "mean_token_accuracy": 0.5882934331893921, + "num_tokens": 5300484457.0, + "step": 10368 + }, + { + "epoch": 2.803948080043267, + "grad_norm": 1.0672249794006348, + "learning_rate": 9.671945824932368e-06, + "loss": 1.7593, + "mean_token_accuracy": 0.5944609045982361, + "num_tokens": 5301008696.0, + "step": 10369 + }, + { + "epoch": 2.8042184964845864, + "grad_norm": 0.9316876530647278, + "learning_rate": 9.6703866141409e-06, + "loss": 1.8243, + "mean_token_accuracy": 0.5628225803375244, + "num_tokens": 5301532776.0, + "step": 10370 + }, + { + "epoch": 2.8044889129259056, + "grad_norm": 1.2251750230789185, + "learning_rate": 9.66882744414588e-06, + "loss": 1.8946, + "mean_token_accuracy": 0.5819661021232605, + "num_tokens": 5302027458.0, + "step": 10371 + }, + { + "epoch": 2.8047593293672257, + "grad_norm": 1.1920703649520874, + "learning_rate": 9.667268314995147e-06, + "loss": 1.8453, + "mean_token_accuracy": 0.5921832323074341, + "num_tokens": 5302543734.0, + "step": 10372 + }, + { + "epoch": 2.805029745808545, + "grad_norm": 1.0316855907440186, + "learning_rate": 9.66570922673654e-06, + "loss": 1.8266, + "mean_token_accuracy": 0.5939964652061462, + "num_tokens": 5303030894.0, + "step": 10373 + }, + { + "epoch": 2.805300162249865, + "grad_norm": 1.2095648050308228, + "learning_rate": 9.6641501794179e-06, + "loss": 1.8569, + "mean_token_accuracy": 0.5779342651367188, + "num_tokens": 5303486032.0, + "step": 10374 + }, + { + "epoch": 2.805570578691184, + "grad_norm": 1.186545968055725, + "learning_rate": 9.66259117308706e-06, + "loss": 1.8863, + "mean_token_accuracy": 0.5668936371803284, + "num_tokens": 5304010262.0, + "step": 10375 + }, + { + "epoch": 2.8058409951325043, + "grad_norm": 0.9178842306137085, + "learning_rate": 9.661032207791856e-06, + "loss": 1.8223, + "mean_token_accuracy": 0.5724948048591614, + "num_tokens": 5304501840.0, + "step": 10376 + }, + { + "epoch": 2.8061114115738235, + "grad_norm": 0.9226126074790955, + "learning_rate": 9.659473283580123e-06, + "loss": 1.7892, + "mean_token_accuracy": 0.5589711666107178, + "num_tokens": 5305025991.0, + "step": 10377 + }, + { + "epoch": 2.8063818280151436, + "grad_norm": 1.06803297996521, + "learning_rate": 9.657914400499688e-06, + "loss": 1.7906, + "mean_token_accuracy": 0.5964347124099731, + "num_tokens": 5305550245.0, + "step": 10378 + }, + { + "epoch": 2.8066522444564628, + "grad_norm": 1.0439225435256958, + "learning_rate": 9.656355558598391e-06, + "loss": 2.0388, + "mean_token_accuracy": 0.5518374443054199, + "num_tokens": 5306036699.0, + "step": 10379 + }, + { + "epoch": 2.806922660897783, + "grad_norm": 1.1301380395889282, + "learning_rate": 9.654796757924056e-06, + "loss": 1.8616, + "mean_token_accuracy": 0.5684914588928223, + "num_tokens": 5306560930.0, + "step": 10380 + }, + { + "epoch": 2.807193077339102, + "grad_norm": 0.4455315172672272, + "learning_rate": 9.653237998524508e-06, + "loss": 1.1301, + "mean_token_accuracy": 0.6827276349067688, + "num_tokens": 5307085171.0, + "step": 10381 + }, + { + "epoch": 2.8074634937804217, + "grad_norm": 1.4727530479431152, + "learning_rate": 9.651679280447583e-06, + "loss": 1.836, + "mean_token_accuracy": 0.5775091648101807, + "num_tokens": 5307588284.0, + "step": 10382 + }, + { + "epoch": 2.8077339102217413, + "grad_norm": 1.3379989862442017, + "learning_rate": 9.650120603741104e-06, + "loss": 1.8924, + "mean_token_accuracy": 0.5768861770629883, + "num_tokens": 5308112369.0, + "step": 10383 + }, + { + "epoch": 2.808004326663061, + "grad_norm": 1.1066176891326904, + "learning_rate": 9.64856196845289e-06, + "loss": 1.9175, + "mean_token_accuracy": 0.5654511451721191, + "num_tokens": 5308636454.0, + "step": 10384 + }, + { + "epoch": 2.8082747431043806, + "grad_norm": 1.1220382452011108, + "learning_rate": 9.647003374630773e-06, + "loss": 1.8083, + "mean_token_accuracy": 0.5736713409423828, + "num_tokens": 5309160597.0, + "step": 10385 + }, + { + "epoch": 2.8085451595457003, + "grad_norm": 1.1895132064819336, + "learning_rate": 9.64544482232257e-06, + "loss": 1.9883, + "mean_token_accuracy": 0.5532900094985962, + "num_tokens": 5309684874.0, + "step": 10386 + }, + { + "epoch": 2.80881557598702, + "grad_norm": 1.34842848777771, + "learning_rate": 9.6438863115761e-06, + "loss": 1.8015, + "mean_token_accuracy": 0.5704842805862427, + "num_tokens": 5310112962.0, + "step": 10387 + }, + { + "epoch": 2.8090859924283396, + "grad_norm": 0.9701316356658936, + "learning_rate": 9.642327842439192e-06, + "loss": 1.8879, + "mean_token_accuracy": 0.5697636604309082, + "num_tokens": 5310637171.0, + "step": 10388 + }, + { + "epoch": 2.809356408869659, + "grad_norm": 1.0162187814712524, + "learning_rate": 9.640769414959656e-06, + "loss": 1.9782, + "mean_token_accuracy": 0.5480449199676514, + "num_tokens": 5311161426.0, + "step": 10389 + }, + { + "epoch": 2.809626825310979, + "grad_norm": 1.0554848909378052, + "learning_rate": 9.639211029185309e-06, + "loss": 1.9084, + "mean_token_accuracy": 0.5492191910743713, + "num_tokens": 5311685677.0, + "step": 10390 + }, + { + "epoch": 2.8098972417522985, + "grad_norm": 1.2522543668746948, + "learning_rate": 9.637652685163975e-06, + "loss": 1.881, + "mean_token_accuracy": 0.5864201784133911, + "num_tokens": 5312146703.0, + "step": 10391 + }, + { + "epoch": 2.810167658193618, + "grad_norm": 1.0450725555419922, + "learning_rate": 9.636094382943454e-06, + "loss": 1.8247, + "mean_token_accuracy": 0.5949475765228271, + "num_tokens": 5312670900.0, + "step": 10392 + }, + { + "epoch": 2.8104380746349378, + "grad_norm": 1.098853588104248, + "learning_rate": 9.634536122571575e-06, + "loss": 1.8759, + "mean_token_accuracy": 0.5585070848464966, + "num_tokens": 5313195143.0, + "step": 10393 + }, + { + "epoch": 2.8107084910762574, + "grad_norm": 1.3233861923217773, + "learning_rate": 9.632977904096144e-06, + "loss": 1.9855, + "mean_token_accuracy": 0.5638375282287598, + "num_tokens": 5313659811.0, + "step": 10394 + }, + { + "epoch": 2.810978907517577, + "grad_norm": 1.0256870985031128, + "learning_rate": 9.631419727564968e-06, + "loss": 1.8765, + "mean_token_accuracy": 0.581868052482605, + "num_tokens": 5314184026.0, + "step": 10395 + }, + { + "epoch": 2.8112493239588967, + "grad_norm": 1.034995198249817, + "learning_rate": 9.629861593025861e-06, + "loss": 1.9163, + "mean_token_accuracy": 0.5486589670181274, + "num_tokens": 5314708278.0, + "step": 10396 + }, + { + "epoch": 2.8115197404002163, + "grad_norm": 1.0312864780426025, + "learning_rate": 9.628303500526633e-06, + "loss": 1.8711, + "mean_token_accuracy": 0.5823236703872681, + "num_tokens": 5315232441.0, + "step": 10397 + }, + { + "epoch": 2.811790156841536, + "grad_norm": 1.0632221698760986, + "learning_rate": 9.62674545011508e-06, + "loss": 1.9123, + "mean_token_accuracy": 0.5733599662780762, + "num_tokens": 5315756646.0, + "step": 10398 + }, + { + "epoch": 2.8120605732828556, + "grad_norm": 1.1196587085723877, + "learning_rate": 9.625187441839021e-06, + "loss": 1.8827, + "mean_token_accuracy": 0.5886991620063782, + "num_tokens": 5316216170.0, + "step": 10399 + }, + { + "epoch": 2.8123309897241753, + "grad_norm": 1.0925548076629639, + "learning_rate": 9.623629475746253e-06, + "loss": 1.6798, + "mean_token_accuracy": 0.6145251989364624, + "num_tokens": 5316740317.0, + "step": 10400 + }, + { + "epoch": 2.812601406165495, + "grad_norm": 0.5101216435432434, + "learning_rate": 9.622071551884577e-06, + "loss": 1.0933, + "mean_token_accuracy": 0.7013633251190186, + "num_tokens": 5317264385.0, + "step": 10401 + }, + { + "epoch": 2.8128718226068146, + "grad_norm": 1.596036672592163, + "learning_rate": 9.6205136703018e-06, + "loss": 1.7604, + "mean_token_accuracy": 0.5700188279151917, + "num_tokens": 5317788620.0, + "step": 10402 + }, + { + "epoch": 2.813142239048134, + "grad_norm": 1.3445096015930176, + "learning_rate": 9.618955831045725e-06, + "loss": 1.8674, + "mean_token_accuracy": 0.5806925296783447, + "num_tokens": 5318255138.0, + "step": 10403 + }, + { + "epoch": 2.813412655489454, + "grad_norm": 1.1721291542053223, + "learning_rate": 9.61739803416414e-06, + "loss": 1.8778, + "mean_token_accuracy": 0.5585673451423645, + "num_tokens": 5318779268.0, + "step": 10404 + }, + { + "epoch": 2.8136830719307735, + "grad_norm": 1.177276372909546, + "learning_rate": 9.615840279704853e-06, + "loss": 1.8735, + "mean_token_accuracy": 0.5652807354927063, + "num_tokens": 5319287193.0, + "step": 10405 + }, + { + "epoch": 2.813953488372093, + "grad_norm": 1.1536628007888794, + "learning_rate": 9.614282567715659e-06, + "loss": 1.9023, + "mean_token_accuracy": 0.5675274729728699, + "num_tokens": 5319756413.0, + "step": 10406 + }, + { + "epoch": 2.814223904813413, + "grad_norm": 1.3810336589813232, + "learning_rate": 9.612724898244345e-06, + "loss": 1.7504, + "mean_token_accuracy": 0.6217453479766846, + "num_tokens": 5320280605.0, + "step": 10407 + }, + { + "epoch": 2.8144943212547324, + "grad_norm": 1.1474183797836304, + "learning_rate": 9.611167271338712e-06, + "loss": 1.8078, + "mean_token_accuracy": 0.5679098963737488, + "num_tokens": 5320775805.0, + "step": 10408 + }, + { + "epoch": 2.814764737696052, + "grad_norm": 1.2892731428146362, + "learning_rate": 9.609609687046555e-06, + "loss": 1.9265, + "mean_token_accuracy": 0.5299224257469177, + "num_tokens": 5321299955.0, + "step": 10409 + }, + { + "epoch": 2.8150351541373717, + "grad_norm": 1.2688785791397095, + "learning_rate": 9.608052145415658e-06, + "loss": 1.7908, + "mean_token_accuracy": 0.5771518349647522, + "num_tokens": 5321738825.0, + "step": 10410 + }, + { + "epoch": 2.8153055705786914, + "grad_norm": 1.1855326890945435, + "learning_rate": 9.606494646493818e-06, + "loss": 1.8326, + "mean_token_accuracy": 0.5857757925987244, + "num_tokens": 5322262955.0, + "step": 10411 + }, + { + "epoch": 2.8155759870200106, + "grad_norm": 1.2759411334991455, + "learning_rate": 9.60493719032882e-06, + "loss": 1.9379, + "mean_token_accuracy": 0.5486770868301392, + "num_tokens": 5322729584.0, + "step": 10412 + }, + { + "epoch": 2.8158464034613306, + "grad_norm": 0.9323915839195251, + "learning_rate": 9.603379776968446e-06, + "loss": 1.7735, + "mean_token_accuracy": 0.5991080403327942, + "num_tokens": 5323253787.0, + "step": 10413 + }, + { + "epoch": 2.81611681990265, + "grad_norm": 0.946893036365509, + "learning_rate": 9.601822406460495e-06, + "loss": 1.8329, + "mean_token_accuracy": 0.5877857208251953, + "num_tokens": 5323778068.0, + "step": 10414 + }, + { + "epoch": 2.81638723634397, + "grad_norm": 1.056853175163269, + "learning_rate": 9.600265078852738e-06, + "loss": 1.8469, + "mean_token_accuracy": 0.5765208005905151, + "num_tokens": 5324302325.0, + "step": 10415 + }, + { + "epoch": 2.816657652785289, + "grad_norm": 1.120895266532898, + "learning_rate": 9.598707794192968e-06, + "loss": 1.7646, + "mean_token_accuracy": 0.5973421931266785, + "num_tokens": 5324826498.0, + "step": 10416 + }, + { + "epoch": 2.816928069226609, + "grad_norm": 1.1912055015563965, + "learning_rate": 9.597150552528965e-06, + "loss": 1.8525, + "mean_token_accuracy": 0.5670374631881714, + "num_tokens": 5325350782.0, + "step": 10417 + }, + { + "epoch": 2.8171984856679284, + "grad_norm": 1.2653558254241943, + "learning_rate": 9.595593353908506e-06, + "loss": 1.7994, + "mean_token_accuracy": 0.5888818502426147, + "num_tokens": 5325874218.0, + "step": 10418 + }, + { + "epoch": 2.8174689021092485, + "grad_norm": 1.17318856716156, + "learning_rate": 9.594036198379375e-06, + "loss": 1.8778, + "mean_token_accuracy": 0.5642156600952148, + "num_tokens": 5326398398.0, + "step": 10419 + }, + { + "epoch": 2.8177393185505677, + "grad_norm": 1.1860713958740234, + "learning_rate": 9.592479085989348e-06, + "loss": 1.8702, + "mean_token_accuracy": 0.5649082064628601, + "num_tokens": 5326922520.0, + "step": 10420 + }, + { + "epoch": 2.818009734991888, + "grad_norm": 0.5139358043670654, + "learning_rate": 9.5909220167862e-06, + "loss": 1.1353, + "mean_token_accuracy": 0.709494411945343, + "num_tokens": 5327446798.0, + "step": 10421 + }, + { + "epoch": 2.818280151433207, + "grad_norm": 1.4683338403701782, + "learning_rate": 9.589364990817708e-06, + "loss": 1.9268, + "mean_token_accuracy": 0.5630673170089722, + "num_tokens": 5327970954.0, + "step": 10422 + }, + { + "epoch": 2.8185505678745266, + "grad_norm": 1.3482861518859863, + "learning_rate": 9.587808008131649e-06, + "loss": 1.7714, + "mean_token_accuracy": 0.5959645509719849, + "num_tokens": 5328495140.0, + "step": 10423 + }, + { + "epoch": 2.8188209843158463, + "grad_norm": 1.3036088943481445, + "learning_rate": 9.586251068775792e-06, + "loss": 1.8547, + "mean_token_accuracy": 0.5638452172279358, + "num_tokens": 5329019259.0, + "step": 10424 + }, + { + "epoch": 2.819091400757166, + "grad_norm": 1.3029804229736328, + "learning_rate": 9.58469417279791e-06, + "loss": 1.8349, + "mean_token_accuracy": 0.5759912133216858, + "num_tokens": 5329504095.0, + "step": 10425 + }, + { + "epoch": 2.8193618171984856, + "grad_norm": 1.4403458833694458, + "learning_rate": 9.583137320245774e-06, + "loss": 2.0057, + "mean_token_accuracy": 0.5492061376571655, + "num_tokens": 5330028240.0, + "step": 10426 + }, + { + "epoch": 2.819632233639805, + "grad_norm": 1.3980656862258911, + "learning_rate": 9.58158051116715e-06, + "loss": 1.7354, + "mean_token_accuracy": 0.5947506427764893, + "num_tokens": 5330517991.0, + "step": 10427 + }, + { + "epoch": 2.819902650081125, + "grad_norm": 1.1532541513442993, + "learning_rate": 9.58002374560981e-06, + "loss": 1.7596, + "mean_token_accuracy": 0.5714964270591736, + "num_tokens": 5331036295.0, + "step": 10428 + }, + { + "epoch": 2.8201730665224445, + "grad_norm": 1.2489246129989624, + "learning_rate": 9.578467023621514e-06, + "loss": 1.7648, + "mean_token_accuracy": 0.5802987217903137, + "num_tokens": 5331560569.0, + "step": 10429 + }, + { + "epoch": 2.820443482963764, + "grad_norm": 1.2579058408737183, + "learning_rate": 9.57691034525003e-06, + "loss": 1.868, + "mean_token_accuracy": 0.5801587700843811, + "num_tokens": 5332077555.0, + "step": 10430 + }, + { + "epoch": 2.8207138994050838, + "grad_norm": 1.1806732416152954, + "learning_rate": 9.575353710543125e-06, + "loss": 1.9631, + "mean_token_accuracy": 0.5486527681350708, + "num_tokens": 5332601825.0, + "step": 10431 + }, + { + "epoch": 2.8209843158464034, + "grad_norm": 0.953342080116272, + "learning_rate": 9.573797119548558e-06, + "loss": 1.8813, + "mean_token_accuracy": 0.5756039023399353, + "num_tokens": 5333109051.0, + "step": 10432 + }, + { + "epoch": 2.821254732287723, + "grad_norm": 1.0092259645462036, + "learning_rate": 9.572240572314085e-06, + "loss": 1.8671, + "mean_token_accuracy": 0.573689341545105, + "num_tokens": 5333629761.0, + "step": 10433 + }, + { + "epoch": 2.8215251487290427, + "grad_norm": 1.2794229984283447, + "learning_rate": 9.570684068887473e-06, + "loss": 1.9092, + "mean_token_accuracy": 0.5564327239990234, + "num_tokens": 5334154040.0, + "step": 10434 + }, + { + "epoch": 2.8217955651703623, + "grad_norm": 1.2057085037231445, + "learning_rate": 9.569127609316478e-06, + "loss": 1.878, + "mean_token_accuracy": 0.6033666729927063, + "num_tokens": 5334678268.0, + "step": 10435 + }, + { + "epoch": 2.822065981611682, + "grad_norm": 1.133697271347046, + "learning_rate": 9.567571193648852e-06, + "loss": 1.8944, + "mean_token_accuracy": 0.5524296760559082, + "num_tokens": 5335202480.0, + "step": 10436 + }, + { + "epoch": 2.8223363980530016, + "grad_norm": 1.2215758562088013, + "learning_rate": 9.566014821932357e-06, + "loss": 1.8583, + "mean_token_accuracy": 0.5476833581924438, + "num_tokens": 5335726731.0, + "step": 10437 + }, + { + "epoch": 2.8226068144943213, + "grad_norm": 1.2741913795471191, + "learning_rate": 9.56445849421474e-06, + "loss": 1.9078, + "mean_token_accuracy": 0.5779024362564087, + "num_tokens": 5336227186.0, + "step": 10438 + }, + { + "epoch": 2.822877230935641, + "grad_norm": 1.1805088520050049, + "learning_rate": 9.562902210543764e-06, + "loss": 1.764, + "mean_token_accuracy": 0.5996596813201904, + "num_tokens": 5336751434.0, + "step": 10439 + }, + { + "epoch": 2.8231476473769606, + "grad_norm": 1.1354637145996094, + "learning_rate": 9.561345970967175e-06, + "loss": 1.8407, + "mean_token_accuracy": 0.597713828086853, + "num_tokens": 5337275693.0, + "step": 10440 + }, + { + "epoch": 2.82341806381828, + "grad_norm": 0.4552859663963318, + "learning_rate": 9.559789775532717e-06, + "loss": 1.1572, + "mean_token_accuracy": 0.6934309005737305, + "num_tokens": 5337786647.0, + "step": 10441 + }, + { + "epoch": 2.8236884802596, + "grad_norm": 1.54413902759552, + "learning_rate": 9.558233624288149e-06, + "loss": 1.8318, + "mean_token_accuracy": 0.5737321972846985, + "num_tokens": 5338310822.0, + "step": 10442 + }, + { + "epoch": 2.8239588967009195, + "grad_norm": 1.433255672454834, + "learning_rate": 9.556677517281212e-06, + "loss": 1.8371, + "mean_token_accuracy": 0.5791162252426147, + "num_tokens": 5338835004.0, + "step": 10443 + }, + { + "epoch": 2.824229313142239, + "grad_norm": 1.33138108253479, + "learning_rate": 9.55512145455965e-06, + "loss": 1.9036, + "mean_token_accuracy": 0.5679507255554199, + "num_tokens": 5339359185.0, + "step": 10444 + }, + { + "epoch": 2.8244997295835588, + "grad_norm": 1.0508887767791748, + "learning_rate": 9.553565436171216e-06, + "loss": 1.9068, + "mean_token_accuracy": 0.5675346255302429, + "num_tokens": 5339857522.0, + "step": 10445 + }, + { + "epoch": 2.8247701460248784, + "grad_norm": 1.2452731132507324, + "learning_rate": 9.552009462163645e-06, + "loss": 1.8868, + "mean_token_accuracy": 0.5763607025146484, + "num_tokens": 5340381796.0, + "step": 10446 + }, + { + "epoch": 2.825040562466198, + "grad_norm": 1.380386471748352, + "learning_rate": 9.550453532584682e-06, + "loss": 2.03, + "mean_token_accuracy": 0.551530122756958, + "num_tokens": 5340905982.0, + "step": 10447 + }, + { + "epoch": 2.8253109789075177, + "grad_norm": 1.087733268737793, + "learning_rate": 9.548897647482069e-06, + "loss": 1.9049, + "mean_token_accuracy": 0.5743502378463745, + "num_tokens": 5341368647.0, + "step": 10448 + }, + { + "epoch": 2.8255813953488373, + "grad_norm": 1.1658939123153687, + "learning_rate": 9.547341806903545e-06, + "loss": 1.847, + "mean_token_accuracy": 0.5863608121871948, + "num_tokens": 5341892918.0, + "step": 10449 + }, + { + "epoch": 2.825851811790157, + "grad_norm": 1.037984848022461, + "learning_rate": 9.545786010896841e-06, + "loss": 1.8308, + "mean_token_accuracy": 0.562838077545166, + "num_tokens": 5342417102.0, + "step": 10450 + }, + { + "epoch": 2.8261222282314766, + "grad_norm": 1.0266304016113281, + "learning_rate": 9.544230259509702e-06, + "loss": 1.9158, + "mean_token_accuracy": 0.5607005953788757, + "num_tokens": 5342941291.0, + "step": 10451 + }, + { + "epoch": 2.8263926446727963, + "grad_norm": 0.974172830581665, + "learning_rate": 9.542674552789861e-06, + "loss": 1.8138, + "mean_token_accuracy": 0.5717071294784546, + "num_tokens": 5343465456.0, + "step": 10452 + }, + { + "epoch": 2.8266630611141155, + "grad_norm": 1.2120543718338013, + "learning_rate": 9.541118890785047e-06, + "loss": 1.8671, + "mean_token_accuracy": 0.5778481960296631, + "num_tokens": 5343989723.0, + "step": 10453 + }, + { + "epoch": 2.8269334775554356, + "grad_norm": 1.1005277633666992, + "learning_rate": 9.539563273542998e-06, + "loss": 1.8404, + "mean_token_accuracy": 0.5616863369941711, + "num_tokens": 5344513943.0, + "step": 10454 + }, + { + "epoch": 2.8272038939967548, + "grad_norm": 1.0368447303771973, + "learning_rate": 9.538007701111444e-06, + "loss": 1.8912, + "mean_token_accuracy": 0.5369751453399658, + "num_tokens": 5345038038.0, + "step": 10455 + }, + { + "epoch": 2.827474310438075, + "grad_norm": 1.1145312786102295, + "learning_rate": 9.536452173538109e-06, + "loss": 1.7731, + "mean_token_accuracy": 0.596683144569397, + "num_tokens": 5345532121.0, + "step": 10456 + }, + { + "epoch": 2.827744726879394, + "grad_norm": 1.043999195098877, + "learning_rate": 9.534896690870729e-06, + "loss": 1.8375, + "mean_token_accuracy": 0.5753421783447266, + "num_tokens": 5345998637.0, + "step": 10457 + }, + { + "epoch": 2.828015143320714, + "grad_norm": 1.090033769607544, + "learning_rate": 9.533341253157028e-06, + "loss": 1.8894, + "mean_token_accuracy": 0.5547678470611572, + "num_tokens": 5346522745.0, + "step": 10458 + }, + { + "epoch": 2.8282855597620333, + "grad_norm": 1.0724658966064453, + "learning_rate": 9.531785860444724e-06, + "loss": 1.904, + "mean_token_accuracy": 0.5719159841537476, + "num_tokens": 5347033214.0, + "step": 10459 + }, + { + "epoch": 2.8285559762033534, + "grad_norm": 1.2193955183029175, + "learning_rate": 9.530230512781555e-06, + "loss": 1.84, + "mean_token_accuracy": 0.5490351319313049, + "num_tokens": 5347557369.0, + "step": 10460 + }, + { + "epoch": 2.8288263926446726, + "grad_norm": 0.4400581121444702, + "learning_rate": 9.52867521021523e-06, + "loss": 0.9354, + "mean_token_accuracy": 0.7319352626800537, + "num_tokens": 5348052854.0, + "step": 10461 + }, + { + "epoch": 2.8290968090859927, + "grad_norm": 1.9113025665283203, + "learning_rate": 9.527119952793482e-06, + "loss": 1.926, + "mean_token_accuracy": 0.5694321393966675, + "num_tokens": 5348577096.0, + "step": 10462 + }, + { + "epoch": 2.829367225527312, + "grad_norm": 1.7134124040603638, + "learning_rate": 9.525564740564023e-06, + "loss": 1.8276, + "mean_token_accuracy": 0.6192293763160706, + "num_tokens": 5349035884.0, + "step": 10463 + }, + { + "epoch": 2.8296376419686315, + "grad_norm": 1.297124981880188, + "learning_rate": 9.524009573574573e-06, + "loss": 1.8792, + "mean_token_accuracy": 0.5683691501617432, + "num_tokens": 5349560162.0, + "step": 10464 + }, + { + "epoch": 2.829908058409951, + "grad_norm": 1.1147117614746094, + "learning_rate": 9.52245445187285e-06, + "loss": 1.7513, + "mean_token_accuracy": 0.5666781663894653, + "num_tokens": 5350084337.0, + "step": 10465 + }, + { + "epoch": 2.830178474851271, + "grad_norm": 1.3433148860931396, + "learning_rate": 9.520899375506571e-06, + "loss": 1.7645, + "mean_token_accuracy": 0.5940911173820496, + "num_tokens": 5350545308.0, + "step": 10466 + }, + { + "epoch": 2.8304488912925905, + "grad_norm": 1.7983115911483765, + "learning_rate": 9.519344344523447e-06, + "loss": 1.9088, + "mean_token_accuracy": 0.5765421390533447, + "num_tokens": 5351058280.0, + "step": 10467 + }, + { + "epoch": 2.83071930773391, + "grad_norm": 1.445515513420105, + "learning_rate": 9.517789358971195e-06, + "loss": 1.8044, + "mean_token_accuracy": 0.5771905183792114, + "num_tokens": 5351582552.0, + "step": 10468 + }, + { + "epoch": 2.8309897241752298, + "grad_norm": 1.527683973312378, + "learning_rate": 9.516234418897525e-06, + "loss": 1.9514, + "mean_token_accuracy": 0.5746526718139648, + "num_tokens": 5352030518.0, + "step": 10469 + }, + { + "epoch": 2.8312601406165494, + "grad_norm": 1.2687842845916748, + "learning_rate": 9.514679524350142e-06, + "loss": 1.8329, + "mean_token_accuracy": 0.5918734669685364, + "num_tokens": 5352554698.0, + "step": 10470 + }, + { + "epoch": 2.831530557057869, + "grad_norm": 0.9818127155303955, + "learning_rate": 9.513124675376763e-06, + "loss": 1.7929, + "mean_token_accuracy": 0.5954644680023193, + "num_tokens": 5353078953.0, + "step": 10471 + }, + { + "epoch": 2.8318009734991887, + "grad_norm": 1.451074481010437, + "learning_rate": 9.511569872025092e-06, + "loss": 1.9087, + "mean_token_accuracy": 0.573049008846283, + "num_tokens": 5353563999.0, + "step": 10472 + }, + { + "epoch": 2.8320713899405083, + "grad_norm": 1.375911831855774, + "learning_rate": 9.51001511434283e-06, + "loss": 1.8599, + "mean_token_accuracy": 0.5628213882446289, + "num_tokens": 5354088085.0, + "step": 10473 + }, + { + "epoch": 2.832341806381828, + "grad_norm": 1.2550456523895264, + "learning_rate": 9.50846040237769e-06, + "loss": 1.6545, + "mean_token_accuracy": 0.6087965965270996, + "num_tokens": 5354612333.0, + "step": 10474 + }, + { + "epoch": 2.8326122228231476, + "grad_norm": 0.967885434627533, + "learning_rate": 9.506905736177372e-06, + "loss": 1.7448, + "mean_token_accuracy": 0.5852937698364258, + "num_tokens": 5355136383.0, + "step": 10475 + }, + { + "epoch": 2.8328826392644673, + "grad_norm": 1.1592258214950562, + "learning_rate": 9.505351115789573e-06, + "loss": 1.9616, + "mean_token_accuracy": 0.5769021511077881, + "num_tokens": 5355562510.0, + "step": 10476 + }, + { + "epoch": 2.833153055705787, + "grad_norm": 1.0560154914855957, + "learning_rate": 9.503796541262003e-06, + "loss": 1.8158, + "mean_token_accuracy": 0.5732794404029846, + "num_tokens": 5356086646.0, + "step": 10477 + }, + { + "epoch": 2.8334234721471065, + "grad_norm": 1.3178385496139526, + "learning_rate": 9.502242012642351e-06, + "loss": 1.8759, + "mean_token_accuracy": 0.5707016587257385, + "num_tokens": 5356610880.0, + "step": 10478 + }, + { + "epoch": 2.833693888588426, + "grad_norm": 1.1437360048294067, + "learning_rate": 9.500687529978318e-06, + "loss": 1.9241, + "mean_token_accuracy": 0.5626437664031982, + "num_tokens": 5357134960.0, + "step": 10479 + }, + { + "epoch": 2.833964305029746, + "grad_norm": 1.2818747758865356, + "learning_rate": 9.499133093317603e-06, + "loss": 1.863, + "mean_token_accuracy": 0.5584918260574341, + "num_tokens": 5357659173.0, + "step": 10480 + }, + { + "epoch": 2.8342347214710655, + "grad_norm": 0.4462306797504425, + "learning_rate": 9.4975787027079e-06, + "loss": 1.1009, + "mean_token_accuracy": 0.6949392557144165, + "num_tokens": 5358183439.0, + "step": 10481 + }, + { + "epoch": 2.834505137912385, + "grad_norm": 1.5172717571258545, + "learning_rate": 9.496024358196894e-06, + "loss": 1.851, + "mean_token_accuracy": 0.5693889856338501, + "num_tokens": 5358707640.0, + "step": 10482 + }, + { + "epoch": 2.8347755543537048, + "grad_norm": 1.6277594566345215, + "learning_rate": 9.49447005983229e-06, + "loss": 1.8272, + "mean_token_accuracy": 0.5710761547088623, + "num_tokens": 5359231799.0, + "step": 10483 + }, + { + "epoch": 2.8350459707950244, + "grad_norm": 1.122253179550171, + "learning_rate": 9.492915807661766e-06, + "loss": 1.8149, + "mean_token_accuracy": 0.5946208238601685, + "num_tokens": 5359720436.0, + "step": 10484 + }, + { + "epoch": 2.835316387236344, + "grad_norm": 1.1883763074874878, + "learning_rate": 9.491361601733024e-06, + "loss": 1.7951, + "mean_token_accuracy": 0.5620729327201843, + "num_tokens": 5360244481.0, + "step": 10485 + }, + { + "epoch": 2.8355868036776637, + "grad_norm": 1.7630581855773926, + "learning_rate": 9.489807442093742e-06, + "loss": 1.9259, + "mean_token_accuracy": 0.5687539577484131, + "num_tokens": 5360768669.0, + "step": 10486 + }, + { + "epoch": 2.8358572201189833, + "grad_norm": 1.3157211542129517, + "learning_rate": 9.488253328791605e-06, + "loss": 1.9911, + "mean_token_accuracy": 0.5628929734230042, + "num_tokens": 5361215121.0, + "step": 10487 + }, + { + "epoch": 2.836127636560303, + "grad_norm": 1.157102108001709, + "learning_rate": 9.486699261874307e-06, + "loss": 1.8924, + "mean_token_accuracy": 0.5530519485473633, + "num_tokens": 5361739316.0, + "step": 10488 + }, + { + "epoch": 2.8363980530016226, + "grad_norm": 1.3674449920654297, + "learning_rate": 9.485145241389526e-06, + "loss": 1.9501, + "mean_token_accuracy": 0.5737510919570923, + "num_tokens": 5362208500.0, + "step": 10489 + }, + { + "epoch": 2.8366684694429423, + "grad_norm": 1.080207109451294, + "learning_rate": 9.483591267384939e-06, + "loss": 1.9242, + "mean_token_accuracy": 0.5629334449768066, + "num_tokens": 5362698383.0, + "step": 10490 + }, + { + "epoch": 2.836938885884262, + "grad_norm": 1.1122376918792725, + "learning_rate": 9.48203733990824e-06, + "loss": 1.9447, + "mean_token_accuracy": 0.5532780289649963, + "num_tokens": 5363222641.0, + "step": 10491 + }, + { + "epoch": 2.8372093023255816, + "grad_norm": 1.1191225051879883, + "learning_rate": 9.48048345900709e-06, + "loss": 1.8641, + "mean_token_accuracy": 0.5689881443977356, + "num_tokens": 5363699459.0, + "step": 10492 + }, + { + "epoch": 2.837479718766901, + "grad_norm": 0.9850507974624634, + "learning_rate": 9.47892962472918e-06, + "loss": 1.8621, + "mean_token_accuracy": 0.5805766582489014, + "num_tokens": 5364223647.0, + "step": 10493 + }, + { + "epoch": 2.8377501352082204, + "grad_norm": 0.9138228297233582, + "learning_rate": 9.477375837122182e-06, + "loss": 1.7329, + "mean_token_accuracy": 0.5767004489898682, + "num_tokens": 5364747896.0, + "step": 10494 + }, + { + "epoch": 2.8380205516495405, + "grad_norm": 1.215599775314331, + "learning_rate": 9.475822096233773e-06, + "loss": 1.8336, + "mean_token_accuracy": 0.602547287940979, + "num_tokens": 5365207778.0, + "step": 10495 + }, + { + "epoch": 2.8382909680908597, + "grad_norm": 0.8974266052246094, + "learning_rate": 9.474268402111622e-06, + "loss": 1.809, + "mean_token_accuracy": 0.5666565299034119, + "num_tokens": 5365731968.0, + "step": 10496 + }, + { + "epoch": 2.8385613845321798, + "grad_norm": 1.2313446998596191, + "learning_rate": 9.472714754803406e-06, + "loss": 1.7519, + "mean_token_accuracy": 0.6063660383224487, + "num_tokens": 5366208027.0, + "step": 10497 + }, + { + "epoch": 2.838831800973499, + "grad_norm": 0.892387866973877, + "learning_rate": 9.471161154356791e-06, + "loss": 1.9357, + "mean_token_accuracy": 0.5597569942474365, + "num_tokens": 5366732285.0, + "step": 10498 + }, + { + "epoch": 2.839102217414819, + "grad_norm": 0.8480839729309082, + "learning_rate": 9.469607600819445e-06, + "loss": 1.9418, + "mean_token_accuracy": 0.539657711982727, + "num_tokens": 5367256429.0, + "step": 10499 + }, + { + "epoch": 2.8393726338561383, + "grad_norm": 1.0023119449615479, + "learning_rate": 9.46805409423904e-06, + "loss": 1.799, + "mean_token_accuracy": 0.5694456100463867, + "num_tokens": 5367757064.0, + "step": 10500 + }, + { + "epoch": 2.8396430502974583, + "grad_norm": 0.40478357672691345, + "learning_rate": 9.466500634663244e-06, + "loss": 1.1946, + "mean_token_accuracy": 0.6915343999862671, + "num_tokens": 5368246857.0, + "step": 10501 + }, + { + "epoch": 2.8399134667387775, + "grad_norm": 1.3028939962387085, + "learning_rate": 9.464947222139713e-06, + "loss": 1.9487, + "mean_token_accuracy": 0.5666953325271606, + "num_tokens": 5368771111.0, + "step": 10502 + }, + { + "epoch": 2.8401838831800976, + "grad_norm": 1.0992839336395264, + "learning_rate": 9.46339385671612e-06, + "loss": 1.9711, + "mean_token_accuracy": 0.5653135776519775, + "num_tokens": 5369290477.0, + "step": 10503 + }, + { + "epoch": 2.840454299621417, + "grad_norm": 0.9098402261734009, + "learning_rate": 9.461840538440124e-06, + "loss": 1.8614, + "mean_token_accuracy": 0.5733178853988647, + "num_tokens": 5369814680.0, + "step": 10504 + }, + { + "epoch": 2.8407247160627365, + "grad_norm": 1.4261219501495361, + "learning_rate": 9.460287267359378e-06, + "loss": 1.7637, + "mean_token_accuracy": 0.5844855308532715, + "num_tokens": 5370338730.0, + "step": 10505 + }, + { + "epoch": 2.840995132504056, + "grad_norm": 1.2280194759368896, + "learning_rate": 9.458734043521548e-06, + "loss": 1.9319, + "mean_token_accuracy": 0.5610462427139282, + "num_tokens": 5370807109.0, + "step": 10506 + }, + { + "epoch": 2.8412655489453758, + "grad_norm": 1.113058090209961, + "learning_rate": 9.45718086697429e-06, + "loss": 1.8808, + "mean_token_accuracy": 0.5726819038391113, + "num_tokens": 5371273379.0, + "step": 10507 + }, + { + "epoch": 2.8415359653866954, + "grad_norm": 1.0744342803955078, + "learning_rate": 9.455627737765262e-06, + "loss": 1.9171, + "mean_token_accuracy": 0.5378504991531372, + "num_tokens": 5371797657.0, + "step": 10508 + }, + { + "epoch": 2.841806381828015, + "grad_norm": 1.1701788902282715, + "learning_rate": 9.454074655942118e-06, + "loss": 1.8784, + "mean_token_accuracy": 0.5706014633178711, + "num_tokens": 5372321873.0, + "step": 10509 + }, + { + "epoch": 2.8420767982693347, + "grad_norm": 1.1573950052261353, + "learning_rate": 9.452521621552504e-06, + "loss": 1.8286, + "mean_token_accuracy": 0.5810832977294922, + "num_tokens": 5372846042.0, + "step": 10510 + }, + { + "epoch": 2.8423472147106543, + "grad_norm": 1.139780044555664, + "learning_rate": 9.450968634644085e-06, + "loss": 1.9025, + "mean_token_accuracy": 0.5612082481384277, + "num_tokens": 5373362766.0, + "step": 10511 + }, + { + "epoch": 2.842617631151974, + "grad_norm": 1.125240445137024, + "learning_rate": 9.449415695264502e-06, + "loss": 1.8218, + "mean_token_accuracy": 0.6037510633468628, + "num_tokens": 5373886918.0, + "step": 10512 + }, + { + "epoch": 2.8428880475932936, + "grad_norm": 1.1598153114318848, + "learning_rate": 9.4478628034614e-06, + "loss": 1.9319, + "mean_token_accuracy": 0.5505446195602417, + "num_tokens": 5374411137.0, + "step": 10513 + }, + { + "epoch": 2.8431584640346133, + "grad_norm": 1.188469648361206, + "learning_rate": 9.446309959282435e-06, + "loss": 1.9791, + "mean_token_accuracy": 0.5579570531845093, + "num_tokens": 5374935315.0, + "step": 10514 + }, + { + "epoch": 2.843428880475933, + "grad_norm": 1.0556437969207764, + "learning_rate": 9.44475716277525e-06, + "loss": 1.8139, + "mean_token_accuracy": 0.5882564783096313, + "num_tokens": 5375459429.0, + "step": 10515 + }, + { + "epoch": 2.8436992969172525, + "grad_norm": 1.0527408123016357, + "learning_rate": 9.443204413987486e-06, + "loss": 1.9057, + "mean_token_accuracy": 0.5659910440444946, + "num_tokens": 5375952235.0, + "step": 10516 + }, + { + "epoch": 2.843969713358572, + "grad_norm": 1.5608218908309937, + "learning_rate": 9.441651712966796e-06, + "loss": 1.521, + "mean_token_accuracy": 0.6338317394256592, + "num_tokens": 5376476324.0, + "step": 10517 + }, + { + "epoch": 2.844240129799892, + "grad_norm": 1.0620802640914917, + "learning_rate": 9.44009905976081e-06, + "loss": 1.8227, + "mean_token_accuracy": 0.582261860370636, + "num_tokens": 5377000492.0, + "step": 10518 + }, + { + "epoch": 2.8445105462412115, + "grad_norm": 0.9914897084236145, + "learning_rate": 9.438546454417168e-06, + "loss": 1.7625, + "mean_token_accuracy": 0.5813374519348145, + "num_tokens": 5377524640.0, + "step": 10519 + }, + { + "epoch": 2.844780962682531, + "grad_norm": 0.9190157651901245, + "learning_rate": 9.436993896983519e-06, + "loss": 1.7352, + "mean_token_accuracy": 0.6116477251052856, + "num_tokens": 5377991479.0, + "step": 10520 + }, + { + "epoch": 2.8450513791238508, + "grad_norm": 0.5287964940071106, + "learning_rate": 9.435441387507492e-06, + "loss": 1.1441, + "mean_token_accuracy": 0.6965402364730835, + "num_tokens": 5378485343.0, + "step": 10521 + }, + { + "epoch": 2.8453217955651704, + "grad_norm": 1.1232422590255737, + "learning_rate": 9.43388892603672e-06, + "loss": 1.9087, + "mean_token_accuracy": 0.5632129907608032, + "num_tokens": 5378916236.0, + "step": 10522 + }, + { + "epoch": 2.84559221200649, + "grad_norm": 1.1373836994171143, + "learning_rate": 9.432336512618847e-06, + "loss": 1.8284, + "mean_token_accuracy": 0.581238865852356, + "num_tokens": 5379440515.0, + "step": 10523 + }, + { + "epoch": 2.8458626284478097, + "grad_norm": 1.1789542436599731, + "learning_rate": 9.430784147301499e-06, + "loss": 1.8936, + "mean_token_accuracy": 0.5671881437301636, + "num_tokens": 5379964682.0, + "step": 10524 + }, + { + "epoch": 2.8461330448891293, + "grad_norm": 0.8568485379219055, + "learning_rate": 9.429231830132306e-06, + "loss": 1.8853, + "mean_token_accuracy": 0.5576885342597961, + "num_tokens": 5380488880.0, + "step": 10525 + }, + { + "epoch": 2.846403461330449, + "grad_norm": 1.0890827178955078, + "learning_rate": 9.427679561158903e-06, + "loss": 1.8794, + "mean_token_accuracy": 0.5801479816436768, + "num_tokens": 5381013111.0, + "step": 10526 + }, + { + "epoch": 2.8466738777717686, + "grad_norm": 1.03863525390625, + "learning_rate": 9.426127340428913e-06, + "loss": 1.9423, + "mean_token_accuracy": 0.5543172359466553, + "num_tokens": 5381537159.0, + "step": 10527 + }, + { + "epoch": 2.8469442942130883, + "grad_norm": 0.9392262101173401, + "learning_rate": 9.424575167989962e-06, + "loss": 1.7838, + "mean_token_accuracy": 0.5753014087677002, + "num_tokens": 5382061359.0, + "step": 10528 + }, + { + "epoch": 2.847214710654408, + "grad_norm": 0.9916481375694275, + "learning_rate": 9.42302304388968e-06, + "loss": 1.8553, + "mean_token_accuracy": 0.5766869783401489, + "num_tokens": 5382585506.0, + "step": 10529 + }, + { + "epoch": 2.8474851270957275, + "grad_norm": 1.0584087371826172, + "learning_rate": 9.421470968175688e-06, + "loss": 1.8212, + "mean_token_accuracy": 0.5728216767311096, + "num_tokens": 5383104293.0, + "step": 10530 + }, + { + "epoch": 2.847755543537047, + "grad_norm": 0.939959704875946, + "learning_rate": 9.419918940895611e-06, + "loss": 1.8719, + "mean_token_accuracy": 0.5810838341712952, + "num_tokens": 5383628307.0, + "step": 10531 + }, + { + "epoch": 2.848025959978367, + "grad_norm": 1.1041831970214844, + "learning_rate": 9.418366962097067e-06, + "loss": 1.8776, + "mean_token_accuracy": 0.5603125095367432, + "num_tokens": 5384152520.0, + "step": 10532 + }, + { + "epoch": 2.8482963764196865, + "grad_norm": 0.8854967951774597, + "learning_rate": 9.416815031827675e-06, + "loss": 1.7504, + "mean_token_accuracy": 0.5759953260421753, + "num_tokens": 5384676700.0, + "step": 10533 + }, + { + "epoch": 2.848566792861006, + "grad_norm": 1.2712955474853516, + "learning_rate": 9.415263150135057e-06, + "loss": 1.9129, + "mean_token_accuracy": 0.5793578624725342, + "num_tokens": 5385168781.0, + "step": 10534 + }, + { + "epoch": 2.8488372093023253, + "grad_norm": 1.0448235273361206, + "learning_rate": 9.41371131706682e-06, + "loss": 1.8493, + "mean_token_accuracy": 0.5821122527122498, + "num_tokens": 5385693016.0, + "step": 10535 + }, + { + "epoch": 2.8491076257436454, + "grad_norm": 1.0978211164474487, + "learning_rate": 9.412159532670588e-06, + "loss": 1.8106, + "mean_token_accuracy": 0.5583933591842651, + "num_tokens": 5386217228.0, + "step": 10536 + }, + { + "epoch": 2.8493780421849646, + "grad_norm": 1.0194728374481201, + "learning_rate": 9.410607796993971e-06, + "loss": 1.7053, + "mean_token_accuracy": 0.5968579649925232, + "num_tokens": 5386741508.0, + "step": 10537 + }, + { + "epoch": 2.8496484586262847, + "grad_norm": 0.9328333735466003, + "learning_rate": 9.409056110084582e-06, + "loss": 1.8608, + "mean_token_accuracy": 0.5697171688079834, + "num_tokens": 5387265531.0, + "step": 10538 + }, + { + "epoch": 2.849918875067604, + "grad_norm": 1.0512357950210571, + "learning_rate": 9.407504471990025e-06, + "loss": 1.9013, + "mean_token_accuracy": 0.5619459748268127, + "num_tokens": 5387789726.0, + "step": 10539 + }, + { + "epoch": 2.850189291508924, + "grad_norm": 1.1680865287780762, + "learning_rate": 9.40595288275792e-06, + "loss": 1.9949, + "mean_token_accuracy": 0.5448955297470093, + "num_tokens": 5388313922.0, + "step": 10540 + }, + { + "epoch": 2.850459707950243, + "grad_norm": 0.38772156834602356, + "learning_rate": 9.40440134243587e-06, + "loss": 1.1081, + "mean_token_accuracy": 0.7180701494216919, + "num_tokens": 5388838081.0, + "step": 10541 + }, + { + "epoch": 2.8507301243915633, + "grad_norm": 1.371286153793335, + "learning_rate": 9.402849851071471e-06, + "loss": 1.8383, + "mean_token_accuracy": 0.5740259885787964, + "num_tokens": 5389362311.0, + "step": 10542 + }, + { + "epoch": 2.8510005408328825, + "grad_norm": 1.2478461265563965, + "learning_rate": 9.40129840871234e-06, + "loss": 1.8309, + "mean_token_accuracy": 0.5736398696899414, + "num_tokens": 5389886506.0, + "step": 10543 + }, + { + "epoch": 2.8512709572742025, + "grad_norm": 1.055299997329712, + "learning_rate": 9.399747015406077e-06, + "loss": 1.8246, + "mean_token_accuracy": 0.570199728012085, + "num_tokens": 5390403154.0, + "step": 10544 + }, + { + "epoch": 2.8515413737155217, + "grad_norm": 1.1811200380325317, + "learning_rate": 9.398195671200279e-06, + "loss": 1.9304, + "mean_token_accuracy": 0.555916428565979, + "num_tokens": 5390927417.0, + "step": 10545 + }, + { + "epoch": 2.8518117901568414, + "grad_norm": 1.245727777481079, + "learning_rate": 9.396644376142549e-06, + "loss": 1.9705, + "mean_token_accuracy": 0.5593802332878113, + "num_tokens": 5391451498.0, + "step": 10546 + }, + { + "epoch": 2.852082206598161, + "grad_norm": 1.0956089496612549, + "learning_rate": 9.395093130280488e-06, + "loss": 1.8332, + "mean_token_accuracy": 0.5561259388923645, + "num_tokens": 5391975764.0, + "step": 10547 + }, + { + "epoch": 2.8523526230394807, + "grad_norm": 1.2605470418930054, + "learning_rate": 9.393541933661685e-06, + "loss": 1.9199, + "mean_token_accuracy": 0.5585788488388062, + "num_tokens": 5392500013.0, + "step": 10548 + }, + { + "epoch": 2.8526230394808003, + "grad_norm": 1.4112043380737305, + "learning_rate": 9.391990786333743e-06, + "loss": 1.9201, + "mean_token_accuracy": 0.5730599164962769, + "num_tokens": 5392998787.0, + "step": 10549 + }, + { + "epoch": 2.85289345592212, + "grad_norm": 1.1755105257034302, + "learning_rate": 9.390439688344248e-06, + "loss": 1.8482, + "mean_token_accuracy": 0.56058669090271, + "num_tokens": 5393472707.0, + "step": 10550 + }, + { + "epoch": 2.8531638723634396, + "grad_norm": 1.1952075958251953, + "learning_rate": 9.388888639740798e-06, + "loss": 1.8533, + "mean_token_accuracy": 0.5895562767982483, + "num_tokens": 5393904506.0, + "step": 10551 + }, + { + "epoch": 2.8534342888047592, + "grad_norm": 1.1516553163528442, + "learning_rate": 9.387337640570986e-06, + "loss": 1.9563, + "mean_token_accuracy": 0.5428513884544373, + "num_tokens": 5394428669.0, + "step": 10552 + }, + { + "epoch": 2.853704705246079, + "grad_norm": 1.219749093055725, + "learning_rate": 9.385786690882396e-06, + "loss": 1.8062, + "mean_token_accuracy": 0.5724267959594727, + "num_tokens": 5394952805.0, + "step": 10553 + }, + { + "epoch": 2.8539751216873985, + "grad_norm": 1.3082226514816284, + "learning_rate": 9.384235790722615e-06, + "loss": 1.9001, + "mean_token_accuracy": 0.5580102801322937, + "num_tokens": 5395476998.0, + "step": 10554 + }, + { + "epoch": 2.854245538128718, + "grad_norm": 1.412047028541565, + "learning_rate": 9.382684940139238e-06, + "loss": 1.843, + "mean_token_accuracy": 0.5784054398536682, + "num_tokens": 5396001244.0, + "step": 10555 + }, + { + "epoch": 2.854515954570038, + "grad_norm": 1.2553834915161133, + "learning_rate": 9.381134139179835e-06, + "loss": 1.8484, + "mean_token_accuracy": 0.5786357522010803, + "num_tokens": 5396519051.0, + "step": 10556 + }, + { + "epoch": 2.8547863710113575, + "grad_norm": 1.3750146627426147, + "learning_rate": 9.379583387892002e-06, + "loss": 1.9132, + "mean_token_accuracy": 0.5643690824508667, + "num_tokens": 5397038694.0, + "step": 10557 + }, + { + "epoch": 2.855056787452677, + "grad_norm": 1.4665066003799438, + "learning_rate": 9.378032686323318e-06, + "loss": 1.8654, + "mean_token_accuracy": 0.5567762851715088, + "num_tokens": 5397562900.0, + "step": 10558 + }, + { + "epoch": 2.8553272038939967, + "grad_norm": 1.095513105392456, + "learning_rate": 9.376482034521354e-06, + "loss": 1.8715, + "mean_token_accuracy": 0.5656301975250244, + "num_tokens": 5398087178.0, + "step": 10559 + }, + { + "epoch": 2.8555976203353164, + "grad_norm": 1.069022297859192, + "learning_rate": 9.374931432533702e-06, + "loss": 1.8671, + "mean_token_accuracy": 0.5829856395721436, + "num_tokens": 5398611416.0, + "step": 10560 + }, + { + "epoch": 2.855868036776636, + "grad_norm": 0.40314149856567383, + "learning_rate": 9.373380880407932e-06, + "loss": 1.039, + "mean_token_accuracy": 0.7232836484909058, + "num_tokens": 5399091166.0, + "step": 10561 + }, + { + "epoch": 2.8561384532179557, + "grad_norm": 1.685381293296814, + "learning_rate": 9.371830378191615e-06, + "loss": 1.8843, + "mean_token_accuracy": 0.5744337439537048, + "num_tokens": 5399535057.0, + "step": 10562 + }, + { + "epoch": 2.8564088696592753, + "grad_norm": 1.7466857433319092, + "learning_rate": 9.370279925932337e-06, + "loss": 1.7978, + "mean_token_accuracy": 0.5875148177146912, + "num_tokens": 5399993814.0, + "step": 10563 + }, + { + "epoch": 2.856679286100595, + "grad_norm": 1.3416656255722046, + "learning_rate": 9.36872952367766e-06, + "loss": 2.0021, + "mean_token_accuracy": 0.5502833127975464, + "num_tokens": 5400463470.0, + "step": 10564 + }, + { + "epoch": 2.8569497025419146, + "grad_norm": 1.0296456813812256, + "learning_rate": 9.367179171475156e-06, + "loss": 1.8256, + "mean_token_accuracy": 0.5728234052658081, + "num_tokens": 5400987751.0, + "step": 10565 + }, + { + "epoch": 2.8572201189832342, + "grad_norm": 1.3643558025360107, + "learning_rate": 9.365628869372403e-06, + "loss": 1.6947, + "mean_token_accuracy": 0.5910547375679016, + "num_tokens": 5401511990.0, + "step": 10566 + }, + { + "epoch": 2.857490535424554, + "grad_norm": 1.5903234481811523, + "learning_rate": 9.364078617416961e-06, + "loss": 1.9706, + "mean_token_accuracy": 0.5663021802902222, + "num_tokens": 5401979898.0, + "step": 10567 + }, + { + "epoch": 2.8577609518658735, + "grad_norm": 1.2832221984863281, + "learning_rate": 9.362528415656396e-06, + "loss": 1.9261, + "mean_token_accuracy": 0.5735183954238892, + "num_tokens": 5402497947.0, + "step": 10568 + }, + { + "epoch": 2.858031368307193, + "grad_norm": 1.576377511024475, + "learning_rate": 9.360978264138274e-06, + "loss": 1.6447, + "mean_token_accuracy": 0.6233843564987183, + "num_tokens": 5403022192.0, + "step": 10569 + }, + { + "epoch": 2.858301784748513, + "grad_norm": 1.3365164995193481, + "learning_rate": 9.35942816291016e-06, + "loss": 1.9517, + "mean_token_accuracy": 0.5731362104415894, + "num_tokens": 5403506769.0, + "step": 10570 + }, + { + "epoch": 2.8585722011898325, + "grad_norm": 1.1437015533447266, + "learning_rate": 9.357878112019613e-06, + "loss": 1.933, + "mean_token_accuracy": 0.569340705871582, + "num_tokens": 5404020694.0, + "step": 10571 + }, + { + "epoch": 2.858842617631152, + "grad_norm": 0.9782375693321228, + "learning_rate": 9.356328111514199e-06, + "loss": 1.8105, + "mean_token_accuracy": 0.5905028581619263, + "num_tokens": 5404493598.0, + "step": 10572 + }, + { + "epoch": 2.8591130340724717, + "grad_norm": 1.1672357320785522, + "learning_rate": 9.354778161441472e-06, + "loss": 1.9008, + "mean_token_accuracy": 0.5610029697418213, + "num_tokens": 5405017851.0, + "step": 10573 + }, + { + "epoch": 2.8593834505137914, + "grad_norm": 1.2044765949249268, + "learning_rate": 9.353228261848984e-06, + "loss": 1.7944, + "mean_token_accuracy": 0.5774771571159363, + "num_tokens": 5405542076.0, + "step": 10574 + }, + { + "epoch": 2.859653866955111, + "grad_norm": 1.2247092723846436, + "learning_rate": 9.351678412784299e-06, + "loss": 1.9191, + "mean_token_accuracy": 0.568819522857666, + "num_tokens": 5406066280.0, + "step": 10575 + }, + { + "epoch": 2.8599242833964302, + "grad_norm": 1.1885654926300049, + "learning_rate": 9.350128614294965e-06, + "loss": 1.8375, + "mean_token_accuracy": 0.5592458248138428, + "num_tokens": 5406588177.0, + "step": 10576 + }, + { + "epoch": 2.8601946998377503, + "grad_norm": 1.1331610679626465, + "learning_rate": 9.348578866428542e-06, + "loss": 1.7978, + "mean_token_accuracy": 0.5843521356582642, + "num_tokens": 5407112459.0, + "step": 10577 + }, + { + "epoch": 2.8604651162790695, + "grad_norm": 1.140037178993225, + "learning_rate": 9.347029169232573e-06, + "loss": 1.8502, + "mean_token_accuracy": 0.5716297626495361, + "num_tokens": 5407636736.0, + "step": 10578 + }, + { + "epoch": 2.8607355327203896, + "grad_norm": 1.173925518989563, + "learning_rate": 9.34547952275461e-06, + "loss": 1.8298, + "mean_token_accuracy": 0.5910590291023254, + "num_tokens": 5408137184.0, + "step": 10579 + }, + { + "epoch": 2.861005949161709, + "grad_norm": 1.1282566785812378, + "learning_rate": 9.343929927042202e-06, + "loss": 1.7268, + "mean_token_accuracy": 0.5998874306678772, + "num_tokens": 5408639430.0, + "step": 10580 + }, + { + "epoch": 2.861276365603029, + "grad_norm": 0.43275725841522217, + "learning_rate": 9.342380382142893e-06, + "loss": 1.1397, + "mean_token_accuracy": 0.6982516050338745, + "num_tokens": 5409152806.0, + "step": 10581 + }, + { + "epoch": 2.861546782044348, + "grad_norm": 1.1583184003829956, + "learning_rate": 9.340830888104227e-06, + "loss": 1.863, + "mean_token_accuracy": 0.5816230773925781, + "num_tokens": 5409677019.0, + "step": 10582 + }, + { + "epoch": 2.861817198485668, + "grad_norm": 1.3179725408554077, + "learning_rate": 9.339281444973751e-06, + "loss": 1.8791, + "mean_token_accuracy": 0.5669187307357788, + "num_tokens": 5410201283.0, + "step": 10583 + }, + { + "epoch": 2.8620876149269874, + "grad_norm": 1.16155207157135, + "learning_rate": 9.337732052799007e-06, + "loss": 1.905, + "mean_token_accuracy": 0.5888878703117371, + "num_tokens": 5410675517.0, + "step": 10584 + }, + { + "epoch": 2.8623580313683075, + "grad_norm": 1.0635300874710083, + "learning_rate": 9.336182711627525e-06, + "loss": 1.7396, + "mean_token_accuracy": 0.5923559665679932, + "num_tokens": 5411199770.0, + "step": 10585 + }, + { + "epoch": 2.8626284478096267, + "grad_norm": 1.1408367156982422, + "learning_rate": 9.334633421506856e-06, + "loss": 1.8637, + "mean_token_accuracy": 0.5699411630630493, + "num_tokens": 5411724030.0, + "step": 10586 + }, + { + "epoch": 2.8628988642509463, + "grad_norm": 2.1590380668640137, + "learning_rate": 9.333084182484531e-06, + "loss": 1.8219, + "mean_token_accuracy": 0.5623629093170166, + "num_tokens": 5412248039.0, + "step": 10587 + }, + { + "epoch": 2.863169280692266, + "grad_norm": 1.5211939811706543, + "learning_rate": 9.33153499460808e-06, + "loss": 1.8628, + "mean_token_accuracy": 0.5810884237289429, + "num_tokens": 5412734900.0, + "step": 10588 + }, + { + "epoch": 2.8634396971335856, + "grad_norm": 1.2734777927398682, + "learning_rate": 9.329985857925048e-06, + "loss": 1.8639, + "mean_token_accuracy": 0.5677374601364136, + "num_tokens": 5413259032.0, + "step": 10589 + }, + { + "epoch": 2.8637101135749052, + "grad_norm": 1.4497504234313965, + "learning_rate": 9.32843677248296e-06, + "loss": 1.972, + "mean_token_accuracy": 0.5519668459892273, + "num_tokens": 5413783236.0, + "step": 10590 + }, + { + "epoch": 2.863980530016225, + "grad_norm": 1.0437088012695312, + "learning_rate": 9.326887738329345e-06, + "loss": 1.8768, + "mean_token_accuracy": 0.5718122124671936, + "num_tokens": 5414307498.0, + "step": 10591 + }, + { + "epoch": 2.8642509464575445, + "grad_norm": 1.0402483940124512, + "learning_rate": 9.325338755511735e-06, + "loss": 1.868, + "mean_token_accuracy": 0.585983157157898, + "num_tokens": 5414780898.0, + "step": 10592 + }, + { + "epoch": 2.864521362898864, + "grad_norm": 1.2900885343551636, + "learning_rate": 9.32378982407766e-06, + "loss": 1.9871, + "mean_token_accuracy": 0.5380803346633911, + "num_tokens": 5415305086.0, + "step": 10593 + }, + { + "epoch": 2.864791779340184, + "grad_norm": 1.3489073514938354, + "learning_rate": 9.322240944074637e-06, + "loss": 1.9648, + "mean_token_accuracy": 0.5395974516868591, + "num_tokens": 5415829365.0, + "step": 10594 + }, + { + "epoch": 2.8650621957815035, + "grad_norm": 0.9976826906204224, + "learning_rate": 9.3206921155502e-06, + "loss": 1.9395, + "mean_token_accuracy": 0.5452290773391724, + "num_tokens": 5416353613.0, + "step": 10595 + }, + { + "epoch": 2.865332612222823, + "grad_norm": 1.190380573272705, + "learning_rate": 9.319143338551869e-06, + "loss": 1.8065, + "mean_token_accuracy": 0.5848370790481567, + "num_tokens": 5416816990.0, + "step": 10596 + }, + { + "epoch": 2.8656030286641427, + "grad_norm": 1.3543444871902466, + "learning_rate": 9.31759461312716e-06, + "loss": 1.8781, + "mean_token_accuracy": 0.5741410851478577, + "num_tokens": 5417319409.0, + "step": 10597 + }, + { + "epoch": 2.8658734451054624, + "grad_norm": 0.9582384824752808, + "learning_rate": 9.316045939323597e-06, + "loss": 1.7862, + "mean_token_accuracy": 0.5879493951797485, + "num_tokens": 5417843542.0, + "step": 10598 + }, + { + "epoch": 2.866143861546782, + "grad_norm": 1.1049925088882446, + "learning_rate": 9.314497317188693e-06, + "loss": 1.6859, + "mean_token_accuracy": 0.6179754137992859, + "num_tokens": 5418367726.0, + "step": 10599 + }, + { + "epoch": 2.8664142779881017, + "grad_norm": 1.2152913808822632, + "learning_rate": 9.312948746769972e-06, + "loss": 1.8314, + "mean_token_accuracy": 0.5755949020385742, + "num_tokens": 5418891760.0, + "step": 10600 + }, + { + "epoch": 2.8666846944294213, + "grad_norm": 0.4641282856464386, + "learning_rate": 9.311400228114946e-06, + "loss": 1.091, + "mean_token_accuracy": 0.7066068053245544, + "num_tokens": 5419416008.0, + "step": 10601 + }, + { + "epoch": 2.866955110870741, + "grad_norm": 1.2536594867706299, + "learning_rate": 9.309851761271122e-06, + "loss": 1.8031, + "mean_token_accuracy": 0.5678638219833374, + "num_tokens": 5419940109.0, + "step": 10602 + }, + { + "epoch": 2.8672255273120606, + "grad_norm": 1.086621642112732, + "learning_rate": 9.308303346286022e-06, + "loss": 1.6655, + "mean_token_accuracy": 0.6252385377883911, + "num_tokens": 5420431059.0, + "step": 10603 + }, + { + "epoch": 2.8674959437533802, + "grad_norm": 0.840016782283783, + "learning_rate": 9.30675498320715e-06, + "loss": 1.8671, + "mean_token_accuracy": 0.5708109736442566, + "num_tokens": 5420915799.0, + "step": 10604 + }, + { + "epoch": 2.8677663601947, + "grad_norm": 1.1215622425079346, + "learning_rate": 9.30520667208201e-06, + "loss": 1.8588, + "mean_token_accuracy": 0.5678755044937134, + "num_tokens": 5421440071.0, + "step": 10605 + }, + { + "epoch": 2.8680367766360195, + "grad_norm": 1.0320314168930054, + "learning_rate": 9.303658412958119e-06, + "loss": 1.9568, + "mean_token_accuracy": 0.5378696918487549, + "num_tokens": 5421964274.0, + "step": 10606 + }, + { + "epoch": 2.868307193077339, + "grad_norm": 1.0185836553573608, + "learning_rate": 9.302110205882973e-06, + "loss": 1.8815, + "mean_token_accuracy": 0.5814206004142761, + "num_tokens": 5422432679.0, + "step": 10607 + }, + { + "epoch": 2.868577609518659, + "grad_norm": 0.9977190494537354, + "learning_rate": 9.300562050904076e-06, + "loss": 1.757, + "mean_token_accuracy": 0.5992262959480286, + "num_tokens": 5422956553.0, + "step": 10608 + }, + { + "epoch": 2.8688480259599785, + "grad_norm": 1.2115790843963623, + "learning_rate": 9.299013948068937e-06, + "loss": 1.8865, + "mean_token_accuracy": 0.5715102553367615, + "num_tokens": 5423480825.0, + "step": 10609 + }, + { + "epoch": 2.869118442401298, + "grad_norm": 1.0089854001998901, + "learning_rate": 9.297465897425056e-06, + "loss": 1.8179, + "mean_token_accuracy": 0.5667478442192078, + "num_tokens": 5424004996.0, + "step": 10610 + }, + { + "epoch": 2.8693888588426177, + "grad_norm": 1.6361664533615112, + "learning_rate": 9.29591789901992e-06, + "loss": 1.9975, + "mean_token_accuracy": 0.5491909980773926, + "num_tokens": 5424529281.0, + "step": 10611 + }, + { + "epoch": 2.8696592752839374, + "grad_norm": 1.3046845197677612, + "learning_rate": 9.294369952901041e-06, + "loss": 1.8154, + "mean_token_accuracy": 0.5726988911628723, + "num_tokens": 5425053536.0, + "step": 10612 + }, + { + "epoch": 2.869929691725257, + "grad_norm": 1.1005278825759888, + "learning_rate": 9.292822059115905e-06, + "loss": 1.8244, + "mean_token_accuracy": 0.5872968435287476, + "num_tokens": 5425577795.0, + "step": 10613 + }, + { + "epoch": 2.8702001081665767, + "grad_norm": 1.1656861305236816, + "learning_rate": 9.291274217712007e-06, + "loss": 1.8276, + "mean_token_accuracy": 0.5790456533432007, + "num_tokens": 5426071558.0, + "step": 10614 + }, + { + "epoch": 2.8704705246078963, + "grad_norm": 0.9873573184013367, + "learning_rate": 9.289726428736842e-06, + "loss": 1.8699, + "mean_token_accuracy": 0.5724727511405945, + "num_tokens": 5426532366.0, + "step": 10615 + }, + { + "epoch": 2.870740941049216, + "grad_norm": 1.0368890762329102, + "learning_rate": 9.288178692237903e-06, + "loss": 1.7574, + "mean_token_accuracy": 0.5661799907684326, + "num_tokens": 5427056547.0, + "step": 10616 + }, + { + "epoch": 2.871011357490535, + "grad_norm": 0.9952079057693481, + "learning_rate": 9.286631008262669e-06, + "loss": 1.7805, + "mean_token_accuracy": 0.5826289057731628, + "num_tokens": 5427517375.0, + "step": 10617 + }, + { + "epoch": 2.8712817739318552, + "grad_norm": 1.2198525667190552, + "learning_rate": 9.285083376858642e-06, + "loss": 1.8758, + "mean_token_accuracy": 0.5571889877319336, + "num_tokens": 5428041648.0, + "step": 10618 + }, + { + "epoch": 2.8715521903731744, + "grad_norm": 1.1383510828018188, + "learning_rate": 9.283535798073297e-06, + "loss": 1.9362, + "mean_token_accuracy": 0.5543344616889954, + "num_tokens": 5428557873.0, + "step": 10619 + }, + { + "epoch": 2.8718226068144945, + "grad_norm": 1.146649956703186, + "learning_rate": 9.281988271954121e-06, + "loss": 1.9176, + "mean_token_accuracy": 0.570095419883728, + "num_tokens": 5429074294.0, + "step": 10620 + }, + { + "epoch": 2.8720930232558137, + "grad_norm": 0.48642322421073914, + "learning_rate": 9.280440798548596e-06, + "loss": 1.0605, + "mean_token_accuracy": 0.7071443200111389, + "num_tokens": 5429549752.0, + "step": 10621 + }, + { + "epoch": 2.872363439697134, + "grad_norm": 1.3825591802597046, + "learning_rate": 9.278893377904203e-06, + "loss": 1.7952, + "mean_token_accuracy": 0.5857987403869629, + "num_tokens": 5430066657.0, + "step": 10622 + }, + { + "epoch": 2.872633856138453, + "grad_norm": 1.1972730159759521, + "learning_rate": 9.277346010068426e-06, + "loss": 2.021, + "mean_token_accuracy": 0.5377224087715149, + "num_tokens": 5430590727.0, + "step": 10623 + }, + { + "epoch": 2.872904272579773, + "grad_norm": 1.158820390701294, + "learning_rate": 9.275798695088743e-06, + "loss": 1.64, + "mean_token_accuracy": 0.6227075457572937, + "num_tokens": 5431114853.0, + "step": 10624 + }, + { + "epoch": 2.8731746890210923, + "grad_norm": 1.0195814371109009, + "learning_rate": 9.27425143301262e-06, + "loss": 1.8416, + "mean_token_accuracy": 0.5947113037109375, + "num_tokens": 5431523871.0, + "step": 10625 + }, + { + "epoch": 2.8734451054624124, + "grad_norm": 1.2513363361358643, + "learning_rate": 9.272704223887541e-06, + "loss": 1.9095, + "mean_token_accuracy": 0.560732364654541, + "num_tokens": 5432048151.0, + "step": 10626 + }, + { + "epoch": 2.8737155219037316, + "grad_norm": 1.2599709033966064, + "learning_rate": 9.271157067760977e-06, + "loss": 1.8079, + "mean_token_accuracy": 0.5917931795120239, + "num_tokens": 5432572378.0, + "step": 10627 + }, + { + "epoch": 2.8739859383450512, + "grad_norm": 1.443292260169983, + "learning_rate": 9.269609964680396e-06, + "loss": 1.6928, + "mean_token_accuracy": 0.6058247685432434, + "num_tokens": 5433096636.0, + "step": 10628 + }, + { + "epoch": 2.874256354786371, + "grad_norm": 1.4653242826461792, + "learning_rate": 9.26806291469327e-06, + "loss": 1.8695, + "mean_token_accuracy": 0.5779860019683838, + "num_tokens": 5433575188.0, + "step": 10629 + }, + { + "epoch": 2.8745267712276905, + "grad_norm": 1.430120825767517, + "learning_rate": 9.26651591784707e-06, + "loss": 1.866, + "mean_token_accuracy": 0.5688431262969971, + "num_tokens": 5434099444.0, + "step": 10630 + }, + { + "epoch": 2.87479718766901, + "grad_norm": 1.0634125471115112, + "learning_rate": 9.264968974189255e-06, + "loss": 2.0265, + "mean_token_accuracy": 0.5221819877624512, + "num_tokens": 5434623547.0, + "step": 10631 + }, + { + "epoch": 2.87506760411033, + "grad_norm": 1.2295924425125122, + "learning_rate": 9.263422083767297e-06, + "loss": 1.8156, + "mean_token_accuracy": 0.5925778150558472, + "num_tokens": 5435084524.0, + "step": 10632 + }, + { + "epoch": 2.8753380205516494, + "grad_norm": 1.1238150596618652, + "learning_rate": 9.261875246628657e-06, + "loss": 1.765, + "mean_token_accuracy": 0.5841159820556641, + "num_tokens": 5435608702.0, + "step": 10633 + }, + { + "epoch": 2.875608436992969, + "grad_norm": 0.8513825535774231, + "learning_rate": 9.260328462820793e-06, + "loss": 1.7357, + "mean_token_accuracy": 0.6056274175643921, + "num_tokens": 5436132712.0, + "step": 10634 + }, + { + "epoch": 2.8758788534342887, + "grad_norm": 1.3665581941604614, + "learning_rate": 9.258781732391171e-06, + "loss": 1.7817, + "mean_token_accuracy": 0.5685621500015259, + "num_tokens": 5436656924.0, + "step": 10635 + }, + { + "epoch": 2.8761492698756084, + "grad_norm": 1.358972191810608, + "learning_rate": 9.257235055387246e-06, + "loss": 1.933, + "mean_token_accuracy": 0.5503330826759338, + "num_tokens": 5437147009.0, + "step": 10636 + }, + { + "epoch": 2.876419686316928, + "grad_norm": 1.289017677307129, + "learning_rate": 9.25568843185647e-06, + "loss": 1.7977, + "mean_token_accuracy": 0.5991390943527222, + "num_tokens": 5437646868.0, + "step": 10637 + }, + { + "epoch": 2.8766901027582477, + "grad_norm": 1.421432614326477, + "learning_rate": 9.254141861846309e-06, + "loss": 1.6427, + "mean_token_accuracy": 0.6306750774383545, + "num_tokens": 5438048849.0, + "step": 10638 + }, + { + "epoch": 2.8769605191995673, + "grad_norm": 1.2663946151733398, + "learning_rate": 9.25259534540421e-06, + "loss": 1.9159, + "mean_token_accuracy": 0.564555823802948, + "num_tokens": 5438539169.0, + "step": 10639 + }, + { + "epoch": 2.877230935640887, + "grad_norm": 1.1192816495895386, + "learning_rate": 9.251048882577617e-06, + "loss": 1.8138, + "mean_token_accuracy": 0.5779958963394165, + "num_tokens": 5439063415.0, + "step": 10640 + }, + { + "epoch": 2.8775013520822066, + "grad_norm": 0.4159426987171173, + "learning_rate": 9.249502473413996e-06, + "loss": 1.0333, + "mean_token_accuracy": 0.7304365634918213, + "num_tokens": 5439560747.0, + "step": 10641 + }, + { + "epoch": 2.8777717685235262, + "grad_norm": 1.694067120552063, + "learning_rate": 9.247956117960781e-06, + "loss": 2.0507, + "mean_token_accuracy": 0.5358039140701294, + "num_tokens": 5440084774.0, + "step": 10642 + }, + { + "epoch": 2.878042184964846, + "grad_norm": 1.4345526695251465, + "learning_rate": 9.246409816265427e-06, + "loss": 1.8659, + "mean_token_accuracy": 0.5736682415008545, + "num_tokens": 5440609048.0, + "step": 10643 + }, + { + "epoch": 2.8783126014061655, + "grad_norm": 1.0028676986694336, + "learning_rate": 9.244863568375378e-06, + "loss": 1.8559, + "mean_token_accuracy": 0.5653770565986633, + "num_tokens": 5441133212.0, + "step": 10644 + }, + { + "epoch": 2.878583017847485, + "grad_norm": 1.4859533309936523, + "learning_rate": 9.243317374338073e-06, + "loss": 1.7716, + "mean_token_accuracy": 0.5905173420906067, + "num_tokens": 5441657384.0, + "step": 10645 + }, + { + "epoch": 2.878853434288805, + "grad_norm": 1.5094584226608276, + "learning_rate": 9.24177123420096e-06, + "loss": 1.8888, + "mean_token_accuracy": 0.5817323327064514, + "num_tokens": 5442117263.0, + "step": 10646 + }, + { + "epoch": 2.8791238507301244, + "grad_norm": 1.287209153175354, + "learning_rate": 9.240225148011478e-06, + "loss": 1.8832, + "mean_token_accuracy": 0.565854549407959, + "num_tokens": 5442641465.0, + "step": 10647 + }, + { + "epoch": 2.879394267171444, + "grad_norm": 1.0843960046768188, + "learning_rate": 9.238679115817055e-06, + "loss": 1.8708, + "mean_token_accuracy": 0.5646056532859802, + "num_tokens": 5443165716.0, + "step": 10648 + }, + { + "epoch": 2.8796646836127637, + "grad_norm": 1.2794973850250244, + "learning_rate": 9.237133137665146e-06, + "loss": 1.8375, + "mean_token_accuracy": 0.59360671043396, + "num_tokens": 5443689895.0, + "step": 10649 + }, + { + "epoch": 2.8799351000540834, + "grad_norm": 1.1636271476745605, + "learning_rate": 9.235587213603172e-06, + "loss": 1.7611, + "mean_token_accuracy": 0.5774708390235901, + "num_tokens": 5444213789.0, + "step": 10650 + }, + { + "epoch": 2.880205516495403, + "grad_norm": 1.2422696352005005, + "learning_rate": 9.23404134367857e-06, + "loss": 1.959, + "mean_token_accuracy": 0.5564332008361816, + "num_tokens": 5444737958.0, + "step": 10651 + }, + { + "epoch": 2.8804759329367227, + "grad_norm": 1.332703948020935, + "learning_rate": 9.232495527938776e-06, + "loss": 2.0967, + "mean_token_accuracy": 0.5493756532669067, + "num_tokens": 5445262166.0, + "step": 10652 + }, + { + "epoch": 2.8807463493780423, + "grad_norm": 0.9068584442138672, + "learning_rate": 9.230949766431219e-06, + "loss": 1.7729, + "mean_token_accuracy": 0.5948951244354248, + "num_tokens": 5445786374.0, + "step": 10653 + }, + { + "epoch": 2.881016765819362, + "grad_norm": 1.0065338611602783, + "learning_rate": 9.22940405920332e-06, + "loss": 1.8342, + "mean_token_accuracy": 0.5711183547973633, + "num_tokens": 5446298397.0, + "step": 10654 + }, + { + "epoch": 2.8812871822606816, + "grad_norm": 1.3959951400756836, + "learning_rate": 9.227858406302515e-06, + "loss": 1.8842, + "mean_token_accuracy": 0.5656926035881042, + "num_tokens": 5446822676.0, + "step": 10655 + }, + { + "epoch": 2.8815575987020012, + "grad_norm": 1.0977269411087036, + "learning_rate": 9.226312807776224e-06, + "loss": 1.7487, + "mean_token_accuracy": 0.5950486660003662, + "num_tokens": 5447346915.0, + "step": 10656 + }, + { + "epoch": 2.881828015143321, + "grad_norm": 1.0235977172851562, + "learning_rate": 9.22476726367187e-06, + "loss": 1.8309, + "mean_token_accuracy": 0.565501868724823, + "num_tokens": 5447871177.0, + "step": 10657 + }, + { + "epoch": 2.88209843158464, + "grad_norm": 1.128577470779419, + "learning_rate": 9.223221774036882e-06, + "loss": 1.9268, + "mean_token_accuracy": 0.5525098443031311, + "num_tokens": 5448395296.0, + "step": 10658 + }, + { + "epoch": 2.88236884802596, + "grad_norm": 1.3158131837844849, + "learning_rate": 9.221676338918675e-06, + "loss": 1.9358, + "mean_token_accuracy": 0.5567857027053833, + "num_tokens": 5448919430.0, + "step": 10659 + }, + { + "epoch": 2.8826392644672794, + "grad_norm": 1.053581714630127, + "learning_rate": 9.220130958364661e-06, + "loss": 1.8457, + "mean_token_accuracy": 0.5644896030426025, + "num_tokens": 5449443700.0, + "step": 10660 + }, + { + "epoch": 2.8829096809085994, + "grad_norm": 0.4456234574317932, + "learning_rate": 9.21858563242227e-06, + "loss": 1.157, + "mean_token_accuracy": 0.6844234466552734, + "num_tokens": 5449936143.0, + "step": 10661 + }, + { + "epoch": 2.8831800973499186, + "grad_norm": 1.4805546998977661, + "learning_rate": 9.217040361138909e-06, + "loss": 1.8563, + "mean_token_accuracy": 0.5545417666435242, + "num_tokens": 5450460351.0, + "step": 10662 + }, + { + "epoch": 2.8834505137912387, + "grad_norm": 1.876710057258606, + "learning_rate": 9.215495144561991e-06, + "loss": 1.8605, + "mean_token_accuracy": 0.5796598196029663, + "num_tokens": 5450946137.0, + "step": 10663 + }, + { + "epoch": 2.883720930232558, + "grad_norm": 1.2580279111862183, + "learning_rate": 9.213949982738932e-06, + "loss": 1.9022, + "mean_token_accuracy": 0.5769943594932556, + "num_tokens": 5451470396.0, + "step": 10664 + }, + { + "epoch": 2.883991346673878, + "grad_norm": 1.2216241359710693, + "learning_rate": 9.212404875717143e-06, + "loss": 1.8053, + "mean_token_accuracy": 0.5717490315437317, + "num_tokens": 5451994549.0, + "step": 10665 + }, + { + "epoch": 2.884261763115197, + "grad_norm": 1.9766126871109009, + "learning_rate": 9.210859823544022e-06, + "loss": 1.7969, + "mean_token_accuracy": 0.6007338762283325, + "num_tokens": 5452420163.0, + "step": 10666 + }, + { + "epoch": 2.8845321795565173, + "grad_norm": 1.5168074369430542, + "learning_rate": 9.20931482626699e-06, + "loss": 1.8772, + "mean_token_accuracy": 0.5762431025505066, + "num_tokens": 5452908641.0, + "step": 10667 + }, + { + "epoch": 2.8848025959978365, + "grad_norm": 1.3308628797531128, + "learning_rate": 9.207769883933437e-06, + "loss": 1.9364, + "mean_token_accuracy": 0.5591697692871094, + "num_tokens": 5453432894.0, + "step": 10668 + }, + { + "epoch": 2.885073012439156, + "grad_norm": 1.2327595949172974, + "learning_rate": 9.206224996590782e-06, + "loss": 1.7799, + "mean_token_accuracy": 0.5948336720466614, + "num_tokens": 5453902880.0, + "step": 10669 + }, + { + "epoch": 2.885343428880476, + "grad_norm": 1.2025471925735474, + "learning_rate": 9.204680164286416e-06, + "loss": 1.8936, + "mean_token_accuracy": 0.5592011213302612, + "num_tokens": 5454426934.0, + "step": 10670 + }, + { + "epoch": 2.8856138453217954, + "grad_norm": 1.1077747344970703, + "learning_rate": 9.203135387067744e-06, + "loss": 1.9383, + "mean_token_accuracy": 0.5442823171615601, + "num_tokens": 5454914515.0, + "step": 10671 + }, + { + "epoch": 2.885884261763115, + "grad_norm": 1.452494740486145, + "learning_rate": 9.201590664982161e-06, + "loss": 1.8815, + "mean_token_accuracy": 0.5623189210891724, + "num_tokens": 5455438526.0, + "step": 10672 + }, + { + "epoch": 2.8861546782044347, + "grad_norm": 1.5413200855255127, + "learning_rate": 9.200045998077067e-06, + "loss": 1.7993, + "mean_token_accuracy": 0.6059464812278748, + "num_tokens": 5455962727.0, + "step": 10673 + }, + { + "epoch": 2.8864250946457544, + "grad_norm": 1.1945894956588745, + "learning_rate": 9.198501386399852e-06, + "loss": 1.9018, + "mean_token_accuracy": 0.5453653335571289, + "num_tokens": 5456478281.0, + "step": 10674 + }, + { + "epoch": 2.886695511087074, + "grad_norm": 1.4138383865356445, + "learning_rate": 9.196956829997917e-06, + "loss": 1.7573, + "mean_token_accuracy": 0.5692951679229736, + "num_tokens": 5457002456.0, + "step": 10675 + }, + { + "epoch": 2.8869659275283936, + "grad_norm": 1.2161293029785156, + "learning_rate": 9.195412328918648e-06, + "loss": 1.8819, + "mean_token_accuracy": 0.5633448362350464, + "num_tokens": 5457526518.0, + "step": 10676 + }, + { + "epoch": 2.8872363439697133, + "grad_norm": 1.1874662637710571, + "learning_rate": 9.193867883209431e-06, + "loss": 1.9034, + "mean_token_accuracy": 0.5678004026412964, + "num_tokens": 5457993367.0, + "step": 10677 + }, + { + "epoch": 2.887506760411033, + "grad_norm": 1.05953049659729, + "learning_rate": 9.19232349291766e-06, + "loss": 1.8576, + "mean_token_accuracy": 0.576359748840332, + "num_tokens": 5458517640.0, + "step": 10678 + }, + { + "epoch": 2.8877771768523526, + "grad_norm": 1.1794698238372803, + "learning_rate": 9.190779158090725e-06, + "loss": 1.8269, + "mean_token_accuracy": 0.5866109132766724, + "num_tokens": 5459041721.0, + "step": 10679 + }, + { + "epoch": 2.888047593293672, + "grad_norm": 1.1235551834106445, + "learning_rate": 9.189234878776001e-06, + "loss": 1.9128, + "mean_token_accuracy": 0.5665057897567749, + "num_tokens": 5459565917.0, + "step": 10680 + }, + { + "epoch": 2.888318009734992, + "grad_norm": 0.4502878189086914, + "learning_rate": 9.18769065502088e-06, + "loss": 1.1335, + "mean_token_accuracy": 0.6934018731117249, + "num_tokens": 5460090192.0, + "step": 10681 + }, + { + "epoch": 2.8885884261763115, + "grad_norm": 1.3864575624465942, + "learning_rate": 9.186146486872738e-06, + "loss": 1.909, + "mean_token_accuracy": 0.5627418756484985, + "num_tokens": 5460614379.0, + "step": 10682 + }, + { + "epoch": 2.888858842617631, + "grad_norm": 1.3524154424667358, + "learning_rate": 9.184602374378955e-06, + "loss": 1.8044, + "mean_token_accuracy": 0.5748999714851379, + "num_tokens": 5461068999.0, + "step": 10683 + }, + { + "epoch": 2.889129259058951, + "grad_norm": 1.0967096090316772, + "learning_rate": 9.18305831758691e-06, + "loss": 1.8678, + "mean_token_accuracy": 0.5616060495376587, + "num_tokens": 5461593108.0, + "step": 10684 + }, + { + "epoch": 2.8893996755002704, + "grad_norm": 1.053489327430725, + "learning_rate": 9.18151431654398e-06, + "loss": 1.814, + "mean_token_accuracy": 0.5920891761779785, + "num_tokens": 5462104178.0, + "step": 10685 + }, + { + "epoch": 2.88967009194159, + "grad_norm": 1.104204773902893, + "learning_rate": 9.179970371297536e-06, + "loss": 1.9517, + "mean_token_accuracy": 0.5594562292098999, + "num_tokens": 5462605288.0, + "step": 10686 + }, + { + "epoch": 2.8899405083829097, + "grad_norm": 1.0810168981552124, + "learning_rate": 9.178426481894955e-06, + "loss": 1.9362, + "mean_token_accuracy": 0.5627665519714355, + "num_tokens": 5463129387.0, + "step": 10687 + }, + { + "epoch": 2.8902109248242294, + "grad_norm": 1.0867164134979248, + "learning_rate": 9.17688264838361e-06, + "loss": 1.8513, + "mean_token_accuracy": 0.5849399566650391, + "num_tokens": 5463653653.0, + "step": 10688 + }, + { + "epoch": 2.890481341265549, + "grad_norm": 1.2080729007720947, + "learning_rate": 9.175338870810862e-06, + "loss": 1.8696, + "mean_token_accuracy": 0.5811089873313904, + "num_tokens": 5464177838.0, + "step": 10689 + }, + { + "epoch": 2.8907517577068687, + "grad_norm": 0.9790859818458557, + "learning_rate": 9.173795149224085e-06, + "loss": 1.8323, + "mean_token_accuracy": 0.5748326778411865, + "num_tokens": 5464702011.0, + "step": 10690 + }, + { + "epoch": 2.8910221741481883, + "grad_norm": 1.0893844366073608, + "learning_rate": 9.172251483670642e-06, + "loss": 1.8255, + "mean_token_accuracy": 0.5779721736907959, + "num_tokens": 5465226144.0, + "step": 10691 + }, + { + "epoch": 2.891292590589508, + "grad_norm": 1.282578468322754, + "learning_rate": 9.1707078741979e-06, + "loss": 1.9685, + "mean_token_accuracy": 0.5387805104255676, + "num_tokens": 5465695818.0, + "step": 10692 + }, + { + "epoch": 2.8915630070308276, + "grad_norm": 0.9791666269302368, + "learning_rate": 9.169164320853221e-06, + "loss": 1.782, + "mean_token_accuracy": 0.5859429836273193, + "num_tokens": 5466220069.0, + "step": 10693 + }, + { + "epoch": 2.8918334234721472, + "grad_norm": 0.9068737626075745, + "learning_rate": 9.16762082368396e-06, + "loss": 1.7933, + "mean_token_accuracy": 0.5674504041671753, + "num_tokens": 5466706351.0, + "step": 10694 + }, + { + "epoch": 2.892103839913467, + "grad_norm": 0.9200412034988403, + "learning_rate": 9.166077382737486e-06, + "loss": 1.8214, + "mean_token_accuracy": 0.571984589099884, + "num_tokens": 5467230492.0, + "step": 10695 + }, + { + "epoch": 2.8923742563547865, + "grad_norm": 1.3617361783981323, + "learning_rate": 9.164533998061152e-06, + "loss": 1.9239, + "mean_token_accuracy": 0.5496260523796082, + "num_tokens": 5467698643.0, + "step": 10696 + }, + { + "epoch": 2.892644672796106, + "grad_norm": 0.9597946405410767, + "learning_rate": 9.162990669702308e-06, + "loss": 1.7299, + "mean_token_accuracy": 0.5965721607208252, + "num_tokens": 5468186822.0, + "step": 10697 + }, + { + "epoch": 2.892915089237426, + "grad_norm": 1.0217769145965576, + "learning_rate": 9.161447397708317e-06, + "loss": 1.957, + "mean_token_accuracy": 0.5649572610855103, + "num_tokens": 5468666494.0, + "step": 10698 + }, + { + "epoch": 2.893185505678745, + "grad_norm": 1.009819746017456, + "learning_rate": 9.159904182126525e-06, + "loss": 1.8611, + "mean_token_accuracy": 0.5686108469963074, + "num_tokens": 5469190735.0, + "step": 10699 + }, + { + "epoch": 2.893455922120065, + "grad_norm": 1.0502901077270508, + "learning_rate": 9.15836102300428e-06, + "loss": 1.8425, + "mean_token_accuracy": 0.6059209108352661, + "num_tokens": 5469650957.0, + "step": 10700 + }, + { + "epoch": 2.8937263385613843, + "grad_norm": 0.42597100138664246, + "learning_rate": 9.156817920388938e-06, + "loss": 1.048, + "mean_token_accuracy": 0.712256908416748, + "num_tokens": 5470175114.0, + "step": 10701 + }, + { + "epoch": 2.8939967550027044, + "grad_norm": 1.7103697061538696, + "learning_rate": 9.155274874327846e-06, + "loss": 1.8798, + "mean_token_accuracy": 0.5677216053009033, + "num_tokens": 5470699303.0, + "step": 10702 + }, + { + "epoch": 2.8942671714440236, + "grad_norm": 1.4106265306472778, + "learning_rate": 9.15373188486834e-06, + "loss": 1.9202, + "mean_token_accuracy": 0.5751062035560608, + "num_tokens": 5471223483.0, + "step": 10703 + }, + { + "epoch": 2.8945375878853437, + "grad_norm": 0.9871239066123962, + "learning_rate": 9.152188952057774e-06, + "loss": 1.9301, + "mean_token_accuracy": 0.5639677047729492, + "num_tokens": 5471747671.0, + "step": 10704 + }, + { + "epoch": 2.894808004326663, + "grad_norm": 1.2687122821807861, + "learning_rate": 9.15064607594348e-06, + "loss": 1.8405, + "mean_token_accuracy": 0.5775833129882812, + "num_tokens": 5472271774.0, + "step": 10705 + }, + { + "epoch": 2.895078420767983, + "grad_norm": 1.1653763055801392, + "learning_rate": 9.149103256572809e-06, + "loss": 1.9231, + "mean_token_accuracy": 0.5516998767852783, + "num_tokens": 5472796054.0, + "step": 10706 + }, + { + "epoch": 2.895348837209302, + "grad_norm": 1.1073846817016602, + "learning_rate": 9.147560493993089e-06, + "loss": 1.9822, + "mean_token_accuracy": 0.5634475350379944, + "num_tokens": 5473197902.0, + "step": 10707 + }, + { + "epoch": 2.8956192536506222, + "grad_norm": 1.1787846088409424, + "learning_rate": 9.146017788251663e-06, + "loss": 1.7491, + "mean_token_accuracy": 0.5802757143974304, + "num_tokens": 5473722013.0, + "step": 10708 + }, + { + "epoch": 2.8958896700919414, + "grad_norm": 1.238464117050171, + "learning_rate": 9.144475139395862e-06, + "loss": 1.5678, + "mean_token_accuracy": 0.6309366226196289, + "num_tokens": 5474246245.0, + "step": 10709 + }, + { + "epoch": 2.896160086533261, + "grad_norm": 1.3092414140701294, + "learning_rate": 9.142932547473023e-06, + "loss": 1.8123, + "mean_token_accuracy": 0.5609842538833618, + "num_tokens": 5474770481.0, + "step": 10710 + }, + { + "epoch": 2.8964305029745807, + "grad_norm": 1.265297770500183, + "learning_rate": 9.141390012530471e-06, + "loss": 1.8606, + "mean_token_accuracy": 0.5757632255554199, + "num_tokens": 5475294740.0, + "step": 10711 + }, + { + "epoch": 2.8967009194159004, + "grad_norm": 1.2878186702728271, + "learning_rate": 9.13984753461554e-06, + "loss": 1.9213, + "mean_token_accuracy": 0.5664986371994019, + "num_tokens": 5475819018.0, + "step": 10712 + }, + { + "epoch": 2.89697133585722, + "grad_norm": 1.2267529964447021, + "learning_rate": 9.138305113775558e-06, + "loss": 1.7246, + "mean_token_accuracy": 0.5962974429130554, + "num_tokens": 5476343206.0, + "step": 10713 + }, + { + "epoch": 2.8972417522985396, + "grad_norm": 1.0570844411849976, + "learning_rate": 9.136762750057847e-06, + "loss": 1.9038, + "mean_token_accuracy": 0.5601192116737366, + "num_tokens": 5476867445.0, + "step": 10714 + }, + { + "epoch": 2.8975121687398593, + "grad_norm": 1.1551228761672974, + "learning_rate": 9.135220443509738e-06, + "loss": 1.8305, + "mean_token_accuracy": 0.5860787630081177, + "num_tokens": 5477391714.0, + "step": 10715 + }, + { + "epoch": 2.897782585181179, + "grad_norm": 1.0394495725631714, + "learning_rate": 9.133678194178551e-06, + "loss": 1.835, + "mean_token_accuracy": 0.5803049206733704, + "num_tokens": 5477903251.0, + "step": 10716 + }, + { + "epoch": 2.8980530016224986, + "grad_norm": 1.0057140588760376, + "learning_rate": 9.1321360021116e-06, + "loss": 1.818, + "mean_token_accuracy": 0.5768738985061646, + "num_tokens": 5478427506.0, + "step": 10717 + }, + { + "epoch": 2.898323418063818, + "grad_norm": 1.254220962524414, + "learning_rate": 9.130593867356216e-06, + "loss": 1.8173, + "mean_token_accuracy": 0.574167013168335, + "num_tokens": 5478951783.0, + "step": 10718 + }, + { + "epoch": 2.898593834505138, + "grad_norm": 1.1695473194122314, + "learning_rate": 9.12905178995971e-06, + "loss": 1.9304, + "mean_token_accuracy": 0.577975869178772, + "num_tokens": 5479433487.0, + "step": 10719 + }, + { + "epoch": 2.8988642509464575, + "grad_norm": 1.1597050428390503, + "learning_rate": 9.12750976996939e-06, + "loss": 1.9141, + "mean_token_accuracy": 0.5823246240615845, + "num_tokens": 5479896144.0, + "step": 10720 + }, + { + "epoch": 2.899134667387777, + "grad_norm": 0.43288037180900574, + "learning_rate": 9.125967807432578e-06, + "loss": 1.1347, + "mean_token_accuracy": 0.7016774415969849, + "num_tokens": 5480420416.0, + "step": 10721 + }, + { + "epoch": 2.899405083829097, + "grad_norm": 1.3690080642700195, + "learning_rate": 9.12442590239659e-06, + "loss": 1.8305, + "mean_token_accuracy": 0.5960688591003418, + "num_tokens": 5480936092.0, + "step": 10722 + }, + { + "epoch": 2.8996755002704164, + "grad_norm": 1.4236363172531128, + "learning_rate": 9.122884054908728e-06, + "loss": 1.7985, + "mean_token_accuracy": 0.5574501752853394, + "num_tokens": 5481460299.0, + "step": 10723 + }, + { + "epoch": 2.899945916711736, + "grad_norm": 1.156126618385315, + "learning_rate": 9.121342265016304e-06, + "loss": 1.9602, + "mean_token_accuracy": 0.5427322387695312, + "num_tokens": 5481984531.0, + "step": 10724 + }, + { + "epoch": 2.9002163331530557, + "grad_norm": 1.4213777780532837, + "learning_rate": 9.119800532766624e-06, + "loss": 1.9089, + "mean_token_accuracy": 0.5620667338371277, + "num_tokens": 5482460345.0, + "step": 10725 + }, + { + "epoch": 2.9004867495943754, + "grad_norm": 1.6785235404968262, + "learning_rate": 9.118258858206991e-06, + "loss": 1.9917, + "mean_token_accuracy": 0.5479729175567627, + "num_tokens": 5482984516.0, + "step": 10726 + }, + { + "epoch": 2.900757166035695, + "grad_norm": 1.0862691402435303, + "learning_rate": 9.116717241384714e-06, + "loss": 1.9209, + "mean_token_accuracy": 0.5699178576469421, + "num_tokens": 5483457792.0, + "step": 10727 + }, + { + "epoch": 2.9010275824770146, + "grad_norm": 1.2238637208938599, + "learning_rate": 9.115175682347088e-06, + "loss": 1.892, + "mean_token_accuracy": 0.5441662669181824, + "num_tokens": 5483981999.0, + "step": 10728 + }, + { + "epoch": 2.9012979989183343, + "grad_norm": 1.1963160037994385, + "learning_rate": 9.113634181141416e-06, + "loss": 1.8287, + "mean_token_accuracy": 0.5853681564331055, + "num_tokens": 5484449248.0, + "step": 10729 + }, + { + "epoch": 2.901568415359654, + "grad_norm": 1.1085773706436157, + "learning_rate": 9.112092737814995e-06, + "loss": 1.8673, + "mean_token_accuracy": 0.5733448266983032, + "num_tokens": 5484973502.0, + "step": 10730 + }, + { + "epoch": 2.9018388318009736, + "grad_norm": 1.0185717344284058, + "learning_rate": 9.11055135241512e-06, + "loss": 1.8735, + "mean_token_accuracy": 0.5542253255844116, + "num_tokens": 5485497615.0, + "step": 10731 + }, + { + "epoch": 2.902109248242293, + "grad_norm": 1.0313578844070435, + "learning_rate": 9.109010024989084e-06, + "loss": 1.8937, + "mean_token_accuracy": 0.5595418214797974, + "num_tokens": 5486021864.0, + "step": 10732 + }, + { + "epoch": 2.902379664683613, + "grad_norm": 1.1108318567276, + "learning_rate": 9.107468755584183e-06, + "loss": 1.8287, + "mean_token_accuracy": 0.5839575529098511, + "num_tokens": 5486517147.0, + "step": 10733 + }, + { + "epoch": 2.9026500811249325, + "grad_norm": 1.1951583623886108, + "learning_rate": 9.105927544247707e-06, + "loss": 1.8745, + "mean_token_accuracy": 0.5714678764343262, + "num_tokens": 5486984511.0, + "step": 10734 + }, + { + "epoch": 2.902920497566252, + "grad_norm": 1.1218934059143066, + "learning_rate": 9.104386391026943e-06, + "loss": 1.9569, + "mean_token_accuracy": 0.5375214219093323, + "num_tokens": 5487508683.0, + "step": 10735 + }, + { + "epoch": 2.903190914007572, + "grad_norm": 0.8330931663513184, + "learning_rate": 9.10284529596918e-06, + "loss": 1.6966, + "mean_token_accuracy": 0.6161864399909973, + "num_tokens": 5488032829.0, + "step": 10736 + }, + { + "epoch": 2.9034613304488914, + "grad_norm": 1.1394387483596802, + "learning_rate": 9.101304259121702e-06, + "loss": 1.8722, + "mean_token_accuracy": 0.5596994757652283, + "num_tokens": 5488557114.0, + "step": 10737 + }, + { + "epoch": 2.903731746890211, + "grad_norm": 1.2315384149551392, + "learning_rate": 9.099763280531792e-06, + "loss": 1.9069, + "mean_token_accuracy": 0.5704790353775024, + "num_tokens": 5489045993.0, + "step": 10738 + }, + { + "epoch": 2.9040021633315307, + "grad_norm": 1.0122424364089966, + "learning_rate": 9.098222360246738e-06, + "loss": 1.8172, + "mean_token_accuracy": 0.5703492164611816, + "num_tokens": 5489570260.0, + "step": 10739 + }, + { + "epoch": 2.90427257977285, + "grad_norm": 1.0927420854568481, + "learning_rate": 9.096681498313808e-06, + "loss": 1.9253, + "mean_token_accuracy": 0.5740158557891846, + "num_tokens": 5490094531.0, + "step": 10740 + }, + { + "epoch": 2.90454299621417, + "grad_norm": 0.4226425886154175, + "learning_rate": 9.095140694780291e-06, + "loss": 1.075, + "mean_token_accuracy": 0.7165288329124451, + "num_tokens": 5490618799.0, + "step": 10741 + }, + { + "epoch": 2.904813412655489, + "grad_norm": 1.402703881263733, + "learning_rate": 9.093599949693457e-06, + "loss": 1.7918, + "mean_token_accuracy": 0.5907502770423889, + "num_tokens": 5491083680.0, + "step": 10742 + }, + { + "epoch": 2.9050838290968093, + "grad_norm": 1.0294609069824219, + "learning_rate": 9.092059263100583e-06, + "loss": 2.007, + "mean_token_accuracy": 0.550966203212738, + "num_tokens": 5491553939.0, + "step": 10743 + }, + { + "epoch": 2.9053542455381285, + "grad_norm": 1.0155467987060547, + "learning_rate": 9.090518635048944e-06, + "loss": 1.9592, + "mean_token_accuracy": 0.5542714595794678, + "num_tokens": 5492045022.0, + "step": 10744 + }, + { + "epoch": 2.9056246619794486, + "grad_norm": 0.9297847151756287, + "learning_rate": 9.088978065585809e-06, + "loss": 1.8246, + "mean_token_accuracy": 0.5736604928970337, + "num_tokens": 5492569290.0, + "step": 10745 + }, + { + "epoch": 2.9058950784207678, + "grad_norm": 1.0431309938430786, + "learning_rate": 9.087437554758442e-06, + "loss": 1.8349, + "mean_token_accuracy": 0.586254894733429, + "num_tokens": 5493058722.0, + "step": 10746 + }, + { + "epoch": 2.906165494862088, + "grad_norm": 0.9149081707000732, + "learning_rate": 9.08589710261412e-06, + "loss": 1.8114, + "mean_token_accuracy": 0.5794820785522461, + "num_tokens": 5493582935.0, + "step": 10747 + }, + { + "epoch": 2.906435911303407, + "grad_norm": 1.03159761428833, + "learning_rate": 9.084356709200102e-06, + "loss": 1.9271, + "mean_token_accuracy": 0.5619263648986816, + "num_tokens": 5494107158.0, + "step": 10748 + }, + { + "epoch": 2.906706327744727, + "grad_norm": 2.8366506099700928, + "learning_rate": 9.082816374563654e-06, + "loss": 1.6905, + "mean_token_accuracy": 0.6135962009429932, + "num_tokens": 5494631291.0, + "step": 10749 + }, + { + "epoch": 2.9069767441860463, + "grad_norm": 1.1929596662521362, + "learning_rate": 9.081276098752039e-06, + "loss": 1.9641, + "mean_token_accuracy": 0.557621955871582, + "num_tokens": 5495155505.0, + "step": 10750 + }, + { + "epoch": 2.907247160627366, + "grad_norm": 1.06144380569458, + "learning_rate": 9.079735881812518e-06, + "loss": 1.829, + "mean_token_accuracy": 0.5798241496086121, + "num_tokens": 5495679649.0, + "step": 10751 + }, + { + "epoch": 2.9075175770686856, + "grad_norm": 1.1776213645935059, + "learning_rate": 9.078195723792339e-06, + "loss": 1.9103, + "mean_token_accuracy": 0.5612350702285767, + "num_tokens": 5496203926.0, + "step": 10752 + }, + { + "epoch": 2.9077879935100053, + "grad_norm": 1.0359067916870117, + "learning_rate": 9.076655624738773e-06, + "loss": 1.8408, + "mean_token_accuracy": 0.5864895582199097, + "num_tokens": 5496684912.0, + "step": 10753 + }, + { + "epoch": 2.908058409951325, + "grad_norm": 0.9556357860565186, + "learning_rate": 9.07511558469907e-06, + "loss": 1.9119, + "mean_token_accuracy": 0.5630900263786316, + "num_tokens": 5497209108.0, + "step": 10754 + }, + { + "epoch": 2.9083288263926446, + "grad_norm": 1.089902400970459, + "learning_rate": 9.073575603720477e-06, + "loss": 1.9903, + "mean_token_accuracy": 0.5254036784172058, + "num_tokens": 5497733300.0, + "step": 10755 + }, + { + "epoch": 2.908599242833964, + "grad_norm": 0.8241729140281677, + "learning_rate": 9.072035681850252e-06, + "loss": 1.7639, + "mean_token_accuracy": 0.5891525149345398, + "num_tokens": 5498228911.0, + "step": 10756 + }, + { + "epoch": 2.908869659275284, + "grad_norm": 0.9736574292182922, + "learning_rate": 9.070495819135643e-06, + "loss": 1.9209, + "mean_token_accuracy": 0.5582054853439331, + "num_tokens": 5498753105.0, + "step": 10757 + }, + { + "epoch": 2.9091400757166035, + "grad_norm": 0.9534607529640198, + "learning_rate": 9.068956015623892e-06, + "loss": 1.9577, + "mean_token_accuracy": 0.564139723777771, + "num_tokens": 5499277383.0, + "step": 10758 + }, + { + "epoch": 2.909410492157923, + "grad_norm": 0.7862074375152588, + "learning_rate": 9.067416271362256e-06, + "loss": 1.8718, + "mean_token_accuracy": 0.5648333430290222, + "num_tokens": 5499801543.0, + "step": 10759 + }, + { + "epoch": 2.9096809085992428, + "grad_norm": 1.0034856796264648, + "learning_rate": 9.065876586397965e-06, + "loss": 1.8885, + "mean_token_accuracy": 0.5744220018386841, + "num_tokens": 5500321401.0, + "step": 10760 + }, + { + "epoch": 2.9099513250405624, + "grad_norm": 0.5010537505149841, + "learning_rate": 9.064336960778273e-06, + "loss": 1.1333, + "mean_token_accuracy": 0.7134110927581787, + "num_tokens": 5500770350.0, + "step": 10761 + }, + { + "epoch": 2.910221741481882, + "grad_norm": 1.0372998714447021, + "learning_rate": 9.062797394550416e-06, + "loss": 1.8773, + "mean_token_accuracy": 0.5581052303314209, + "num_tokens": 5501294624.0, + "step": 10762 + }, + { + "epoch": 2.9104921579232017, + "grad_norm": 1.042697548866272, + "learning_rate": 9.06125788776163e-06, + "loss": 1.8376, + "mean_token_accuracy": 0.575355589389801, + "num_tokens": 5501818881.0, + "step": 10763 + }, + { + "epoch": 2.9107625743645213, + "grad_norm": 1.1200144290924072, + "learning_rate": 9.059718440459154e-06, + "loss": 1.8884, + "mean_token_accuracy": 0.5706309080123901, + "num_tokens": 5502343138.0, + "step": 10764 + }, + { + "epoch": 2.911032990805841, + "grad_norm": 1.0861674547195435, + "learning_rate": 9.058179052690223e-06, + "loss": 1.9229, + "mean_token_accuracy": 0.5720810890197754, + "num_tokens": 5502808242.0, + "step": 10765 + }, + { + "epoch": 2.9113034072471606, + "grad_norm": 1.0835094451904297, + "learning_rate": 9.056639724502065e-06, + "loss": 1.834, + "mean_token_accuracy": 0.5692633390426636, + "num_tokens": 5503332409.0, + "step": 10766 + }, + { + "epoch": 2.9115738236884803, + "grad_norm": 1.0000709295272827, + "learning_rate": 9.05510045594192e-06, + "loss": 1.8801, + "mean_token_accuracy": 0.5588282346725464, + "num_tokens": 5503856596.0, + "step": 10767 + }, + { + "epoch": 2.9118442401298, + "grad_norm": 1.0698802471160889, + "learning_rate": 9.053561247057014e-06, + "loss": 1.9639, + "mean_token_accuracy": 0.5346055030822754, + "num_tokens": 5504380859.0, + "step": 10768 + }, + { + "epoch": 2.9121146565711196, + "grad_norm": 0.9749566316604614, + "learning_rate": 9.052022097894568e-06, + "loss": 1.6994, + "mean_token_accuracy": 0.5828226804733276, + "num_tokens": 5504905042.0, + "step": 10769 + }, + { + "epoch": 2.912385073012439, + "grad_norm": 1.0009052753448486, + "learning_rate": 9.050483008501816e-06, + "loss": 1.8128, + "mean_token_accuracy": 0.5723520517349243, + "num_tokens": 5505429319.0, + "step": 10770 + }, + { + "epoch": 2.912655489453759, + "grad_norm": 1.0183806419372559, + "learning_rate": 9.048943978925977e-06, + "loss": 1.8375, + "mean_token_accuracy": 0.5835778117179871, + "num_tokens": 5505953518.0, + "step": 10771 + }, + { + "epoch": 2.9129259058950785, + "grad_norm": 1.2159919738769531, + "learning_rate": 9.047405009214271e-06, + "loss": 1.9363, + "mean_token_accuracy": 0.5617581009864807, + "num_tokens": 5506477698.0, + "step": 10772 + }, + { + "epoch": 2.913196322336398, + "grad_norm": 1.0466735363006592, + "learning_rate": 9.045866099413929e-06, + "loss": 1.901, + "mean_token_accuracy": 0.5736991167068481, + "num_tokens": 5506974499.0, + "step": 10773 + }, + { + "epoch": 2.913466738777718, + "grad_norm": 1.0155411958694458, + "learning_rate": 9.04432724957216e-06, + "loss": 1.8527, + "mean_token_accuracy": 0.5659199953079224, + "num_tokens": 5507498781.0, + "step": 10774 + }, + { + "epoch": 2.9137371552190374, + "grad_norm": 1.4593827724456787, + "learning_rate": 9.042788459736178e-06, + "loss": 1.9911, + "mean_token_accuracy": 0.553851842880249, + "num_tokens": 5508022976.0, + "step": 10775 + }, + { + "epoch": 2.914007571660357, + "grad_norm": 1.0628092288970947, + "learning_rate": 9.041249729953205e-06, + "loss": 1.7887, + "mean_token_accuracy": 0.5651099681854248, + "num_tokens": 5508540710.0, + "step": 10776 + }, + { + "epoch": 2.9142779881016767, + "grad_norm": 1.0659767389297485, + "learning_rate": 9.03971106027045e-06, + "loss": 1.9009, + "mean_token_accuracy": 0.5552073121070862, + "num_tokens": 5509064964.0, + "step": 10777 + }, + { + "epoch": 2.9145484045429964, + "grad_norm": 1.1343435049057007, + "learning_rate": 9.038172450735121e-06, + "loss": 1.968, + "mean_token_accuracy": 0.5457272529602051, + "num_tokens": 5509589169.0, + "step": 10778 + }, + { + "epoch": 2.914818820984316, + "grad_norm": 1.203019142150879, + "learning_rate": 9.036633901394435e-06, + "loss": 1.8048, + "mean_token_accuracy": 0.5879577398300171, + "num_tokens": 5510113415.0, + "step": 10779 + }, + { + "epoch": 2.9150892374256356, + "grad_norm": 1.1912871599197388, + "learning_rate": 9.035095412295595e-06, + "loss": 1.9331, + "mean_token_accuracy": 0.5511824488639832, + "num_tokens": 5510617319.0, + "step": 10780 + }, + { + "epoch": 2.915359653866955, + "grad_norm": 0.415441632270813, + "learning_rate": 9.033556983485804e-06, + "loss": 1.1088, + "mean_token_accuracy": 0.6959936022758484, + "num_tokens": 5511141576.0, + "step": 10781 + }, + { + "epoch": 2.915630070308275, + "grad_norm": 1.3786531686782837, + "learning_rate": 9.03201861501227e-06, + "loss": 1.8888, + "mean_token_accuracy": 0.5749056339263916, + "num_tokens": 5511665856.0, + "step": 10782 + }, + { + "epoch": 2.915900486749594, + "grad_norm": 1.2737826108932495, + "learning_rate": 9.03048030692219e-06, + "loss": 2.0342, + "mean_token_accuracy": 0.5479480028152466, + "num_tokens": 5512190124.0, + "step": 10783 + }, + { + "epoch": 2.916170903190914, + "grad_norm": 0.9793291091918945, + "learning_rate": 9.028942059262766e-06, + "loss": 1.9041, + "mean_token_accuracy": 0.5566682815551758, + "num_tokens": 5512714391.0, + "step": 10784 + }, + { + "epoch": 2.9164413196322334, + "grad_norm": 1.106017827987671, + "learning_rate": 9.027403872081198e-06, + "loss": 1.78, + "mean_token_accuracy": 0.5710060596466064, + "num_tokens": 5513208841.0, + "step": 10785 + }, + { + "epoch": 2.9167117360735535, + "grad_norm": 1.3610552549362183, + "learning_rate": 9.025865745424676e-06, + "loss": 1.8754, + "mean_token_accuracy": 0.5725916624069214, + "num_tokens": 5513733075.0, + "step": 10786 + }, + { + "epoch": 2.9169821525148727, + "grad_norm": 1.066149353981018, + "learning_rate": 9.024327679340403e-06, + "loss": 1.8862, + "mean_token_accuracy": 0.58138108253479, + "num_tokens": 5514257323.0, + "step": 10787 + }, + { + "epoch": 2.917252568956193, + "grad_norm": 1.0167715549468994, + "learning_rate": 9.022789673875566e-06, + "loss": 1.8923, + "mean_token_accuracy": 0.5683914422988892, + "num_tokens": 5514781529.0, + "step": 10788 + }, + { + "epoch": 2.917522985397512, + "grad_norm": 0.9978624582290649, + "learning_rate": 9.021251729077355e-06, + "loss": 1.8881, + "mean_token_accuracy": 0.5709375143051147, + "num_tokens": 5515305624.0, + "step": 10789 + }, + { + "epoch": 2.917793401838832, + "grad_norm": 1.169392704963684, + "learning_rate": 9.019713844992963e-06, + "loss": 2.0274, + "mean_token_accuracy": 0.5518192052841187, + "num_tokens": 5515829893.0, + "step": 10790 + }, + { + "epoch": 2.9180638182801513, + "grad_norm": 0.9914742708206177, + "learning_rate": 9.018176021669572e-06, + "loss": 1.8237, + "mean_token_accuracy": 0.5735116600990295, + "num_tokens": 5516311938.0, + "step": 10791 + }, + { + "epoch": 2.918334234721471, + "grad_norm": 1.1078680753707886, + "learning_rate": 9.016638259154369e-06, + "loss": 1.8744, + "mean_token_accuracy": 0.5798225998878479, + "num_tokens": 5516836135.0, + "step": 10792 + }, + { + "epoch": 2.9186046511627906, + "grad_norm": 1.0648798942565918, + "learning_rate": 9.015100557494536e-06, + "loss": 1.8226, + "mean_token_accuracy": 0.5898723602294922, + "num_tokens": 5517360411.0, + "step": 10793 + }, + { + "epoch": 2.91887506760411, + "grad_norm": 1.0931382179260254, + "learning_rate": 9.013562916737258e-06, + "loss": 1.8772, + "mean_token_accuracy": 0.5858681201934814, + "num_tokens": 5517854590.0, + "step": 10794 + }, + { + "epoch": 2.91914548404543, + "grad_norm": 1.0628832578659058, + "learning_rate": 9.012025336929706e-06, + "loss": 1.8886, + "mean_token_accuracy": 0.5770174264907837, + "num_tokens": 5518341504.0, + "step": 10795 + }, + { + "epoch": 2.9194159004867495, + "grad_norm": 0.9998226761817932, + "learning_rate": 9.01048781811907e-06, + "loss": 1.7654, + "mean_token_accuracy": 0.5705610513687134, + "num_tokens": 5518810959.0, + "step": 10796 + }, + { + "epoch": 2.919686316928069, + "grad_norm": 1.1519533395767212, + "learning_rate": 9.008950360352513e-06, + "loss": 1.937, + "mean_token_accuracy": 0.5635144114494324, + "num_tokens": 5519335227.0, + "step": 10797 + }, + { + "epoch": 2.9199567333693888, + "grad_norm": 1.1373826265335083, + "learning_rate": 9.007412963677216e-06, + "loss": 1.9202, + "mean_token_accuracy": 0.5673072338104248, + "num_tokens": 5519831330.0, + "step": 10798 + }, + { + "epoch": 2.9202271498107084, + "grad_norm": 1.134783148765564, + "learning_rate": 9.005875628140348e-06, + "loss": 1.8864, + "mean_token_accuracy": 0.5624197721481323, + "num_tokens": 5520355404.0, + "step": 10799 + }, + { + "epoch": 2.920497566252028, + "grad_norm": 1.086830973625183, + "learning_rate": 9.004338353789084e-06, + "loss": 1.8928, + "mean_token_accuracy": 0.5811610221862793, + "num_tokens": 5520816086.0, + "step": 10800 + }, + { + "epoch": 2.9207679826933477, + "grad_norm": 0.429167777299881, + "learning_rate": 9.002801140670582e-06, + "loss": 1.1253, + "mean_token_accuracy": 0.6988105773925781, + "num_tokens": 5521340354.0, + "step": 10801 + }, + { + "epoch": 2.9210383991346673, + "grad_norm": 1.4371598958969116, + "learning_rate": 9.00126398883202e-06, + "loss": 1.8235, + "mean_token_accuracy": 0.6152427792549133, + "num_tokens": 5521712971.0, + "step": 10802 + }, + { + "epoch": 2.921308815575987, + "grad_norm": 1.3951376676559448, + "learning_rate": 8.999726898320556e-06, + "loss": 1.7277, + "mean_token_accuracy": 0.5963109135627747, + "num_tokens": 5522237221.0, + "step": 10803 + }, + { + "epoch": 2.9215792320173066, + "grad_norm": 1.331644058227539, + "learning_rate": 8.99818986918335e-06, + "loss": 1.9049, + "mean_token_accuracy": 0.5420941114425659, + "num_tokens": 5522761423.0, + "step": 10804 + }, + { + "epoch": 2.9218496484586263, + "grad_norm": 1.1524779796600342, + "learning_rate": 8.996652901467565e-06, + "loss": 1.9173, + "mean_token_accuracy": 0.5626729726791382, + "num_tokens": 5523285648.0, + "step": 10805 + }, + { + "epoch": 2.922120064899946, + "grad_norm": 1.3590606451034546, + "learning_rate": 8.995115995220364e-06, + "loss": 1.8412, + "mean_token_accuracy": 0.5760031342506409, + "num_tokens": 5523809822.0, + "step": 10806 + }, + { + "epoch": 2.9223904813412656, + "grad_norm": 1.2396727800369263, + "learning_rate": 8.993579150488899e-06, + "loss": 1.7839, + "mean_token_accuracy": 0.5734001398086548, + "num_tokens": 5524333889.0, + "step": 10807 + }, + { + "epoch": 2.922660897782585, + "grad_norm": 1.0741291046142578, + "learning_rate": 8.992042367320326e-06, + "loss": 1.8543, + "mean_token_accuracy": 0.5774658918380737, + "num_tokens": 5524858164.0, + "step": 10808 + }, + { + "epoch": 2.922931314223905, + "grad_norm": 1.1267279386520386, + "learning_rate": 8.990505645761798e-06, + "loss": 1.8788, + "mean_token_accuracy": 0.5563136339187622, + "num_tokens": 5525382323.0, + "step": 10809 + }, + { + "epoch": 2.9232017306652245, + "grad_norm": 1.1922954320907593, + "learning_rate": 8.988968985860467e-06, + "loss": 1.9219, + "mean_token_accuracy": 0.5572068691253662, + "num_tokens": 5525846496.0, + "step": 10810 + }, + { + "epoch": 2.923472147106544, + "grad_norm": 1.0766867399215698, + "learning_rate": 8.987432387663482e-06, + "loss": 1.7851, + "mean_token_accuracy": 0.5763026475906372, + "num_tokens": 5526369639.0, + "step": 10811 + }, + { + "epoch": 2.9237425635478638, + "grad_norm": 0.9655267596244812, + "learning_rate": 8.985895851217987e-06, + "loss": 1.9248, + "mean_token_accuracy": 0.553844690322876, + "num_tokens": 5526893843.0, + "step": 10812 + }, + { + "epoch": 2.9240129799891834, + "grad_norm": 0.8774232864379883, + "learning_rate": 8.984359376571132e-06, + "loss": 1.7516, + "mean_token_accuracy": 0.5757818818092346, + "num_tokens": 5527418122.0, + "step": 10813 + }, + { + "epoch": 2.924283396430503, + "grad_norm": 1.2938804626464844, + "learning_rate": 8.98282296377006e-06, + "loss": 1.9, + "mean_token_accuracy": 0.5883835554122925, + "num_tokens": 5527883359.0, + "step": 10814 + }, + { + "epoch": 2.9245538128718227, + "grad_norm": 1.2289681434631348, + "learning_rate": 8.981286612861912e-06, + "loss": 1.9832, + "mean_token_accuracy": 0.5543082356452942, + "num_tokens": 5528346720.0, + "step": 10815 + }, + { + "epoch": 2.9248242293131423, + "grad_norm": 1.0819443464279175, + "learning_rate": 8.979750323893828e-06, + "loss": 1.8967, + "mean_token_accuracy": 0.5645316243171692, + "num_tokens": 5528870986.0, + "step": 10816 + }, + { + "epoch": 2.925094645754462, + "grad_norm": 1.1136271953582764, + "learning_rate": 8.978214096912944e-06, + "loss": 1.7117, + "mean_token_accuracy": 0.6253893375396729, + "num_tokens": 5529389014.0, + "step": 10817 + }, + { + "epoch": 2.9253650621957816, + "grad_norm": 1.0326253175735474, + "learning_rate": 8.976677931966395e-06, + "loss": 1.7892, + "mean_token_accuracy": 0.5761592388153076, + "num_tokens": 5529876208.0, + "step": 10818 + }, + { + "epoch": 2.9256354786371013, + "grad_norm": 1.1650835275650024, + "learning_rate": 8.975141829101321e-06, + "loss": 1.8585, + "mean_token_accuracy": 0.5704100131988525, + "num_tokens": 5530374711.0, + "step": 10819 + }, + { + "epoch": 2.925905895078421, + "grad_norm": 1.2393243312835693, + "learning_rate": 8.973605788364847e-06, + "loss": 1.8938, + "mean_token_accuracy": 0.5487998127937317, + "num_tokens": 5530898925.0, + "step": 10820 + }, + { + "epoch": 2.9261763115197406, + "grad_norm": 0.47909078001976013, + "learning_rate": 8.972069809804106e-06, + "loss": 1.1285, + "mean_token_accuracy": 0.7205163240432739, + "num_tokens": 5531294922.0, + "step": 10821 + }, + { + "epoch": 2.9264467279610598, + "grad_norm": 1.1581627130508423, + "learning_rate": 8.970533893466231e-06, + "loss": 1.8379, + "mean_token_accuracy": 0.5815905332565308, + "num_tokens": 5531819168.0, + "step": 10822 + }, + { + "epoch": 2.92671714440238, + "grad_norm": 1.0983092784881592, + "learning_rate": 8.968998039398345e-06, + "loss": 1.7893, + "mean_token_accuracy": 0.590986967086792, + "num_tokens": 5532343356.0, + "step": 10823 + }, + { + "epoch": 2.926987560843699, + "grad_norm": 0.9857136607170105, + "learning_rate": 8.967462247647568e-06, + "loss": 1.9346, + "mean_token_accuracy": 0.5739061832427979, + "num_tokens": 5532867609.0, + "step": 10824 + }, + { + "epoch": 2.927257977285019, + "grad_norm": 1.2697815895080566, + "learning_rate": 8.965926518261028e-06, + "loss": 1.8366, + "mean_token_accuracy": 0.5759152173995972, + "num_tokens": 5533391815.0, + "step": 10825 + }, + { + "epoch": 2.9275283937263383, + "grad_norm": 1.059420108795166, + "learning_rate": 8.964390851285845e-06, + "loss": 1.8594, + "mean_token_accuracy": 0.5762360095977783, + "num_tokens": 5533916075.0, + "step": 10826 + }, + { + "epoch": 2.9277988101676584, + "grad_norm": 0.9516172409057617, + "learning_rate": 8.962855246769133e-06, + "loss": 1.8573, + "mean_token_accuracy": 0.5681768655776978, + "num_tokens": 5534440306.0, + "step": 10827 + }, + { + "epoch": 2.9280692266089776, + "grad_norm": 1.1075519323349, + "learning_rate": 8.961319704758018e-06, + "loss": 1.8891, + "mean_token_accuracy": 0.5634502172470093, + "num_tokens": 5534964510.0, + "step": 10828 + }, + { + "epoch": 2.9283396430502977, + "grad_norm": 1.173641562461853, + "learning_rate": 8.95978422529961e-06, + "loss": 1.9629, + "mean_token_accuracy": 0.5609005689620972, + "num_tokens": 5535488662.0, + "step": 10829 + }, + { + "epoch": 2.928610059491617, + "grad_norm": 1.0007182359695435, + "learning_rate": 8.958248808441016e-06, + "loss": 1.8804, + "mean_token_accuracy": 0.5675140619277954, + "num_tokens": 5536012837.0, + "step": 10830 + }, + { + "epoch": 2.928880475932937, + "grad_norm": 1.0872443914413452, + "learning_rate": 8.95671345422936e-06, + "loss": 1.8541, + "mean_token_accuracy": 0.5693901777267456, + "num_tokens": 5536537049.0, + "step": 10831 + }, + { + "epoch": 2.929150892374256, + "grad_norm": 1.1701030731201172, + "learning_rate": 8.95517816271174e-06, + "loss": 1.7618, + "mean_token_accuracy": 0.6009595394134521, + "num_tokens": 5537061304.0, + "step": 10832 + }, + { + "epoch": 2.929421308815576, + "grad_norm": 0.974002480506897, + "learning_rate": 8.95364293393527e-06, + "loss": 1.8994, + "mean_token_accuracy": 0.5458762645721436, + "num_tokens": 5537585535.0, + "step": 10833 + }, + { + "epoch": 2.9296917252568955, + "grad_norm": 1.2498326301574707, + "learning_rate": 8.952107767947049e-06, + "loss": 2.0171, + "mean_token_accuracy": 0.5591705441474915, + "num_tokens": 5538054890.0, + "step": 10834 + }, + { + "epoch": 2.929962141698215, + "grad_norm": 1.0916388034820557, + "learning_rate": 8.950572664794185e-06, + "loss": 1.9304, + "mean_token_accuracy": 0.5619093179702759, + "num_tokens": 5538546282.0, + "step": 10835 + }, + { + "epoch": 2.9302325581395348, + "grad_norm": 1.1730890274047852, + "learning_rate": 8.949037624523782e-06, + "loss": 2.0042, + "mean_token_accuracy": 0.5484665632247925, + "num_tokens": 5539066022.0, + "step": 10836 + }, + { + "epoch": 2.9305029745808544, + "grad_norm": 1.0819979906082153, + "learning_rate": 8.947502647182936e-06, + "loss": 1.7623, + "mean_token_accuracy": 0.6148203015327454, + "num_tokens": 5539590288.0, + "step": 10837 + }, + { + "epoch": 2.930773391022174, + "grad_norm": 1.092303991317749, + "learning_rate": 8.945967732818738e-06, + "loss": 1.9232, + "mean_token_accuracy": 0.5517979264259338, + "num_tokens": 5540109191.0, + "step": 10838 + }, + { + "epoch": 2.9310438074634937, + "grad_norm": 1.0142533779144287, + "learning_rate": 8.944432881478297e-06, + "loss": 1.8193, + "mean_token_accuracy": 0.5796142220497131, + "num_tokens": 5540633425.0, + "step": 10839 + }, + { + "epoch": 2.9313142239048133, + "grad_norm": 1.2182238101959229, + "learning_rate": 8.942898093208701e-06, + "loss": 1.787, + "mean_token_accuracy": 0.5910499691963196, + "num_tokens": 5541157539.0, + "step": 10840 + }, + { + "epoch": 2.931584640346133, + "grad_norm": 0.5470490455627441, + "learning_rate": 8.941363368057036e-06, + "loss": 1.1696, + "mean_token_accuracy": 0.6865236759185791, + "num_tokens": 5541620798.0, + "step": 10841 + }, + { + "epoch": 2.9318550567874526, + "grad_norm": 1.4177062511444092, + "learning_rate": 8.939828706070399e-06, + "loss": 1.9697, + "mean_token_accuracy": 0.5653561353683472, + "num_tokens": 5542062000.0, + "step": 10842 + }, + { + "epoch": 2.9321254732287723, + "grad_norm": 1.1769921779632568, + "learning_rate": 8.93829410729588e-06, + "loss": 1.6685, + "mean_token_accuracy": 0.6454343795776367, + "num_tokens": 5542520902.0, + "step": 10843 + }, + { + "epoch": 2.932395889670092, + "grad_norm": 1.0144673585891724, + "learning_rate": 8.936759571780553e-06, + "loss": 1.9134, + "mean_token_accuracy": 0.5592221021652222, + "num_tokens": 5543045100.0, + "step": 10844 + }, + { + "epoch": 2.9326663061114115, + "grad_norm": 1.148119568824768, + "learning_rate": 8.935225099571517e-06, + "loss": 1.8171, + "mean_token_accuracy": 0.5901128053665161, + "num_tokens": 5543569096.0, + "step": 10845 + }, + { + "epoch": 2.932936722552731, + "grad_norm": 0.9502311944961548, + "learning_rate": 8.933690690715846e-06, + "loss": 1.9759, + "mean_token_accuracy": 0.5710145235061646, + "num_tokens": 5544027764.0, + "step": 10846 + }, + { + "epoch": 2.933207138994051, + "grad_norm": 1.0552748441696167, + "learning_rate": 8.932156345260616e-06, + "loss": 1.8013, + "mean_token_accuracy": 0.5769111514091492, + "num_tokens": 5544552008.0, + "step": 10847 + }, + { + "epoch": 2.9334775554353705, + "grad_norm": 0.9601358771324158, + "learning_rate": 8.930622063252914e-06, + "loss": 1.6868, + "mean_token_accuracy": 0.6073975563049316, + "num_tokens": 5545076033.0, + "step": 10848 + }, + { + "epoch": 2.93374797187669, + "grad_norm": 1.1785337924957275, + "learning_rate": 8.929087844739814e-06, + "loss": 1.9215, + "mean_token_accuracy": 0.5688345432281494, + "num_tokens": 5545500553.0, + "step": 10849 + }, + { + "epoch": 2.9340183883180098, + "grad_norm": 1.1212646961212158, + "learning_rate": 8.927553689768388e-06, + "loss": 1.9033, + "mean_token_accuracy": 0.5623281598091125, + "num_tokens": 5546024691.0, + "step": 10850 + }, + { + "epoch": 2.9342888047593294, + "grad_norm": 0.9105389714241028, + "learning_rate": 8.926019598385708e-06, + "loss": 1.8708, + "mean_token_accuracy": 0.5716673135757446, + "num_tokens": 5546548912.0, + "step": 10851 + }, + { + "epoch": 2.934559221200649, + "grad_norm": 1.1492034196853638, + "learning_rate": 8.92448557063885e-06, + "loss": 1.9006, + "mean_token_accuracy": 0.5764391422271729, + "num_tokens": 5547055722.0, + "step": 10852 + }, + { + "epoch": 2.9348296376419687, + "grad_norm": 1.1487724781036377, + "learning_rate": 8.922951606574873e-06, + "loss": 1.9117, + "mean_token_accuracy": 0.5559910535812378, + "num_tokens": 5547566971.0, + "step": 10853 + }, + { + "epoch": 2.9351000540832883, + "grad_norm": 1.039425253868103, + "learning_rate": 8.92141770624085e-06, + "loss": 1.9648, + "mean_token_accuracy": 0.5666408538818359, + "num_tokens": 5548091190.0, + "step": 10854 + }, + { + "epoch": 2.935370470524608, + "grad_norm": 1.105686902999878, + "learning_rate": 8.919883869683846e-06, + "loss": 1.7776, + "mean_token_accuracy": 0.5878020524978638, + "num_tokens": 5548577900.0, + "step": 10855 + }, + { + "epoch": 2.9356408869659276, + "grad_norm": 1.0487653017044067, + "learning_rate": 8.918350096950923e-06, + "loss": 1.7375, + "mean_token_accuracy": 0.5939604043960571, + "num_tokens": 5549074339.0, + "step": 10856 + }, + { + "epoch": 2.9359113034072473, + "grad_norm": 0.9922023415565491, + "learning_rate": 8.91681638808914e-06, + "loss": 1.8747, + "mean_token_accuracy": 0.5759207010269165, + "num_tokens": 5549598583.0, + "step": 10857 + }, + { + "epoch": 2.936181719848567, + "grad_norm": 1.006022572517395, + "learning_rate": 8.915282743145554e-06, + "loss": 1.8432, + "mean_token_accuracy": 0.5775902271270752, + "num_tokens": 5550073790.0, + "step": 10858 + }, + { + "epoch": 2.9364521362898865, + "grad_norm": 1.1383802890777588, + "learning_rate": 8.913749162167229e-06, + "loss": 1.823, + "mean_token_accuracy": 0.5712969303131104, + "num_tokens": 5550580957.0, + "step": 10859 + }, + { + "epoch": 2.936722552731206, + "grad_norm": 0.9672331213951111, + "learning_rate": 8.912215645201212e-06, + "loss": 1.841, + "mean_token_accuracy": 0.5881706476211548, + "num_tokens": 5551105113.0, + "step": 10860 + }, + { + "epoch": 2.936992969172526, + "grad_norm": 0.48273441195487976, + "learning_rate": 8.910682192294558e-06, + "loss": 1.0912, + "mean_token_accuracy": 0.7116349339485168, + "num_tokens": 5551629216.0, + "step": 10861 + }, + { + "epoch": 2.9372633856138455, + "grad_norm": 1.217017412185669, + "learning_rate": 8.909148803494321e-06, + "loss": 1.8641, + "mean_token_accuracy": 0.5789781808853149, + "num_tokens": 5552153484.0, + "step": 10862 + }, + { + "epoch": 2.9375338020551647, + "grad_norm": 1.3012338876724243, + "learning_rate": 8.907615478847546e-06, + "loss": 1.9373, + "mean_token_accuracy": 0.5719287991523743, + "num_tokens": 5552617925.0, + "step": 10863 + }, + { + "epoch": 2.9378042184964848, + "grad_norm": 0.9651702046394348, + "learning_rate": 8.906082218401279e-06, + "loss": 1.8725, + "mean_token_accuracy": 0.5707864761352539, + "num_tokens": 5553142154.0, + "step": 10864 + }, + { + "epoch": 2.938074634937804, + "grad_norm": 0.9245644211769104, + "learning_rate": 8.904549022202573e-06, + "loss": 1.8368, + "mean_token_accuracy": 0.5855820178985596, + "num_tokens": 5553666430.0, + "step": 10865 + }, + { + "epoch": 2.938345051379124, + "grad_norm": 1.087449550628662, + "learning_rate": 8.903015890298464e-06, + "loss": 2.0088, + "mean_token_accuracy": 0.534163773059845, + "num_tokens": 5554190712.0, + "step": 10866 + }, + { + "epoch": 2.9386154678204432, + "grad_norm": 0.998693585395813, + "learning_rate": 8.90148282273599e-06, + "loss": 1.8584, + "mean_token_accuracy": 0.5539498329162598, + "num_tokens": 5554714866.0, + "step": 10867 + }, + { + "epoch": 2.9388858842617633, + "grad_norm": 1.018715500831604, + "learning_rate": 8.8999498195622e-06, + "loss": 1.8665, + "mean_token_accuracy": 0.5655016303062439, + "num_tokens": 5555239037.0, + "step": 10868 + }, + { + "epoch": 2.9391563007030825, + "grad_norm": 1.10538911819458, + "learning_rate": 8.89841688082412e-06, + "loss": 2.0211, + "mean_token_accuracy": 0.5285767316818237, + "num_tokens": 5555763197.0, + "step": 10869 + }, + { + "epoch": 2.9394267171444026, + "grad_norm": 1.1946892738342285, + "learning_rate": 8.89688400656879e-06, + "loss": 1.8255, + "mean_token_accuracy": 0.5868930816650391, + "num_tokens": 5556287225.0, + "step": 10870 + }, + { + "epoch": 2.939697133585722, + "grad_norm": 1.0868839025497437, + "learning_rate": 8.89535119684325e-06, + "loss": 1.8678, + "mean_token_accuracy": 0.574608564376831, + "num_tokens": 5556806296.0, + "step": 10871 + }, + { + "epoch": 2.939967550027042, + "grad_norm": 0.9410446286201477, + "learning_rate": 8.893818451694523e-06, + "loss": 1.796, + "mean_token_accuracy": 0.5887001752853394, + "num_tokens": 5557306418.0, + "step": 10872 + }, + { + "epoch": 2.940237966468361, + "grad_norm": 1.225711464881897, + "learning_rate": 8.892285771169632e-06, + "loss": 1.7441, + "mean_token_accuracy": 0.6105709075927734, + "num_tokens": 5557809063.0, + "step": 10873 + }, + { + "epoch": 2.9405083829096808, + "grad_norm": 1.1805291175842285, + "learning_rate": 8.890753155315621e-06, + "loss": 1.9477, + "mean_token_accuracy": 0.5428106784820557, + "num_tokens": 5558311573.0, + "step": 10874 + }, + { + "epoch": 2.9407787993510004, + "grad_norm": 1.2317500114440918, + "learning_rate": 8.889220604179503e-06, + "loss": 1.8021, + "mean_token_accuracy": 0.5477443337440491, + "num_tokens": 5558835763.0, + "step": 10875 + }, + { + "epoch": 2.94104921579232, + "grad_norm": 1.0292211771011353, + "learning_rate": 8.887688117808304e-06, + "loss": 1.7922, + "mean_token_accuracy": 0.5725134611129761, + "num_tokens": 5559360029.0, + "step": 10876 + }, + { + "epoch": 2.9413196322336397, + "grad_norm": 1.2346543073654175, + "learning_rate": 8.886155696249046e-06, + "loss": 1.8223, + "mean_token_accuracy": 0.5696778297424316, + "num_tokens": 5559884307.0, + "step": 10877 + }, + { + "epoch": 2.9415900486749593, + "grad_norm": 1.0603924989700317, + "learning_rate": 8.884623339548743e-06, + "loss": 1.8452, + "mean_token_accuracy": 0.5658020377159119, + "num_tokens": 5560408514.0, + "step": 10878 + }, + { + "epoch": 2.941860465116279, + "grad_norm": 1.0675804615020752, + "learning_rate": 8.883091047754422e-06, + "loss": 1.7232, + "mean_token_accuracy": 0.5905718803405762, + "num_tokens": 5560889348.0, + "step": 10879 + }, + { + "epoch": 2.9421308815575986, + "grad_norm": 1.0532066822052002, + "learning_rate": 8.881558820913097e-06, + "loss": 1.8545, + "mean_token_accuracy": 0.5677524209022522, + "num_tokens": 5561413609.0, + "step": 10880 + }, + { + "epoch": 2.9424012979989183, + "grad_norm": 0.4354172348976135, + "learning_rate": 8.880026659071768e-06, + "loss": 0.987, + "mean_token_accuracy": 0.730514645576477, + "num_tokens": 5561937685.0, + "step": 10881 + }, + { + "epoch": 2.942671714440238, + "grad_norm": 1.5452865362167358, + "learning_rate": 8.878494562277462e-06, + "loss": 1.9485, + "mean_token_accuracy": 0.5815356969833374, + "num_tokens": 5562414255.0, + "step": 10882 + }, + { + "epoch": 2.9429421308815575, + "grad_norm": 1.3220206499099731, + "learning_rate": 8.87696253057718e-06, + "loss": 1.9298, + "mean_token_accuracy": 0.5597838759422302, + "num_tokens": 5562925694.0, + "step": 10883 + }, + { + "epoch": 2.943212547322877, + "grad_norm": 1.00564444065094, + "learning_rate": 8.87543056401793e-06, + "loss": 1.9473, + "mean_token_accuracy": 0.5582049489021301, + "num_tokens": 5563449823.0, + "step": 10884 + }, + { + "epoch": 2.943482963764197, + "grad_norm": 1.1220797300338745, + "learning_rate": 8.873898662646718e-06, + "loss": 1.9112, + "mean_token_accuracy": 0.5509196519851685, + "num_tokens": 5563974000.0, + "step": 10885 + }, + { + "epoch": 2.9437533802055165, + "grad_norm": 1.3054507970809937, + "learning_rate": 8.87236682651055e-06, + "loss": 1.8908, + "mean_token_accuracy": 0.547521710395813, + "num_tokens": 5564498089.0, + "step": 10886 + }, + { + "epoch": 2.944023796646836, + "grad_norm": 1.1545047760009766, + "learning_rate": 8.870835055656421e-06, + "loss": 1.9923, + "mean_token_accuracy": 0.5604027509689331, + "num_tokens": 5564976976.0, + "step": 10887 + }, + { + "epoch": 2.9442942130881558, + "grad_norm": 1.2220221757888794, + "learning_rate": 8.869303350131338e-06, + "loss": 1.8309, + "mean_token_accuracy": 0.5695885419845581, + "num_tokens": 5565501246.0, + "step": 10888 + }, + { + "epoch": 2.9445646295294754, + "grad_norm": 0.9939145445823669, + "learning_rate": 8.867771709982294e-06, + "loss": 1.9138, + "mean_token_accuracy": 0.5618607997894287, + "num_tokens": 5566025357.0, + "step": 10889 + }, + { + "epoch": 2.944835045970795, + "grad_norm": 1.0751407146453857, + "learning_rate": 8.86624013525628e-06, + "loss": 1.8455, + "mean_token_accuracy": 0.5863215923309326, + "num_tokens": 5566529890.0, + "step": 10890 + }, + { + "epoch": 2.9451054624121147, + "grad_norm": 1.091041922569275, + "learning_rate": 8.864708626000296e-06, + "loss": 1.8369, + "mean_token_accuracy": 0.5838050842285156, + "num_tokens": 5567032403.0, + "step": 10891 + }, + { + "epoch": 2.9453758788534343, + "grad_norm": 0.9180760979652405, + "learning_rate": 8.863177182261333e-06, + "loss": 1.7909, + "mean_token_accuracy": 0.573664128780365, + "num_tokens": 5567556549.0, + "step": 10892 + }, + { + "epoch": 2.945646295294754, + "grad_norm": 0.9259474277496338, + "learning_rate": 8.861645804086374e-06, + "loss": 1.7324, + "mean_token_accuracy": 0.590672492980957, + "num_tokens": 5568080757.0, + "step": 10893 + }, + { + "epoch": 2.9459167117360736, + "grad_norm": 1.0814592838287354, + "learning_rate": 8.860114491522413e-06, + "loss": 1.8082, + "mean_token_accuracy": 0.5890729427337646, + "num_tokens": 5568580183.0, + "step": 10894 + }, + { + "epoch": 2.9461871281773933, + "grad_norm": 1.0574039220809937, + "learning_rate": 8.858583244616434e-06, + "loss": 1.8018, + "mean_token_accuracy": 0.5794640183448792, + "num_tokens": 5569047204.0, + "step": 10895 + }, + { + "epoch": 2.946457544618713, + "grad_norm": 1.1230967044830322, + "learning_rate": 8.857052063415414e-06, + "loss": 1.8691, + "mean_token_accuracy": 0.5615971088409424, + "num_tokens": 5569528469.0, + "step": 10896 + }, + { + "epoch": 2.9467279610600325, + "grad_norm": 1.0511794090270996, + "learning_rate": 8.85552094796634e-06, + "loss": 1.8689, + "mean_token_accuracy": 0.5937360525131226, + "num_tokens": 5569964411.0, + "step": 10897 + }, + { + "epoch": 2.946998377501352, + "grad_norm": 1.0648252964019775, + "learning_rate": 8.853989898316193e-06, + "loss": 1.7829, + "mean_token_accuracy": 0.5579376220703125, + "num_tokens": 5570488682.0, + "step": 10898 + }, + { + "epoch": 2.947268793942672, + "grad_norm": 0.9830726981163025, + "learning_rate": 8.852458914511943e-06, + "loss": 1.8217, + "mean_token_accuracy": 0.5548018217086792, + "num_tokens": 5571012852.0, + "step": 10899 + }, + { + "epoch": 2.9475392103839915, + "grad_norm": 1.0253514051437378, + "learning_rate": 8.850927996600574e-06, + "loss": 1.7441, + "mean_token_accuracy": 0.5920819044113159, + "num_tokens": 5571474748.0, + "step": 10900 + }, + { + "epoch": 2.947809626825311, + "grad_norm": 0.47152474522590637, + "learning_rate": 8.849397144629048e-06, + "loss": 1.0149, + "mean_token_accuracy": 0.7213369607925415, + "num_tokens": 5571999020.0, + "step": 10901 + }, + { + "epoch": 2.9480800432666308, + "grad_norm": 1.6936113834381104, + "learning_rate": 8.847866358644345e-06, + "loss": 1.824, + "mean_token_accuracy": 0.5767123699188232, + "num_tokens": 5572523207.0, + "step": 10902 + }, + { + "epoch": 2.9483504597079504, + "grad_norm": 1.8745267391204834, + "learning_rate": 8.846335638693433e-06, + "loss": 1.9801, + "mean_token_accuracy": 0.553812563419342, + "num_tokens": 5573022263.0, + "step": 10903 + }, + { + "epoch": 2.9486208761492696, + "grad_norm": 1.188415765762329, + "learning_rate": 8.844804984823274e-06, + "loss": 1.9057, + "mean_token_accuracy": 0.555545449256897, + "num_tokens": 5573499531.0, + "step": 10904 + }, + { + "epoch": 2.9488912925905897, + "grad_norm": 1.1026942729949951, + "learning_rate": 8.843274397080837e-06, + "loss": 1.8517, + "mean_token_accuracy": 0.581466794013977, + "num_tokens": 5574023667.0, + "step": 10905 + }, + { + "epoch": 2.949161709031909, + "grad_norm": 1.240026831626892, + "learning_rate": 8.841743875513086e-06, + "loss": 1.7699, + "mean_token_accuracy": 0.6020113229751587, + "num_tokens": 5574471916.0, + "step": 10906 + }, + { + "epoch": 2.949432125473229, + "grad_norm": 1.2345247268676758, + "learning_rate": 8.840213420166976e-06, + "loss": 1.8405, + "mean_token_accuracy": 0.570080578327179, + "num_tokens": 5574996126.0, + "step": 10907 + }, + { + "epoch": 2.949702541914548, + "grad_norm": 1.2637856006622314, + "learning_rate": 8.838683031089474e-06, + "loss": 1.8938, + "mean_token_accuracy": 0.5507725477218628, + "num_tokens": 5575520394.0, + "step": 10908 + }, + { + "epoch": 2.9499729583558683, + "grad_norm": 1.5290805101394653, + "learning_rate": 8.837152708327532e-06, + "loss": 1.792, + "mean_token_accuracy": 0.5782004594802856, + "num_tokens": 5576000808.0, + "step": 10909 + }, + { + "epoch": 2.9502433747971875, + "grad_norm": 1.283879041671753, + "learning_rate": 8.8356224519281e-06, + "loss": 1.7558, + "mean_token_accuracy": 0.5859648585319519, + "num_tokens": 5576505550.0, + "step": 10910 + }, + { + "epoch": 2.9505137912385075, + "grad_norm": 1.190632700920105, + "learning_rate": 8.834092261938143e-06, + "loss": 1.6657, + "mean_token_accuracy": 0.6248162984848022, + "num_tokens": 5577029833.0, + "step": 10911 + }, + { + "epoch": 2.9507842076798267, + "grad_norm": 1.164241075515747, + "learning_rate": 8.832562138404603e-06, + "loss": 1.9375, + "mean_token_accuracy": 0.5563789010047913, + "num_tokens": 5577554115.0, + "step": 10912 + }, + { + "epoch": 2.951054624121147, + "grad_norm": 1.1617684364318848, + "learning_rate": 8.831032081374426e-06, + "loss": 1.8858, + "mean_token_accuracy": 0.5820260643959045, + "num_tokens": 5578013769.0, + "step": 10913 + }, + { + "epoch": 2.951325040562466, + "grad_norm": 1.285852074623108, + "learning_rate": 8.82950209089457e-06, + "loss": 1.8229, + "mean_token_accuracy": 0.5851690769195557, + "num_tokens": 5578537930.0, + "step": 10914 + }, + { + "epoch": 2.9515954570037857, + "grad_norm": 0.9760220050811768, + "learning_rate": 8.82797216701197e-06, + "loss": 1.8296, + "mean_token_accuracy": 0.5806071162223816, + "num_tokens": 5579062163.0, + "step": 10915 + }, + { + "epoch": 2.9518658734451053, + "grad_norm": 0.9320611357688904, + "learning_rate": 8.826442309773571e-06, + "loss": 1.9288, + "mean_token_accuracy": 0.5772348046302795, + "num_tokens": 5579552792.0, + "step": 10916 + }, + { + "epoch": 2.952136289886425, + "grad_norm": 0.999403178691864, + "learning_rate": 8.824912519226316e-06, + "loss": 1.8743, + "mean_token_accuracy": 0.5617230534553528, + "num_tokens": 5580076973.0, + "step": 10917 + }, + { + "epoch": 2.9524067063277446, + "grad_norm": 1.0781930685043335, + "learning_rate": 8.82338279541714e-06, + "loss": 1.8714, + "mean_token_accuracy": 0.572770357131958, + "num_tokens": 5580601231.0, + "step": 10918 + }, + { + "epoch": 2.9526771227690642, + "grad_norm": 0.9151448011398315, + "learning_rate": 8.82185313839298e-06, + "loss": 1.9562, + "mean_token_accuracy": 0.5562660694122314, + "num_tokens": 5581125428.0, + "step": 10919 + }, + { + "epoch": 2.952947539210384, + "grad_norm": 1.0624316930770874, + "learning_rate": 8.820323548200771e-06, + "loss": 1.8843, + "mean_token_accuracy": 0.5671287775039673, + "num_tokens": 5581606815.0, + "step": 10920 + }, + { + "epoch": 2.9532179556517035, + "grad_norm": 0.476132333278656, + "learning_rate": 8.818794024887449e-06, + "loss": 1.0942, + "mean_token_accuracy": 0.7148956060409546, + "num_tokens": 5582078660.0, + "step": 10921 + }, + { + "epoch": 2.953488372093023, + "grad_norm": 1.342557430267334, + "learning_rate": 8.817264568499935e-06, + "loss": 1.8884, + "mean_token_accuracy": 0.5799175500869751, + "num_tokens": 5582602858.0, + "step": 10922 + }, + { + "epoch": 2.953758788534343, + "grad_norm": 1.0942156314849854, + "learning_rate": 8.815735179085169e-06, + "loss": 1.8449, + "mean_token_accuracy": 0.5696477293968201, + "num_tokens": 5583115781.0, + "step": 10923 + }, + { + "epoch": 2.9540292049756625, + "grad_norm": 1.0320887565612793, + "learning_rate": 8.814205856690066e-06, + "loss": 1.8296, + "mean_token_accuracy": 0.5701534748077393, + "num_tokens": 5583639935.0, + "step": 10924 + }, + { + "epoch": 2.954299621416982, + "grad_norm": 1.217474102973938, + "learning_rate": 8.812676601361558e-06, + "loss": 1.9222, + "mean_token_accuracy": 0.5588913559913635, + "num_tokens": 5584164103.0, + "step": 10925 + }, + { + "epoch": 2.9545700378583017, + "grad_norm": 1.0780491828918457, + "learning_rate": 8.811147413146564e-06, + "loss": 1.8671, + "mean_token_accuracy": 0.5791845321655273, + "num_tokens": 5584623800.0, + "step": 10926 + }, + { + "epoch": 2.9548404542996214, + "grad_norm": 0.9770480394363403, + "learning_rate": 8.809618292092006e-06, + "loss": 1.9741, + "mean_token_accuracy": 0.5503228306770325, + "num_tokens": 5585148037.0, + "step": 10927 + }, + { + "epoch": 2.955110870740941, + "grad_norm": 1.0323768854141235, + "learning_rate": 8.808089238244797e-06, + "loss": 1.8163, + "mean_token_accuracy": 0.5610767602920532, + "num_tokens": 5585672181.0, + "step": 10928 + }, + { + "epoch": 2.9553812871822607, + "grad_norm": 1.2688443660736084, + "learning_rate": 8.806560251651861e-06, + "loss": 1.9441, + "mean_token_accuracy": 0.5756868124008179, + "num_tokens": 5586148199.0, + "step": 10929 + }, + { + "epoch": 2.9556517036235803, + "grad_norm": 1.1086719036102295, + "learning_rate": 8.805031332360102e-06, + "loss": 1.9261, + "mean_token_accuracy": 0.5499162673950195, + "num_tokens": 5586672195.0, + "step": 10930 + }, + { + "epoch": 2.9559221200649, + "grad_norm": 0.9521324038505554, + "learning_rate": 8.803502480416439e-06, + "loss": 1.8441, + "mean_token_accuracy": 0.5810410976409912, + "num_tokens": 5587196436.0, + "step": 10931 + }, + { + "epoch": 2.9561925365062196, + "grad_norm": 1.4053471088409424, + "learning_rate": 8.80197369586778e-06, + "loss": 1.9011, + "mean_token_accuracy": 0.5775244235992432, + "num_tokens": 5587720718.0, + "step": 10932 + }, + { + "epoch": 2.9564629529475392, + "grad_norm": 1.1880697011947632, + "learning_rate": 8.80044497876103e-06, + "loss": 1.8348, + "mean_token_accuracy": 0.5852847099304199, + "num_tokens": 5588244950.0, + "step": 10933 + }, + { + "epoch": 2.956733369388859, + "grad_norm": 1.1581207513809204, + "learning_rate": 8.798916329143097e-06, + "loss": 1.9306, + "mean_token_accuracy": 0.5595182776451111, + "num_tokens": 5588769105.0, + "step": 10934 + }, + { + "epoch": 2.9570037858301785, + "grad_norm": 1.3237711191177368, + "learning_rate": 8.797387747060885e-06, + "loss": 1.9436, + "mean_token_accuracy": 0.5748145580291748, + "num_tokens": 5589293371.0, + "step": 10935 + }, + { + "epoch": 2.957274202271498, + "grad_norm": 1.354103922843933, + "learning_rate": 8.795859232561293e-06, + "loss": 1.8952, + "mean_token_accuracy": 0.5643110871315002, + "num_tokens": 5589817654.0, + "step": 10936 + }, + { + "epoch": 2.957544618712818, + "grad_norm": 1.0626907348632812, + "learning_rate": 8.794330785691223e-06, + "loss": 1.8603, + "mean_token_accuracy": 0.5776795148849487, + "num_tokens": 5590341847.0, + "step": 10937 + }, + { + "epoch": 2.9578150351541375, + "grad_norm": 0.9666998982429504, + "learning_rate": 8.792802406497573e-06, + "loss": 1.9288, + "mean_token_accuracy": 0.5520681142807007, + "num_tokens": 5590866040.0, + "step": 10938 + }, + { + "epoch": 2.958085451595457, + "grad_norm": 1.1652793884277344, + "learning_rate": 8.791274095027231e-06, + "loss": 1.8757, + "mean_token_accuracy": 0.5717514753341675, + "num_tokens": 5591390236.0, + "step": 10939 + }, + { + "epoch": 2.9583558680367767, + "grad_norm": 1.1939506530761719, + "learning_rate": 8.789745851327099e-06, + "loss": 1.8735, + "mean_token_accuracy": 0.5807565450668335, + "num_tokens": 5591914415.0, + "step": 10940 + }, + { + "epoch": 2.9586262844780964, + "grad_norm": 0.43301403522491455, + "learning_rate": 8.788217675444064e-06, + "loss": 1.1098, + "mean_token_accuracy": 0.7041432857513428, + "num_tokens": 5592438693.0, + "step": 10941 + }, + { + "epoch": 2.958896700919416, + "grad_norm": 1.0027467012405396, + "learning_rate": 8.786689567425012e-06, + "loss": 1.4751, + "mean_token_accuracy": 0.6450935006141663, + "num_tokens": 5592962885.0, + "step": 10942 + }, + { + "epoch": 2.9591671173607357, + "grad_norm": 1.0068682432174683, + "learning_rate": 8.785161527316836e-06, + "loss": 1.6769, + "mean_token_accuracy": 0.6196441650390625, + "num_tokens": 5593424356.0, + "step": 10943 + }, + { + "epoch": 2.9594375338020553, + "grad_norm": 0.9469150900840759, + "learning_rate": 8.783633555166419e-06, + "loss": 1.8727, + "mean_token_accuracy": 0.577512800693512, + "num_tokens": 5593948571.0, + "step": 10944 + }, + { + "epoch": 2.959707950243375, + "grad_norm": 1.1396763324737549, + "learning_rate": 8.782105651020638e-06, + "loss": 1.8401, + "mean_token_accuracy": 0.5855172872543335, + "num_tokens": 5594472751.0, + "step": 10945 + }, + { + "epoch": 2.9599783666846946, + "grad_norm": 1.0401036739349365, + "learning_rate": 8.780577814926383e-06, + "loss": 1.6401, + "mean_token_accuracy": 0.6137382388114929, + "num_tokens": 5594996976.0, + "step": 10946 + }, + { + "epoch": 2.960248783126014, + "grad_norm": 1.1214245557785034, + "learning_rate": 8.779050046930525e-06, + "loss": 1.7998, + "mean_token_accuracy": 0.5845901966094971, + "num_tokens": 5595512488.0, + "step": 10947 + }, + { + "epoch": 2.960519199567334, + "grad_norm": 1.0978803634643555, + "learning_rate": 8.777522347079941e-06, + "loss": 1.9093, + "mean_token_accuracy": 0.561298131942749, + "num_tokens": 5596036688.0, + "step": 10948 + }, + { + "epoch": 2.960789616008653, + "grad_norm": 0.9822790026664734, + "learning_rate": 8.775994715421509e-06, + "loss": 1.8147, + "mean_token_accuracy": 0.5807822346687317, + "num_tokens": 5596560816.0, + "step": 10949 + }, + { + "epoch": 2.961060032449973, + "grad_norm": 1.1413178443908691, + "learning_rate": 8.774467152002099e-06, + "loss": 1.8041, + "mean_token_accuracy": 0.5897438526153564, + "num_tokens": 5597074413.0, + "step": 10950 + }, + { + "epoch": 2.9613304488912924, + "grad_norm": 0.9365586638450623, + "learning_rate": 8.772939656868584e-06, + "loss": 1.9935, + "mean_token_accuracy": 0.5444406270980835, + "num_tokens": 5597598569.0, + "step": 10951 + }, + { + "epoch": 2.9616008653326125, + "grad_norm": 0.935341477394104, + "learning_rate": 8.771412230067831e-06, + "loss": 1.8997, + "mean_token_accuracy": 0.5629721879959106, + "num_tokens": 5598122767.0, + "step": 10952 + }, + { + "epoch": 2.9618712817739317, + "grad_norm": 0.9006691575050354, + "learning_rate": 8.769884871646696e-06, + "loss": 1.8584, + "mean_token_accuracy": 0.5541774034500122, + "num_tokens": 5598646938.0, + "step": 10953 + }, + { + "epoch": 2.9621416982152518, + "grad_norm": 0.8774871826171875, + "learning_rate": 8.768357581652061e-06, + "loss": 1.7197, + "mean_token_accuracy": 0.5752779841423035, + "num_tokens": 5599171088.0, + "step": 10954 + }, + { + "epoch": 2.962412114656571, + "grad_norm": 1.1394448280334473, + "learning_rate": 8.766830360130773e-06, + "loss": 2.0053, + "mean_token_accuracy": 0.552017331123352, + "num_tokens": 5599695257.0, + "step": 10955 + }, + { + "epoch": 2.9626825310978906, + "grad_norm": 0.9525613188743591, + "learning_rate": 8.765303207129697e-06, + "loss": 1.7312, + "mean_token_accuracy": 0.570791482925415, + "num_tokens": 5600219405.0, + "step": 10956 + }, + { + "epoch": 2.9629529475392102, + "grad_norm": 1.0759795904159546, + "learning_rate": 8.763776122695694e-06, + "loss": 1.9325, + "mean_token_accuracy": 0.56355881690979, + "num_tokens": 5600717215.0, + "step": 10957 + }, + { + "epoch": 2.96322336398053, + "grad_norm": 0.9714305400848389, + "learning_rate": 8.762249106875613e-06, + "loss": 1.9278, + "mean_token_accuracy": 0.5529128909111023, + "num_tokens": 5601241435.0, + "step": 10958 + }, + { + "epoch": 2.9634937804218495, + "grad_norm": 0.8846307396888733, + "learning_rate": 8.760722159716308e-06, + "loss": 1.9435, + "mean_token_accuracy": 0.5524961948394775, + "num_tokens": 5601765711.0, + "step": 10959 + }, + { + "epoch": 2.963764196863169, + "grad_norm": 1.1664276123046875, + "learning_rate": 8.75919528126464e-06, + "loss": 1.9532, + "mean_token_accuracy": 0.5650404691696167, + "num_tokens": 5602257254.0, + "step": 10960 + }, + { + "epoch": 2.964034613304489, + "grad_norm": 0.49867841601371765, + "learning_rate": 8.757668471567445e-06, + "loss": 1.1181, + "mean_token_accuracy": 0.7014868259429932, + "num_tokens": 5602781447.0, + "step": 10961 + }, + { + "epoch": 2.9643050297458085, + "grad_norm": 1.20149564743042, + "learning_rate": 8.756141730671574e-06, + "loss": 1.6403, + "mean_token_accuracy": 0.6267447471618652, + "num_tokens": 5603305670.0, + "step": 10962 + }, + { + "epoch": 2.964575446187128, + "grad_norm": 1.3298856019973755, + "learning_rate": 8.754615058623877e-06, + "loss": 1.899, + "mean_token_accuracy": 0.5720693469047546, + "num_tokens": 5603829889.0, + "step": 10963 + }, + { + "epoch": 2.9648458626284477, + "grad_norm": 1.0674492120742798, + "learning_rate": 8.753088455471193e-06, + "loss": 1.9434, + "mean_token_accuracy": 0.5587503910064697, + "num_tokens": 5604354029.0, + "step": 10964 + }, + { + "epoch": 2.9651162790697674, + "grad_norm": 1.1841355562210083, + "learning_rate": 8.751561921260361e-06, + "loss": 1.8229, + "mean_token_accuracy": 0.5627985000610352, + "num_tokens": 5604878240.0, + "step": 10965 + }, + { + "epoch": 2.965386695511087, + "grad_norm": 1.1827486753463745, + "learning_rate": 8.750035456038222e-06, + "loss": 1.85, + "mean_token_accuracy": 0.5401020646095276, + "num_tokens": 5605380444.0, + "step": 10966 + }, + { + "epoch": 2.9656571119524067, + "grad_norm": 1.1384503841400146, + "learning_rate": 8.748509059851613e-06, + "loss": 1.8462, + "mean_token_accuracy": 0.5707837343215942, + "num_tokens": 5605904607.0, + "step": 10967 + }, + { + "epoch": 2.9659275283937263, + "grad_norm": 1.1769822835922241, + "learning_rate": 8.746982732747366e-06, + "loss": 1.7297, + "mean_token_accuracy": 0.5820565223693848, + "num_tokens": 5606428778.0, + "step": 10968 + }, + { + "epoch": 2.966197944835046, + "grad_norm": 1.2622110843658447, + "learning_rate": 8.745456474772313e-06, + "loss": 1.9606, + "mean_token_accuracy": 0.5587843656539917, + "num_tokens": 5606953040.0, + "step": 10969 + }, + { + "epoch": 2.9664683612763656, + "grad_norm": 0.9429620504379272, + "learning_rate": 8.743930285973285e-06, + "loss": 1.8645, + "mean_token_accuracy": 0.5553420782089233, + "num_tokens": 5607477153.0, + "step": 10970 + }, + { + "epoch": 2.9667387777176852, + "grad_norm": 1.1954761743545532, + "learning_rate": 8.742404166397116e-06, + "loss": 1.9476, + "mean_token_accuracy": 0.5570240616798401, + "num_tokens": 5608001359.0, + "step": 10971 + }, + { + "epoch": 2.967009194159005, + "grad_norm": 1.1250686645507812, + "learning_rate": 8.740878116090625e-06, + "loss": 1.7678, + "mean_token_accuracy": 0.5736130475997925, + "num_tokens": 5608503084.0, + "step": 10972 + }, + { + "epoch": 2.9672796106003245, + "grad_norm": 1.2892674207687378, + "learning_rate": 8.739352135100634e-06, + "loss": 1.8573, + "mean_token_accuracy": 0.5766302943229675, + "num_tokens": 5609027255.0, + "step": 10973 + }, + { + "epoch": 2.967550027041644, + "grad_norm": 0.9844010472297668, + "learning_rate": 8.73782622347397e-06, + "loss": 1.8228, + "mean_token_accuracy": 0.5680609941482544, + "num_tokens": 5609551273.0, + "step": 10974 + }, + { + "epoch": 2.967820443482964, + "grad_norm": 0.9995099902153015, + "learning_rate": 8.736300381257449e-06, + "loss": 1.7424, + "mean_token_accuracy": 0.5955870747566223, + "num_tokens": 5610075367.0, + "step": 10975 + }, + { + "epoch": 2.9680908599242835, + "grad_norm": 1.1192725896835327, + "learning_rate": 8.734774608497889e-06, + "loss": 1.785, + "mean_token_accuracy": 0.595299243927002, + "num_tokens": 5610599578.0, + "step": 10976 + }, + { + "epoch": 2.968361276365603, + "grad_norm": 0.9993334412574768, + "learning_rate": 8.733248905242106e-06, + "loss": 1.9619, + "mean_token_accuracy": 0.5573148131370544, + "num_tokens": 5611123572.0, + "step": 10977 + }, + { + "epoch": 2.9686316928069227, + "grad_norm": 0.9599640369415283, + "learning_rate": 8.731723271536914e-06, + "loss": 1.7659, + "mean_token_accuracy": 0.5802084803581238, + "num_tokens": 5611636311.0, + "step": 10978 + }, + { + "epoch": 2.9689021092482424, + "grad_norm": 1.101851463317871, + "learning_rate": 8.73019770742912e-06, + "loss": 1.7003, + "mean_token_accuracy": 0.5896059274673462, + "num_tokens": 5612142790.0, + "step": 10979 + }, + { + "epoch": 2.969172525689562, + "grad_norm": 1.267972469329834, + "learning_rate": 8.728672212965538e-06, + "loss": 1.9137, + "mean_token_accuracy": 0.5767267346382141, + "num_tokens": 5612614769.0, + "step": 10980 + }, + { + "epoch": 2.9694429421308817, + "grad_norm": 0.4675820767879486, + "learning_rate": 8.727146788192972e-06, + "loss": 1.1177, + "mean_token_accuracy": 0.6886321902275085, + "num_tokens": 5613139015.0, + "step": 10981 + }, + { + "epoch": 2.9697133585722013, + "grad_norm": 1.2648553848266602, + "learning_rate": 8.725621433158222e-06, + "loss": 1.8696, + "mean_token_accuracy": 0.539644181728363, + "num_tokens": 5613663288.0, + "step": 10982 + }, + { + "epoch": 2.969983775013521, + "grad_norm": 1.2982028722763062, + "learning_rate": 8.7240961479081e-06, + "loss": 1.891, + "mean_token_accuracy": 0.5838052034378052, + "num_tokens": 5614187484.0, + "step": 10983 + }, + { + "epoch": 2.9702541914548406, + "grad_norm": 1.1948521137237549, + "learning_rate": 8.7225709324894e-06, + "loss": 1.9432, + "mean_token_accuracy": 0.5656439661979675, + "num_tokens": 5614693867.0, + "step": 10984 + }, + { + "epoch": 2.9705246078961602, + "grad_norm": 1.0307561159133911, + "learning_rate": 8.721045786948919e-06, + "loss": 1.9255, + "mean_token_accuracy": 0.5474206805229187, + "num_tokens": 5615218138.0, + "step": 10985 + }, + { + "epoch": 2.97079502433748, + "grad_norm": 1.0408878326416016, + "learning_rate": 8.71952071133346e-06, + "loss": 1.8256, + "mean_token_accuracy": 0.5828391313552856, + "num_tokens": 5615742259.0, + "step": 10986 + }, + { + "epoch": 2.9710654407787995, + "grad_norm": 1.163515329360962, + "learning_rate": 8.71799570568981e-06, + "loss": 1.8775, + "mean_token_accuracy": 0.5664275884628296, + "num_tokens": 5616266505.0, + "step": 10987 + }, + { + "epoch": 2.9713358572201187, + "grad_norm": 1.111398696899414, + "learning_rate": 8.71647077006476e-06, + "loss": 1.7639, + "mean_token_accuracy": 0.5919318199157715, + "num_tokens": 5616790617.0, + "step": 10988 + }, + { + "epoch": 2.971606273661439, + "grad_norm": 1.0121164321899414, + "learning_rate": 8.714945904505105e-06, + "loss": 1.9468, + "mean_token_accuracy": 0.5557835698127747, + "num_tokens": 5617314878.0, + "step": 10989 + }, + { + "epoch": 2.971876690102758, + "grad_norm": 1.109472393989563, + "learning_rate": 8.71342110905763e-06, + "loss": 1.8283, + "mean_token_accuracy": 0.5846962332725525, + "num_tokens": 5617795728.0, + "step": 10990 + }, + { + "epoch": 2.972147106544078, + "grad_norm": 1.1122103929519653, + "learning_rate": 8.711896383769113e-06, + "loss": 1.7676, + "mean_token_accuracy": 0.5654622912406921, + "num_tokens": 5618319836.0, + "step": 10991 + }, + { + "epoch": 2.9724175229853973, + "grad_norm": 1.205517292022705, + "learning_rate": 8.710371728686354e-06, + "loss": 1.7375, + "mean_token_accuracy": 0.6138709783554077, + "num_tokens": 5618750075.0, + "step": 10992 + }, + { + "epoch": 2.9726879394267174, + "grad_norm": 0.9530333280563354, + "learning_rate": 8.708847143856114e-06, + "loss": 1.8804, + "mean_token_accuracy": 0.562816321849823, + "num_tokens": 5619261501.0, + "step": 10993 + }, + { + "epoch": 2.9729583558680366, + "grad_norm": 1.0509978532791138, + "learning_rate": 8.707322629325187e-06, + "loss": 1.9736, + "mean_token_accuracy": 0.5645179748535156, + "num_tokens": 5619758047.0, + "step": 10994 + }, + { + "epoch": 2.9732287723093567, + "grad_norm": 1.2872095108032227, + "learning_rate": 8.705798185140347e-06, + "loss": 1.84, + "mean_token_accuracy": 0.5782512426376343, + "num_tokens": 5620282278.0, + "step": 10995 + }, + { + "epoch": 2.973499188750676, + "grad_norm": 1.2332381010055542, + "learning_rate": 8.704273811348359e-06, + "loss": 1.7741, + "mean_token_accuracy": 0.5930556058883667, + "num_tokens": 5620806498.0, + "step": 10996 + }, + { + "epoch": 2.9737696051919955, + "grad_norm": 0.948101282119751, + "learning_rate": 8.702749507996007e-06, + "loss": 1.7488, + "mean_token_accuracy": 0.6025627255439758, + "num_tokens": 5621321449.0, + "step": 10997 + }, + { + "epoch": 2.974040021633315, + "grad_norm": 1.101283073425293, + "learning_rate": 8.701225275130053e-06, + "loss": 1.8601, + "mean_token_accuracy": 0.5794889330863953, + "num_tokens": 5621845682.0, + "step": 10998 + }, + { + "epoch": 2.974310438074635, + "grad_norm": 1.070540428161621, + "learning_rate": 8.699701112797265e-06, + "loss": 1.9202, + "mean_token_accuracy": 0.5738924741744995, + "num_tokens": 5622361931.0, + "step": 10999 + }, + { + "epoch": 2.9745808545159544, + "grad_norm": 1.2116267681121826, + "learning_rate": 8.698177021044416e-06, + "loss": 1.8614, + "mean_token_accuracy": 0.5764484405517578, + "num_tokens": 5622855985.0, + "step": 11000 + }, + { + "epoch": 2.974851270957274, + "grad_norm": 0.4722798466682434, + "learning_rate": 8.696652999918267e-06, + "loss": 1.2214, + "mean_token_accuracy": 0.6782424449920654, + "num_tokens": 5623380146.0, + "step": 11001 + }, + { + "epoch": 2.9751216873985937, + "grad_norm": 1.301250696182251, + "learning_rate": 8.695129049465572e-06, + "loss": 1.8749, + "mean_token_accuracy": 0.572547435760498, + "num_tokens": 5623904332.0, + "step": 11002 + }, + { + "epoch": 2.9753921038399134, + "grad_norm": 1.0878063440322876, + "learning_rate": 8.693605169733102e-06, + "loss": 1.8526, + "mean_token_accuracy": 0.5685495138168335, + "num_tokens": 5624428609.0, + "step": 11003 + }, + { + "epoch": 2.975662520281233, + "grad_norm": 1.1312330961227417, + "learning_rate": 8.692081360767603e-06, + "loss": 1.9079, + "mean_token_accuracy": 0.5764054656028748, + "num_tokens": 5624942051.0, + "step": 11004 + }, + { + "epoch": 2.9759329367225527, + "grad_norm": 1.0930469036102295, + "learning_rate": 8.690557622615834e-06, + "loss": 1.8839, + "mean_token_accuracy": 0.5621545314788818, + "num_tokens": 5625466231.0, + "step": 11005 + }, + { + "epoch": 2.9762033531638723, + "grad_norm": 1.1113958358764648, + "learning_rate": 8.689033955324552e-06, + "loss": 1.9313, + "mean_token_accuracy": 0.540864884853363, + "num_tokens": 5625990515.0, + "step": 11006 + }, + { + "epoch": 2.976473769605192, + "grad_norm": 0.9163936376571655, + "learning_rate": 8.687510358940508e-06, + "loss": 1.7352, + "mean_token_accuracy": 0.5743011236190796, + "num_tokens": 5626514635.0, + "step": 11007 + }, + { + "epoch": 2.9767441860465116, + "grad_norm": 1.034516453742981, + "learning_rate": 8.685986833510438e-06, + "loss": 1.8021, + "mean_token_accuracy": 0.5868458151817322, + "num_tokens": 5627038889.0, + "step": 11008 + }, + { + "epoch": 2.9770146024878312, + "grad_norm": 1.25911545753479, + "learning_rate": 8.684463379081105e-06, + "loss": 1.8575, + "mean_token_accuracy": 0.5691806674003601, + "num_tokens": 5627563067.0, + "step": 11009 + }, + { + "epoch": 2.977285018929151, + "grad_norm": 1.2095118761062622, + "learning_rate": 8.682939995699242e-06, + "loss": 1.8742, + "mean_token_accuracy": 0.5669118762016296, + "num_tokens": 5628087220.0, + "step": 11010 + }, + { + "epoch": 2.9775554353704705, + "grad_norm": 1.2297734022140503, + "learning_rate": 8.681416683411593e-06, + "loss": 2.0114, + "mean_token_accuracy": 0.5482515096664429, + "num_tokens": 5628588264.0, + "step": 11011 + }, + { + "epoch": 2.97782585181179, + "grad_norm": 1.1464426517486572, + "learning_rate": 8.6798934422649e-06, + "loss": 2.0121, + "mean_token_accuracy": 0.5351855754852295, + "num_tokens": 5629112536.0, + "step": 11012 + }, + { + "epoch": 2.97809626825311, + "grad_norm": 0.8879081010818481, + "learning_rate": 8.678370272305898e-06, + "loss": 1.8264, + "mean_token_accuracy": 0.5768043994903564, + "num_tokens": 5629636805.0, + "step": 11013 + }, + { + "epoch": 2.9783666846944294, + "grad_norm": 1.2713689804077148, + "learning_rate": 8.676847173581322e-06, + "loss": 1.8428, + "mean_token_accuracy": 0.5883896350860596, + "num_tokens": 5630129413.0, + "step": 11014 + }, + { + "epoch": 2.978637101135749, + "grad_norm": 1.2933589220046997, + "learning_rate": 8.67532414613791e-06, + "loss": 1.8212, + "mean_token_accuracy": 0.5802525281906128, + "num_tokens": 5630653545.0, + "step": 11015 + }, + { + "epoch": 2.9789075175770687, + "grad_norm": 1.0283236503601074, + "learning_rate": 8.673801190022386e-06, + "loss": 1.9091, + "mean_token_accuracy": 0.5593410134315491, + "num_tokens": 5631177772.0, + "step": 11016 + }, + { + "epoch": 2.9791779340183884, + "grad_norm": 1.0996935367584229, + "learning_rate": 8.672278305281487e-06, + "loss": 1.9582, + "mean_token_accuracy": 0.5606589317321777, + "num_tokens": 5631692267.0, + "step": 11017 + }, + { + "epoch": 2.979448350459708, + "grad_norm": 1.1360671520233154, + "learning_rate": 8.670755491961933e-06, + "loss": 1.759, + "mean_token_accuracy": 0.604164183139801, + "num_tokens": 5632216479.0, + "step": 11018 + }, + { + "epoch": 2.9797187669010277, + "grad_norm": 1.101884126663208, + "learning_rate": 8.669232750110448e-06, + "loss": 1.9016, + "mean_token_accuracy": 0.5590063333511353, + "num_tokens": 5632740622.0, + "step": 11019 + }, + { + "epoch": 2.9799891833423473, + "grad_norm": 1.176310420036316, + "learning_rate": 8.667710079773759e-06, + "loss": 1.8268, + "mean_token_accuracy": 0.5832176208496094, + "num_tokens": 5633264809.0, + "step": 11020 + }, + { + "epoch": 2.980259599783667, + "grad_norm": 0.3873021900653839, + "learning_rate": 8.666187480998587e-06, + "loss": 1.1166, + "mean_token_accuracy": 0.6947773694992065, + "num_tokens": 5633788993.0, + "step": 11021 + }, + { + "epoch": 2.9805300162249866, + "grad_norm": 1.3880465030670166, + "learning_rate": 8.664664953831639e-06, + "loss": 1.716, + "mean_token_accuracy": 0.5924795866012573, + "num_tokens": 5634313160.0, + "step": 11022 + }, + { + "epoch": 2.9808004326663062, + "grad_norm": 1.2255325317382812, + "learning_rate": 8.663142498319644e-06, + "loss": 1.8046, + "mean_token_accuracy": 0.566338300704956, + "num_tokens": 5634811882.0, + "step": 11023 + }, + { + "epoch": 2.981070849107626, + "grad_norm": 1.1171619892120361, + "learning_rate": 8.661620114509308e-06, + "loss": 1.8944, + "mean_token_accuracy": 0.5666996240615845, + "num_tokens": 5635336151.0, + "step": 11024 + }, + { + "epoch": 2.9813412655489455, + "grad_norm": 1.2156412601470947, + "learning_rate": 8.66009780244734e-06, + "loss": 1.885, + "mean_token_accuracy": 0.559420108795166, + "num_tokens": 5635860334.0, + "step": 11025 + }, + { + "epoch": 2.981611681990265, + "grad_norm": 1.2866008281707764, + "learning_rate": 8.658575562180455e-06, + "loss": 1.8319, + "mean_token_accuracy": 0.5799627304077148, + "num_tokens": 5636384564.0, + "step": 11026 + }, + { + "epoch": 2.981882098431585, + "grad_norm": 1.129035234451294, + "learning_rate": 8.65705339375536e-06, + "loss": 1.8314, + "mean_token_accuracy": 0.5615803003311157, + "num_tokens": 5636908783.0, + "step": 11027 + }, + { + "epoch": 2.9821525148729044, + "grad_norm": 1.0375887155532837, + "learning_rate": 8.65553129721875e-06, + "loss": 1.9166, + "mean_token_accuracy": 0.5639393329620361, + "num_tokens": 5637419025.0, + "step": 11028 + }, + { + "epoch": 2.9824229313142236, + "grad_norm": 0.990028440952301, + "learning_rate": 8.654009272617342e-06, + "loss": 1.8048, + "mean_token_accuracy": 0.5546096563339233, + "num_tokens": 5637943304.0, + "step": 11029 + }, + { + "epoch": 2.9826933477555437, + "grad_norm": 1.0128382444381714, + "learning_rate": 8.652487319997826e-06, + "loss": 1.6722, + "mean_token_accuracy": 0.5856267213821411, + "num_tokens": 5638467548.0, + "step": 11030 + }, + { + "epoch": 2.982963764196863, + "grad_norm": 1.274462103843689, + "learning_rate": 8.650965439406901e-06, + "loss": 1.8694, + "mean_token_accuracy": 0.5726867914199829, + "num_tokens": 5638970719.0, + "step": 11031 + }, + { + "epoch": 2.983234180638183, + "grad_norm": 1.0324947834014893, + "learning_rate": 8.649443630891267e-06, + "loss": 1.8242, + "mean_token_accuracy": 0.5648150444030762, + "num_tokens": 5639494853.0, + "step": 11032 + }, + { + "epoch": 2.983504597079502, + "grad_norm": 1.0601000785827637, + "learning_rate": 8.647921894497616e-06, + "loss": 1.8355, + "mean_token_accuracy": 0.5823884606361389, + "num_tokens": 5639986002.0, + "step": 11033 + }, + { + "epoch": 2.9837750135208223, + "grad_norm": 0.9796707034111023, + "learning_rate": 8.646400230272633e-06, + "loss": 1.9443, + "mean_token_accuracy": 0.5691609978675842, + "num_tokens": 5640468446.0, + "step": 11034 + }, + { + "epoch": 2.9840454299621415, + "grad_norm": 0.9708990454673767, + "learning_rate": 8.644878638263016e-06, + "loss": 1.8107, + "mean_token_accuracy": 0.5551812648773193, + "num_tokens": 5640992643.0, + "step": 11035 + }, + { + "epoch": 2.9843158464034616, + "grad_norm": 0.9928115606307983, + "learning_rate": 8.643357118515449e-06, + "loss": 1.697, + "mean_token_accuracy": 0.6150468587875366, + "num_tokens": 5641516893.0, + "step": 11036 + }, + { + "epoch": 2.984586262844781, + "grad_norm": 1.145051121711731, + "learning_rate": 8.641835671076613e-06, + "loss": 1.7782, + "mean_token_accuracy": 0.5799935460090637, + "num_tokens": 5642041118.0, + "step": 11037 + }, + { + "epoch": 2.9848566792861004, + "grad_norm": 1.0003705024719238, + "learning_rate": 8.640314295993198e-06, + "loss": 1.8404, + "mean_token_accuracy": 0.5822837352752686, + "num_tokens": 5642565266.0, + "step": 11038 + }, + { + "epoch": 2.98512709572742, + "grad_norm": 1.064913034439087, + "learning_rate": 8.638792993311875e-06, + "loss": 1.8692, + "mean_token_accuracy": 0.5789150595664978, + "num_tokens": 5643077423.0, + "step": 11039 + }, + { + "epoch": 2.9853975121687397, + "grad_norm": 1.030504584312439, + "learning_rate": 8.637271763079328e-06, + "loss": 1.9175, + "mean_token_accuracy": 0.541012167930603, + "num_tokens": 5643601682.0, + "step": 11040 + }, + { + "epoch": 2.9856679286100594, + "grad_norm": 0.44142216444015503, + "learning_rate": 8.635750605342234e-06, + "loss": 1.0983, + "mean_token_accuracy": 0.7055590152740479, + "num_tokens": 5644083536.0, + "step": 11041 + }, + { + "epoch": 2.985938345051379, + "grad_norm": 1.2087786197662354, + "learning_rate": 8.634229520147262e-06, + "loss": 1.7667, + "mean_token_accuracy": 0.5898277163505554, + "num_tokens": 5644595634.0, + "step": 11042 + }, + { + "epoch": 2.9862087614926986, + "grad_norm": 1.2698901891708374, + "learning_rate": 8.632708507541085e-06, + "loss": 1.8529, + "mean_token_accuracy": 0.5793251991271973, + "num_tokens": 5645063829.0, + "step": 11043 + }, + { + "epoch": 2.9864791779340183, + "grad_norm": 1.0232157707214355, + "learning_rate": 8.631187567570377e-06, + "loss": 1.9244, + "mean_token_accuracy": 0.57041335105896, + "num_tokens": 5645588030.0, + "step": 11044 + }, + { + "epoch": 2.986749594375338, + "grad_norm": 1.578427791595459, + "learning_rate": 8.629666700281796e-06, + "loss": 1.7644, + "mean_token_accuracy": 0.5739734172821045, + "num_tokens": 5646108657.0, + "step": 11045 + }, + { + "epoch": 2.9870200108166576, + "grad_norm": 1.038320779800415, + "learning_rate": 8.628145905722015e-06, + "loss": 1.5977, + "mean_token_accuracy": 0.6063291430473328, + "num_tokens": 5646632870.0, + "step": 11046 + }, + { + "epoch": 2.987290427257977, + "grad_norm": 1.274828314781189, + "learning_rate": 8.626625183937688e-06, + "loss": 1.8362, + "mean_token_accuracy": 0.5668879151344299, + "num_tokens": 5647157018.0, + "step": 11047 + }, + { + "epoch": 2.987560843699297, + "grad_norm": 0.9987815022468567, + "learning_rate": 8.625104534975482e-06, + "loss": 1.7099, + "mean_token_accuracy": 0.5931601524353027, + "num_tokens": 5647622693.0, + "step": 11048 + }, + { + "epoch": 2.9878312601406165, + "grad_norm": 0.8807297945022583, + "learning_rate": 8.623583958882056e-06, + "loss": 1.4894, + "mean_token_accuracy": 0.6513856649398804, + "num_tokens": 5648123421.0, + "step": 11049 + }, + { + "epoch": 2.988101676581936, + "grad_norm": 1.1037194728851318, + "learning_rate": 8.622063455704062e-06, + "loss": 1.9528, + "mean_token_accuracy": 0.5537924766540527, + "num_tokens": 5648623037.0, + "step": 11050 + }, + { + "epoch": 2.988372093023256, + "grad_norm": 1.2950435876846313, + "learning_rate": 8.62054302548815e-06, + "loss": 1.9576, + "mean_token_accuracy": 0.5400403738021851, + "num_tokens": 5649147184.0, + "step": 11051 + }, + { + "epoch": 2.9886425094645754, + "grad_norm": 1.0931024551391602, + "learning_rate": 8.61902266828098e-06, + "loss": 1.9065, + "mean_token_accuracy": 0.5687368512153625, + "num_tokens": 5649671432.0, + "step": 11052 + }, + { + "epoch": 2.988912925905895, + "grad_norm": 0.940433144569397, + "learning_rate": 8.617502384129195e-06, + "loss": 1.8408, + "mean_token_accuracy": 0.5668777823448181, + "num_tokens": 5650195695.0, + "step": 11053 + }, + { + "epoch": 2.9891833423472147, + "grad_norm": 1.1716383695602417, + "learning_rate": 8.615982173079442e-06, + "loss": 1.9256, + "mean_token_accuracy": 0.5246140956878662, + "num_tokens": 5650719930.0, + "step": 11054 + }, + { + "epoch": 2.9894537587885344, + "grad_norm": 1.2809555530548096, + "learning_rate": 8.614462035178369e-06, + "loss": 1.9306, + "mean_token_accuracy": 0.566240668296814, + "num_tokens": 5651244115.0, + "step": 11055 + }, + { + "epoch": 2.989724175229854, + "grad_norm": 1.0838041305541992, + "learning_rate": 8.612941970472616e-06, + "loss": 1.8983, + "mean_token_accuracy": 0.5810558199882507, + "num_tokens": 5651716358.0, + "step": 11056 + }, + { + "epoch": 2.9899945916711737, + "grad_norm": 1.13059401512146, + "learning_rate": 8.61142197900882e-06, + "loss": 1.818, + "mean_token_accuracy": 0.602180540561676, + "num_tokens": 5652154088.0, + "step": 11057 + }, + { + "epoch": 2.9902650081124933, + "grad_norm": 1.149804711341858, + "learning_rate": 8.609902060833626e-06, + "loss": 1.9067, + "mean_token_accuracy": 0.5673155188560486, + "num_tokens": 5652678323.0, + "step": 11058 + }, + { + "epoch": 2.990535424553813, + "grad_norm": 0.9690600633621216, + "learning_rate": 8.608382215993667e-06, + "loss": 1.8526, + "mean_token_accuracy": 0.5602637529373169, + "num_tokens": 5653202604.0, + "step": 11059 + }, + { + "epoch": 2.9908058409951326, + "grad_norm": 0.9662929177284241, + "learning_rate": 8.606862444535569e-06, + "loss": 1.9364, + "mean_token_accuracy": 0.5307745337486267, + "num_tokens": 5653726839.0, + "step": 11060 + }, + { + "epoch": 2.9910762574364522, + "grad_norm": 0.4503423273563385, + "learning_rate": 8.605342746505971e-06, + "loss": 1.1825, + "mean_token_accuracy": 0.6743605136871338, + "num_tokens": 5654251004.0, + "step": 11061 + }, + { + "epoch": 2.991346673877772, + "grad_norm": 1.2007547616958618, + "learning_rate": 8.603823121951503e-06, + "loss": 1.7756, + "mean_token_accuracy": 0.5732059478759766, + "num_tokens": 5654775240.0, + "step": 11062 + }, + { + "epoch": 2.9916170903190915, + "grad_norm": 1.105377197265625, + "learning_rate": 8.602303570918784e-06, + "loss": 1.7785, + "mean_token_accuracy": 0.5855756402015686, + "num_tokens": 5655299278.0, + "step": 11063 + }, + { + "epoch": 2.991887506760411, + "grad_norm": 1.1184844970703125, + "learning_rate": 8.600784093454446e-06, + "loss": 1.784, + "mean_token_accuracy": 0.5771818161010742, + "num_tokens": 5655823503.0, + "step": 11064 + }, + { + "epoch": 2.992157923201731, + "grad_norm": 0.9980412721633911, + "learning_rate": 8.599264689605105e-06, + "loss": 1.8373, + "mean_token_accuracy": 0.5798904299736023, + "num_tokens": 5656347711.0, + "step": 11065 + }, + { + "epoch": 2.9924283396430504, + "grad_norm": 0.9250726103782654, + "learning_rate": 8.597745359417386e-06, + "loss": 1.8008, + "mean_token_accuracy": 0.5983213186264038, + "num_tokens": 5656825826.0, + "step": 11066 + }, + { + "epoch": 2.99269875608437, + "grad_norm": 1.438604474067688, + "learning_rate": 8.596226102937902e-06, + "loss": 1.9808, + "mean_token_accuracy": 0.5475487112998962, + "num_tokens": 5657350007.0, + "step": 11067 + }, + { + "epoch": 2.9929691725256897, + "grad_norm": 1.280293583869934, + "learning_rate": 8.594706920213269e-06, + "loss": 1.8995, + "mean_token_accuracy": 0.570892333984375, + "num_tokens": 5657874240.0, + "step": 11068 + }, + { + "epoch": 2.9932395889670094, + "grad_norm": 0.9102675914764404, + "learning_rate": 8.593187811290104e-06, + "loss": 1.8706, + "mean_token_accuracy": 0.5768985748291016, + "num_tokens": 5658398364.0, + "step": 11069 + }, + { + "epoch": 2.9935100054083286, + "grad_norm": 1.277182698249817, + "learning_rate": 8.591668776215016e-06, + "loss": 1.8063, + "mean_token_accuracy": 0.5663100481033325, + "num_tokens": 5658922631.0, + "step": 11070 + }, + { + "epoch": 2.9937804218496487, + "grad_norm": 1.1093684434890747, + "learning_rate": 8.590149815034607e-06, + "loss": 1.8744, + "mean_token_accuracy": 0.554985761642456, + "num_tokens": 5659446719.0, + "step": 11071 + }, + { + "epoch": 2.994050838290968, + "grad_norm": 1.0861884355545044, + "learning_rate": 8.588630927795494e-06, + "loss": 1.9933, + "mean_token_accuracy": 0.546851396560669, + "num_tokens": 5659918350.0, + "step": 11072 + }, + { + "epoch": 2.994321254732288, + "grad_norm": 1.2062712907791138, + "learning_rate": 8.587112114544273e-06, + "loss": 1.9543, + "mean_token_accuracy": 0.553299069404602, + "num_tokens": 5660442626.0, + "step": 11073 + }, + { + "epoch": 2.994591671173607, + "grad_norm": 1.2287380695343018, + "learning_rate": 8.585593375327548e-06, + "loss": 1.8818, + "mean_token_accuracy": 0.5865957140922546, + "num_tokens": 5660966896.0, + "step": 11074 + }, + { + "epoch": 2.9948620876149272, + "grad_norm": 0.9244049787521362, + "learning_rate": 8.584074710191919e-06, + "loss": 1.7981, + "mean_token_accuracy": 0.5849756002426147, + "num_tokens": 5661491132.0, + "step": 11075 + }, + { + "epoch": 2.9951325040562464, + "grad_norm": 1.1869651079177856, + "learning_rate": 8.582556119183984e-06, + "loss": 1.8664, + "mean_token_accuracy": 0.5679559111595154, + "num_tokens": 5662015391.0, + "step": 11076 + }, + { + "epoch": 2.9954029204975665, + "grad_norm": 1.5372140407562256, + "learning_rate": 8.581037602350336e-06, + "loss": 1.7791, + "mean_token_accuracy": 0.568227231502533, + "num_tokens": 5662522116.0, + "step": 11077 + }, + { + "epoch": 2.9956733369388857, + "grad_norm": 1.1901216506958008, + "learning_rate": 8.579519159737568e-06, + "loss": 1.8341, + "mean_token_accuracy": 0.575918436050415, + "num_tokens": 5663046351.0, + "step": 11078 + }, + { + "epoch": 2.9959437533802054, + "grad_norm": 1.1150031089782715, + "learning_rate": 8.578000791392271e-06, + "loss": 1.988, + "mean_token_accuracy": 0.5702762007713318, + "num_tokens": 5663506533.0, + "step": 11079 + }, + { + "epoch": 2.996214169821525, + "grad_norm": 1.2037863731384277, + "learning_rate": 8.576482497361027e-06, + "loss": 1.9443, + "mean_token_accuracy": 0.5467740297317505, + "num_tokens": 5664030737.0, + "step": 11080 + }, + { + "epoch": 2.9964845862628446, + "grad_norm": 0.41236233711242676, + "learning_rate": 8.574964277690434e-06, + "loss": 1.1387, + "mean_token_accuracy": 0.6951555013656616, + "num_tokens": 5664554953.0, + "step": 11081 + }, + { + "epoch": 2.9967550027041643, + "grad_norm": 1.4119927883148193, + "learning_rate": 8.573446132427066e-06, + "loss": 1.8324, + "mean_token_accuracy": 0.5781181454658508, + "num_tokens": 5665079231.0, + "step": 11082 + }, + { + "epoch": 2.997025419145484, + "grad_norm": 1.1886131763458252, + "learning_rate": 8.571928061617503e-06, + "loss": 1.9278, + "mean_token_accuracy": 0.559345006942749, + "num_tokens": 5665575922.0, + "step": 11083 + }, + { + "epoch": 2.9972958355868036, + "grad_norm": 1.1754372119903564, + "learning_rate": 8.570410065308332e-06, + "loss": 1.9478, + "mean_token_accuracy": 0.5426034927368164, + "num_tokens": 5666100082.0, + "step": 11084 + }, + { + "epoch": 2.997566252028123, + "grad_norm": 1.312422752380371, + "learning_rate": 8.568892143546123e-06, + "loss": 1.9158, + "mean_token_accuracy": 0.598736047744751, + "num_tokens": 5666561802.0, + "step": 11085 + }, + { + "epoch": 2.997836668469443, + "grad_norm": 1.2039532661437988, + "learning_rate": 8.567374296377456e-06, + "loss": 2.0466, + "mean_token_accuracy": 0.5339058637619019, + "num_tokens": 5667085880.0, + "step": 11086 + }, + { + "epoch": 2.9981070849107625, + "grad_norm": 0.9880210757255554, + "learning_rate": 8.565856523848897e-06, + "loss": 1.8388, + "mean_token_accuracy": 0.568562388420105, + "num_tokens": 5667610104.0, + "step": 11087 + }, + { + "epoch": 2.998377501352082, + "grad_norm": 1.2137420177459717, + "learning_rate": 8.564338826007017e-06, + "loss": 1.9022, + "mean_token_accuracy": 0.5659273266792297, + "num_tokens": 5668134228.0, + "step": 11088 + }, + { + "epoch": 2.998647917793402, + "grad_norm": 1.214917778968811, + "learning_rate": 8.562821202898389e-06, + "loss": 1.8822, + "mean_token_accuracy": 0.5734578371047974, + "num_tokens": 5668658496.0, + "step": 11089 + }, + { + "epoch": 2.9989183342347214, + "grad_norm": 1.1323614120483398, + "learning_rate": 8.561303654569567e-06, + "loss": 1.7681, + "mean_token_accuracy": 0.5885987281799316, + "num_tokens": 5669158359.0, + "step": 11090 + }, + { + "epoch": 2.999188750676041, + "grad_norm": 0.9477324485778809, + "learning_rate": 8.559786181067125e-06, + "loss": 1.9286, + "mean_token_accuracy": 0.5404131412506104, + "num_tokens": 5669682571.0, + "step": 11091 + }, + { + "epoch": 2.9994591671173607, + "grad_norm": 1.370483160018921, + "learning_rate": 8.55826878243762e-06, + "loss": 1.934, + "mean_token_accuracy": 0.5689921379089355, + "num_tokens": 5670206812.0, + "step": 11092 + }, + { + "epoch": 2.9997295835586804, + "grad_norm": 1.3443152904510498, + "learning_rate": 8.556751458727608e-06, + "loss": 1.7624, + "mean_token_accuracy": 0.5858379602432251, + "num_tokens": 5670711300.0, + "step": 11093 + }, + { + "epoch": 3.0, + "grad_norm": 1.0319193601608276, + "learning_rate": 8.555234209983642e-06, + "loss": 1.823, + "mean_token_accuracy": 0.5861802697181702, + "num_tokens": 5670973437.0, + "step": 11094 + }, + { + "epoch": 3.0002704164413196, + "grad_norm": 1.1757069826126099, + "learning_rate": 8.553717036252287e-06, + "loss": 1.8574, + "mean_token_accuracy": 0.5689670443534851, + "num_tokens": 5671459759.0, + "step": 11095 + }, + { + "epoch": 3.0005408328826393, + "grad_norm": 1.2201794385910034, + "learning_rate": 8.552199937580086e-06, + "loss": 1.8679, + "mean_token_accuracy": 0.5736418962478638, + "num_tokens": 5671926205.0, + "step": 11096 + }, + { + "epoch": 3.000811249323959, + "grad_norm": 0.9693132042884827, + "learning_rate": 8.550682914013585e-06, + "loss": 1.7928, + "mean_token_accuracy": 0.575217068195343, + "num_tokens": 5672450349.0, + "step": 11097 + }, + { + "epoch": 3.0010816657652786, + "grad_norm": 1.569258689880371, + "learning_rate": 8.54916596559934e-06, + "loss": 1.8486, + "mean_token_accuracy": 0.5616324543952942, + "num_tokens": 5672937963.0, + "step": 11098 + }, + { + "epoch": 3.001352082206598, + "grad_norm": 1.221989393234253, + "learning_rate": 8.54764909238389e-06, + "loss": 1.9293, + "mean_token_accuracy": 0.5598287582397461, + "num_tokens": 5673462158.0, + "step": 11099 + }, + { + "epoch": 3.001622498647918, + "grad_norm": 1.1200675964355469, + "learning_rate": 8.546132294413775e-06, + "loss": 1.8333, + "mean_token_accuracy": 0.5686613321304321, + "num_tokens": 5673986400.0, + "step": 11100 + }, + { + "epoch": 3.0018929150892375, + "grad_norm": 0.3812822699546814, + "learning_rate": 8.544615571735539e-06, + "loss": 1.1951, + "mean_token_accuracy": 0.6969600915908813, + "num_tokens": 5674447371.0, + "step": 11101 + }, + { + "epoch": 3.002163331530557, + "grad_norm": 1.1989659070968628, + "learning_rate": 8.543098924395718e-06, + "loss": 1.8668, + "mean_token_accuracy": 0.5737754106521606, + "num_tokens": 5674971473.0, + "step": 11102 + }, + { + "epoch": 3.002433747971877, + "grad_norm": 1.4887701272964478, + "learning_rate": 8.541582352440848e-06, + "loss": 1.9287, + "mean_token_accuracy": 0.5544208288192749, + "num_tokens": 5675495661.0, + "step": 11103 + }, + { + "epoch": 3.0027041644131964, + "grad_norm": 0.8777472972869873, + "learning_rate": 8.540065855917461e-06, + "loss": 1.8835, + "mean_token_accuracy": 0.5695021748542786, + "num_tokens": 5676019915.0, + "step": 11104 + }, + { + "epoch": 3.002974580854516, + "grad_norm": 1.1105107069015503, + "learning_rate": 8.538549434872087e-06, + "loss": 1.874, + "mean_token_accuracy": 0.5588815212249756, + "num_tokens": 5676544063.0, + "step": 11105 + }, + { + "epoch": 3.0032449972958357, + "grad_norm": 1.145887017250061, + "learning_rate": 8.537033089351256e-06, + "loss": 1.7702, + "mean_token_accuracy": 0.5741887092590332, + "num_tokens": 5677068214.0, + "step": 11106 + }, + { + "epoch": 3.0035154137371554, + "grad_norm": 0.965385913848877, + "learning_rate": 8.535516819401493e-06, + "loss": 1.8724, + "mean_token_accuracy": 0.5794326066970825, + "num_tokens": 5677592395.0, + "step": 11107 + }, + { + "epoch": 3.003785830178475, + "grad_norm": 0.9132204055786133, + "learning_rate": 8.534000625069319e-06, + "loss": 1.8308, + "mean_token_accuracy": 0.580237627029419, + "num_tokens": 5678056036.0, + "step": 11108 + }, + { + "epoch": 3.0040562466197946, + "grad_norm": 1.2366656064987183, + "learning_rate": 8.532484506401261e-06, + "loss": 2.0284, + "mean_token_accuracy": 0.5284818410873413, + "num_tokens": 5678535753.0, + "step": 11109 + }, + { + "epoch": 3.0043266630611143, + "grad_norm": 1.1961315870285034, + "learning_rate": 8.530968463443832e-06, + "loss": 1.7364, + "mean_token_accuracy": 0.5913293361663818, + "num_tokens": 5679020906.0, + "step": 11110 + }, + { + "epoch": 3.004597079502434, + "grad_norm": 1.0137214660644531, + "learning_rate": 8.529452496243553e-06, + "loss": 1.8218, + "mean_token_accuracy": 0.5733932256698608, + "num_tokens": 5679490751.0, + "step": 11111 + }, + { + "epoch": 3.0048674959437536, + "grad_norm": 0.9987602829933167, + "learning_rate": 8.527936604846938e-06, + "loss": 1.7464, + "mean_token_accuracy": 0.5921638011932373, + "num_tokens": 5680015028.0, + "step": 11112 + }, + { + "epoch": 3.005137912385073, + "grad_norm": 1.1681548357009888, + "learning_rate": 8.526420789300498e-06, + "loss": 1.7516, + "mean_token_accuracy": 0.5963636040687561, + "num_tokens": 5680485875.0, + "step": 11113 + }, + { + "epoch": 3.005408328826393, + "grad_norm": 0.9787306785583496, + "learning_rate": 8.524905049650738e-06, + "loss": 1.903, + "mean_token_accuracy": 0.5632998943328857, + "num_tokens": 5681010074.0, + "step": 11114 + }, + { + "epoch": 3.005678745267712, + "grad_norm": 1.1863213777542114, + "learning_rate": 8.523389385944174e-06, + "loss": 1.7978, + "mean_token_accuracy": 0.5615848302841187, + "num_tokens": 5681534283.0, + "step": 11115 + }, + { + "epoch": 3.0059491617090317, + "grad_norm": 1.2701942920684814, + "learning_rate": 8.521873798227307e-06, + "loss": 1.9275, + "mean_token_accuracy": 0.5624474287033081, + "num_tokens": 5682058542.0, + "step": 11116 + }, + { + "epoch": 3.0062195781503513, + "grad_norm": 1.1158552169799805, + "learning_rate": 8.520358286546637e-06, + "loss": 1.813, + "mean_token_accuracy": 0.584689736366272, + "num_tokens": 5682522685.0, + "step": 11117 + }, + { + "epoch": 3.006489994591671, + "grad_norm": 0.9651569128036499, + "learning_rate": 8.518842850948667e-06, + "loss": 1.9462, + "mean_token_accuracy": 0.5333570837974548, + "num_tokens": 5683046899.0, + "step": 11118 + }, + { + "epoch": 3.0067604110329906, + "grad_norm": 1.0991857051849365, + "learning_rate": 8.517327491479901e-06, + "loss": 1.7054, + "mean_token_accuracy": 0.5948219299316406, + "num_tokens": 5683534113.0, + "step": 11119 + }, + { + "epoch": 3.0070308274743103, + "grad_norm": 1.2570899724960327, + "learning_rate": 8.515812208186819e-06, + "loss": 1.9309, + "mean_token_accuracy": 0.564480185508728, + "num_tokens": 5684011113.0, + "step": 11120 + }, + { + "epoch": 3.00730124391563, + "grad_norm": 0.43904221057891846, + "learning_rate": 8.514297001115931e-06, + "loss": 1.0642, + "mean_token_accuracy": 0.7225483655929565, + "num_tokens": 5684440593.0, + "step": 11121 + }, + { + "epoch": 3.0075716603569496, + "grad_norm": 1.1758257150650024, + "learning_rate": 8.512781870313723e-06, + "loss": 1.9558, + "mean_token_accuracy": 0.565178632736206, + "num_tokens": 5684867939.0, + "step": 11122 + }, + { + "epoch": 3.007842076798269, + "grad_norm": 1.2631770372390747, + "learning_rate": 8.511266815826676e-06, + "loss": 1.8318, + "mean_token_accuracy": 0.5674209594726562, + "num_tokens": 5685392112.0, + "step": 11123 + }, + { + "epoch": 3.008112493239589, + "grad_norm": 1.1772944927215576, + "learning_rate": 8.509751837701284e-06, + "loss": 1.827, + "mean_token_accuracy": 0.5896561145782471, + "num_tokens": 5685883063.0, + "step": 11124 + }, + { + "epoch": 3.0083829096809085, + "grad_norm": 0.8728044629096985, + "learning_rate": 8.508236935984027e-06, + "loss": 1.9319, + "mean_token_accuracy": 0.5659606456756592, + "num_tokens": 5686407343.0, + "step": 11125 + }, + { + "epoch": 3.008653326122228, + "grad_norm": 1.3121817111968994, + "learning_rate": 8.506722110721388e-06, + "loss": 1.8903, + "mean_token_accuracy": 0.5752042531967163, + "num_tokens": 5686931462.0, + "step": 11126 + }, + { + "epoch": 3.0089237425635478, + "grad_norm": 1.1454343795776367, + "learning_rate": 8.50520736195985e-06, + "loss": 1.7864, + "mean_token_accuracy": 0.5879942774772644, + "num_tokens": 5687455692.0, + "step": 11127 + }, + { + "epoch": 3.0091941590048674, + "grad_norm": 1.0158419609069824, + "learning_rate": 8.503692689745888e-06, + "loss": 1.9311, + "mean_token_accuracy": 0.56714928150177, + "num_tokens": 5687934707.0, + "step": 11128 + }, + { + "epoch": 3.009464575446187, + "grad_norm": 1.0092324018478394, + "learning_rate": 8.502178094125972e-06, + "loss": 1.8719, + "mean_token_accuracy": 0.5911053419113159, + "num_tokens": 5688362309.0, + "step": 11129 + }, + { + "epoch": 3.0097349918875067, + "grad_norm": 1.2964587211608887, + "learning_rate": 8.50066357514658e-06, + "loss": 1.8301, + "mean_token_accuracy": 0.5870503783226013, + "num_tokens": 5688857803.0, + "step": 11130 + }, + { + "epoch": 3.0100054083288263, + "grad_norm": 1.0273327827453613, + "learning_rate": 8.499149132854177e-06, + "loss": 1.8516, + "mean_token_accuracy": 0.5744622349739075, + "num_tokens": 5689345947.0, + "step": 11131 + }, + { + "epoch": 3.010275824770146, + "grad_norm": 0.9858922958374023, + "learning_rate": 8.497634767295235e-06, + "loss": 1.8022, + "mean_token_accuracy": 0.5833942890167236, + "num_tokens": 5689870137.0, + "step": 11132 + }, + { + "epoch": 3.0105462412114656, + "grad_norm": 1.170900821685791, + "learning_rate": 8.496120478516216e-06, + "loss": 1.8395, + "mean_token_accuracy": 0.5591105818748474, + "num_tokens": 5690305259.0, + "step": 11133 + }, + { + "epoch": 3.0108166576527853, + "grad_norm": 1.0447603464126587, + "learning_rate": 8.494606266563586e-06, + "loss": 1.8315, + "mean_token_accuracy": 0.5513302087783813, + "num_tokens": 5690824620.0, + "step": 11134 + }, + { + "epoch": 3.011087074094105, + "grad_norm": 1.268407940864563, + "learning_rate": 8.493092131483803e-06, + "loss": 1.7463, + "mean_token_accuracy": 0.5864891409873962, + "num_tokens": 5691348870.0, + "step": 11135 + }, + { + "epoch": 3.0113574905354246, + "grad_norm": 0.9818552732467651, + "learning_rate": 8.491578073323326e-06, + "loss": 1.8271, + "mean_token_accuracy": 0.5764493942260742, + "num_tokens": 5691873001.0, + "step": 11136 + }, + { + "epoch": 3.011627906976744, + "grad_norm": 1.099510908126831, + "learning_rate": 8.49006409212861e-06, + "loss": 1.831, + "mean_token_accuracy": 0.551335334777832, + "num_tokens": 5692397191.0, + "step": 11137 + }, + { + "epoch": 3.011898323418064, + "grad_norm": 1.0242477655410767, + "learning_rate": 8.48855018794611e-06, + "loss": 1.9245, + "mean_token_accuracy": 0.5751920938491821, + "num_tokens": 5692921469.0, + "step": 11138 + }, + { + "epoch": 3.0121687398593835, + "grad_norm": 0.8737895488739014, + "learning_rate": 8.487036360822273e-06, + "loss": 1.9125, + "mean_token_accuracy": 0.5575202107429504, + "num_tokens": 5693445658.0, + "step": 11139 + }, + { + "epoch": 3.012439156300703, + "grad_norm": 1.0344057083129883, + "learning_rate": 8.48552261080355e-06, + "loss": 1.8879, + "mean_token_accuracy": 0.5661132335662842, + "num_tokens": 5693931340.0, + "step": 11140 + }, + { + "epoch": 3.012709572742023, + "grad_norm": 0.39461714029312134, + "learning_rate": 8.484008937936392e-06, + "loss": 1.1042, + "mean_token_accuracy": 0.693333625793457, + "num_tokens": 5694455578.0, + "step": 11141 + }, + { + "epoch": 3.0129799891833424, + "grad_norm": 1.308339238166809, + "learning_rate": 8.48249534226724e-06, + "loss": 1.8902, + "mean_token_accuracy": 0.5653100609779358, + "num_tokens": 5694968896.0, + "step": 11142 + }, + { + "epoch": 3.013250405624662, + "grad_norm": 1.1607074737548828, + "learning_rate": 8.480981823842529e-06, + "loss": 1.79, + "mean_token_accuracy": 0.5717226266860962, + "num_tokens": 5695493117.0, + "step": 11143 + }, + { + "epoch": 3.0135208220659817, + "grad_norm": 1.0856175422668457, + "learning_rate": 8.479468382708708e-06, + "loss": 1.9299, + "mean_token_accuracy": 0.5571387410163879, + "num_tokens": 5696017364.0, + "step": 11144 + }, + { + "epoch": 3.0137912385073014, + "grad_norm": 1.0598818063735962, + "learning_rate": 8.477955018912207e-06, + "loss": 1.8667, + "mean_token_accuracy": 0.5626665949821472, + "num_tokens": 5696541577.0, + "step": 11145 + }, + { + "epoch": 3.014061654948621, + "grad_norm": 1.2285865545272827, + "learning_rate": 8.476441732499462e-06, + "loss": 1.9676, + "mean_token_accuracy": 0.5534806251525879, + "num_tokens": 5697065785.0, + "step": 11146 + }, + { + "epoch": 3.0143320713899406, + "grad_norm": 1.135931372642517, + "learning_rate": 8.474928523516908e-06, + "loss": 1.8967, + "mean_token_accuracy": 0.5649301409721375, + "num_tokens": 5697589955.0, + "step": 11147 + }, + { + "epoch": 3.0146024878312603, + "grad_norm": 1.1912837028503418, + "learning_rate": 8.473415392010973e-06, + "loss": 1.9193, + "mean_token_accuracy": 0.5538127422332764, + "num_tokens": 5698082651.0, + "step": 11148 + }, + { + "epoch": 3.01487290427258, + "grad_norm": 1.1268997192382812, + "learning_rate": 8.471902338028082e-06, + "loss": 1.8365, + "mean_token_accuracy": 0.5918124914169312, + "num_tokens": 5698568621.0, + "step": 11149 + }, + { + "epoch": 3.0151433207138996, + "grad_norm": 1.0383037328720093, + "learning_rate": 8.470389361614663e-06, + "loss": 1.776, + "mean_token_accuracy": 0.5578855872154236, + "num_tokens": 5699042802.0, + "step": 11150 + }, + { + "epoch": 3.015413737155219, + "grad_norm": 0.9872010350227356, + "learning_rate": 8.46887646281714e-06, + "loss": 1.8303, + "mean_token_accuracy": 0.5714688897132874, + "num_tokens": 5699566826.0, + "step": 11151 + }, + { + "epoch": 3.015684153596539, + "grad_norm": 1.1401511430740356, + "learning_rate": 8.467363641681925e-06, + "loss": 1.8383, + "mean_token_accuracy": 0.5590673089027405, + "num_tokens": 5700074877.0, + "step": 11152 + }, + { + "epoch": 3.0159545700378585, + "grad_norm": 1.221347451210022, + "learning_rate": 8.465850898255445e-06, + "loss": 1.8905, + "mean_token_accuracy": 0.5598161220550537, + "num_tokens": 5700560309.0, + "step": 11153 + }, + { + "epoch": 3.016224986479178, + "grad_norm": 1.198479413986206, + "learning_rate": 8.46433823258411e-06, + "loss": 1.8226, + "mean_token_accuracy": 0.5669225454330444, + "num_tokens": 5701084538.0, + "step": 11154 + }, + { + "epoch": 3.016495402920498, + "grad_norm": 1.1427171230316162, + "learning_rate": 8.462825644714338e-06, + "loss": 1.9056, + "mean_token_accuracy": 0.5738922357559204, + "num_tokens": 5701608740.0, + "step": 11155 + }, + { + "epoch": 3.016765819361817, + "grad_norm": 1.1300005912780762, + "learning_rate": 8.461313134692535e-06, + "loss": 1.8775, + "mean_token_accuracy": 0.5727272033691406, + "num_tokens": 5702086225.0, + "step": 11156 + }, + { + "epoch": 3.0170362358031366, + "grad_norm": 1.105491280555725, + "learning_rate": 8.459800702565108e-06, + "loss": 1.8442, + "mean_token_accuracy": 0.571151852607727, + "num_tokens": 5702588469.0, + "step": 11157 + }, + { + "epoch": 3.0173066522444563, + "grad_norm": 1.148983120918274, + "learning_rate": 8.458288348378472e-06, + "loss": 1.9305, + "mean_token_accuracy": 0.5730042457580566, + "num_tokens": 5703070963.0, + "step": 11158 + }, + { + "epoch": 3.017577068685776, + "grad_norm": 1.0735846757888794, + "learning_rate": 8.456776072179023e-06, + "loss": 1.8794, + "mean_token_accuracy": 0.5689865350723267, + "num_tokens": 5703595247.0, + "step": 11159 + }, + { + "epoch": 3.0178474851270956, + "grad_norm": 1.0050063133239746, + "learning_rate": 8.455263874013158e-06, + "loss": 1.8904, + "mean_token_accuracy": 0.5615890622138977, + "num_tokens": 5704119447.0, + "step": 11160 + }, + { + "epoch": 3.018117901568415, + "grad_norm": 0.4005328416824341, + "learning_rate": 8.453751753927283e-06, + "loss": 1.1581, + "mean_token_accuracy": 0.6961034536361694, + "num_tokens": 5704634337.0, + "step": 11161 + }, + { + "epoch": 3.018388318009735, + "grad_norm": 1.3055299520492554, + "learning_rate": 8.452239711967796e-06, + "loss": 1.8212, + "mean_token_accuracy": 0.5768816471099854, + "num_tokens": 5705154019.0, + "step": 11162 + }, + { + "epoch": 3.0186587344510545, + "grad_norm": 1.220829725265503, + "learning_rate": 8.450727748181081e-06, + "loss": 1.8753, + "mean_token_accuracy": 0.569348931312561, + "num_tokens": 5705678300.0, + "step": 11163 + }, + { + "epoch": 3.018929150892374, + "grad_norm": 1.1322685480117798, + "learning_rate": 8.44921586261354e-06, + "loss": 1.8866, + "mean_token_accuracy": 0.5742062330245972, + "num_tokens": 5706202486.0, + "step": 11164 + }, + { + "epoch": 3.0191995673336938, + "grad_norm": 0.9419131875038147, + "learning_rate": 8.447704055311558e-06, + "loss": 1.8402, + "mean_token_accuracy": 0.5867927074432373, + "num_tokens": 5706721415.0, + "step": 11165 + }, + { + "epoch": 3.0194699837750134, + "grad_norm": 1.423557162284851, + "learning_rate": 8.446192326321518e-06, + "loss": 1.9794, + "mean_token_accuracy": 0.5730942487716675, + "num_tokens": 5707188428.0, + "step": 11166 + }, + { + "epoch": 3.019740400216333, + "grad_norm": 1.1871095895767212, + "learning_rate": 8.44468067568981e-06, + "loss": 1.7742, + "mean_token_accuracy": 0.5740619897842407, + "num_tokens": 5707712679.0, + "step": 11167 + }, + { + "epoch": 3.0200108166576527, + "grad_norm": 1.1325650215148926, + "learning_rate": 8.443169103462814e-06, + "loss": 1.9135, + "mean_token_accuracy": 0.5738587379455566, + "num_tokens": 5708236760.0, + "step": 11168 + }, + { + "epoch": 3.0202812330989723, + "grad_norm": 1.0505397319793701, + "learning_rate": 8.441657609686904e-06, + "loss": 1.7742, + "mean_token_accuracy": 0.567035436630249, + "num_tokens": 5708761043.0, + "step": 11169 + }, + { + "epoch": 3.020551649540292, + "grad_norm": 1.1534903049468994, + "learning_rate": 8.44014619440847e-06, + "loss": 1.925, + "mean_token_accuracy": 0.5577961802482605, + "num_tokens": 5709285263.0, + "step": 11170 + }, + { + "epoch": 3.0208220659816116, + "grad_norm": 1.3305913209915161, + "learning_rate": 8.438634857673875e-06, + "loss": 1.8533, + "mean_token_accuracy": 0.5881713032722473, + "num_tokens": 5709809401.0, + "step": 11171 + }, + { + "epoch": 3.0210924824229313, + "grad_norm": 0.9751655459403992, + "learning_rate": 8.437123599529494e-06, + "loss": 1.831, + "mean_token_accuracy": 0.5655550956726074, + "num_tokens": 5710333616.0, + "step": 11172 + }, + { + "epoch": 3.021362898864251, + "grad_norm": 1.1165612936019897, + "learning_rate": 8.4356124200217e-06, + "loss": 2.0505, + "mean_token_accuracy": 0.5450482964515686, + "num_tokens": 5710857797.0, + "step": 11173 + }, + { + "epoch": 3.0216333153055706, + "grad_norm": 0.992114245891571, + "learning_rate": 8.434101319196857e-06, + "loss": 1.89, + "mean_token_accuracy": 0.5710189342498779, + "num_tokens": 5711382075.0, + "step": 11174 + }, + { + "epoch": 3.02190373174689, + "grad_norm": 1.1017550230026245, + "learning_rate": 8.432590297101328e-06, + "loss": 1.8573, + "mean_token_accuracy": 0.5676367282867432, + "num_tokens": 5711906143.0, + "step": 11175 + }, + { + "epoch": 3.02217414818821, + "grad_norm": 1.1350346803665161, + "learning_rate": 8.431079353781487e-06, + "loss": 1.6962, + "mean_token_accuracy": 0.609286367893219, + "num_tokens": 5712368983.0, + "step": 11176 + }, + { + "epoch": 3.0224445646295295, + "grad_norm": 1.3046156167984009, + "learning_rate": 8.429568489283677e-06, + "loss": 2.0122, + "mean_token_accuracy": 0.5205650329589844, + "num_tokens": 5712893168.0, + "step": 11177 + }, + { + "epoch": 3.022714981070849, + "grad_norm": 1.0640507936477661, + "learning_rate": 8.428057703654273e-06, + "loss": 1.5755, + "mean_token_accuracy": 0.6129450798034668, + "num_tokens": 5713417432.0, + "step": 11178 + }, + { + "epoch": 3.0229853975121688, + "grad_norm": 1.144546627998352, + "learning_rate": 8.426546996939619e-06, + "loss": 1.8349, + "mean_token_accuracy": 0.5858092904090881, + "num_tokens": 5713941592.0, + "step": 11179 + }, + { + "epoch": 3.0232558139534884, + "grad_norm": 0.9846941232681274, + "learning_rate": 8.425036369186071e-06, + "loss": 1.8281, + "mean_token_accuracy": 0.5615395307540894, + "num_tokens": 5714465789.0, + "step": 11180 + }, + { + "epoch": 3.023526230394808, + "grad_norm": 0.4114014804363251, + "learning_rate": 8.42352582043998e-06, + "loss": 1.1546, + "mean_token_accuracy": 0.7006328701972961, + "num_tokens": 5714990049.0, + "step": 11181 + }, + { + "epoch": 3.0237966468361277, + "grad_norm": 1.382981538772583, + "learning_rate": 8.422015350747695e-06, + "loss": 1.8428, + "mean_token_accuracy": 0.5849952101707458, + "num_tokens": 5715514331.0, + "step": 11182 + }, + { + "epoch": 3.0240670632774473, + "grad_norm": 1.4816430807113647, + "learning_rate": 8.420504960155555e-06, + "loss": 1.8143, + "mean_token_accuracy": 0.5615421533584595, + "num_tokens": 5716038449.0, + "step": 11183 + }, + { + "epoch": 3.024337479718767, + "grad_norm": 1.117153525352478, + "learning_rate": 8.418994648709912e-06, + "loss": 1.8286, + "mean_token_accuracy": 0.6000188589096069, + "num_tokens": 5716497658.0, + "step": 11184 + }, + { + "epoch": 3.0246078961600866, + "grad_norm": 1.0794248580932617, + "learning_rate": 8.417484416457106e-06, + "loss": 1.7799, + "mean_token_accuracy": 0.5921012759208679, + "num_tokens": 5717007306.0, + "step": 11185 + }, + { + "epoch": 3.0248783126014063, + "grad_norm": 1.4115411043167114, + "learning_rate": 8.41597426344347e-06, + "loss": 1.7007, + "mean_token_accuracy": 0.6171368360519409, + "num_tokens": 5717531467.0, + "step": 11186 + }, + { + "epoch": 3.025148729042726, + "grad_norm": 1.0495357513427734, + "learning_rate": 8.414464189715345e-06, + "loss": 1.9341, + "mean_token_accuracy": 0.5447377562522888, + "num_tokens": 5718055599.0, + "step": 11187 + }, + { + "epoch": 3.0254191454840456, + "grad_norm": 1.2249231338500977, + "learning_rate": 8.41295419531906e-06, + "loss": 1.8515, + "mean_token_accuracy": 0.5837808847427368, + "num_tokens": 5718538567.0, + "step": 11188 + }, + { + "epoch": 3.025689561925365, + "grad_norm": 1.007813572883606, + "learning_rate": 8.411444280300948e-06, + "loss": 1.5761, + "mean_token_accuracy": 0.6225218772888184, + "num_tokens": 5719062623.0, + "step": 11189 + }, + { + "epoch": 3.025959978366685, + "grad_norm": 1.1176977157592773, + "learning_rate": 8.40993444470734e-06, + "loss": 1.8984, + "mean_token_accuracy": 0.5923484563827515, + "num_tokens": 5719522759.0, + "step": 11190 + }, + { + "epoch": 3.0262303948080045, + "grad_norm": 1.132866621017456, + "learning_rate": 8.40842468858456e-06, + "loss": 1.7699, + "mean_token_accuracy": 0.5882158279418945, + "num_tokens": 5720046988.0, + "step": 11191 + }, + { + "epoch": 3.026500811249324, + "grad_norm": 1.0869215726852417, + "learning_rate": 8.406915011978928e-06, + "loss": 1.9949, + "mean_token_accuracy": 0.5522173643112183, + "num_tokens": 5720571231.0, + "step": 11192 + }, + { + "epoch": 3.0267712276906438, + "grad_norm": 1.0320916175842285, + "learning_rate": 8.405405414936773e-06, + "loss": 1.9905, + "mean_token_accuracy": 0.55995774269104, + "num_tokens": 5721095319.0, + "step": 11193 + }, + { + "epoch": 3.0270416441319634, + "grad_norm": 1.0876141786575317, + "learning_rate": 8.40389589750441e-06, + "loss": 1.8329, + "mean_token_accuracy": 0.5745683908462524, + "num_tokens": 5721601319.0, + "step": 11194 + }, + { + "epoch": 3.027312060573283, + "grad_norm": 1.0432860851287842, + "learning_rate": 8.402386459728153e-06, + "loss": 1.8456, + "mean_token_accuracy": 0.5714319944381714, + "num_tokens": 5722125551.0, + "step": 11195 + }, + { + "epoch": 3.0275824770146027, + "grad_norm": 0.9882169365882874, + "learning_rate": 8.400877101654321e-06, + "loss": 1.8255, + "mean_token_accuracy": 0.5660352110862732, + "num_tokens": 5722649808.0, + "step": 11196 + }, + { + "epoch": 3.027852893455922, + "grad_norm": 1.0481833219528198, + "learning_rate": 8.399367823329226e-06, + "loss": 1.8226, + "mean_token_accuracy": 0.5804144740104675, + "num_tokens": 5723173951.0, + "step": 11197 + }, + { + "epoch": 3.0281233098972415, + "grad_norm": 1.0989387035369873, + "learning_rate": 8.397858624799167e-06, + "loss": 1.847, + "mean_token_accuracy": 0.5809139609336853, + "num_tokens": 5723698172.0, + "step": 11198 + }, + { + "epoch": 3.028393726338561, + "grad_norm": 1.0373172760009766, + "learning_rate": 8.396349506110462e-06, + "loss": 1.779, + "mean_token_accuracy": 0.5909607410430908, + "num_tokens": 5724222361.0, + "step": 11199 + }, + { + "epoch": 3.028664142779881, + "grad_norm": 1.2121822834014893, + "learning_rate": 8.39484046730941e-06, + "loss": 1.9275, + "mean_token_accuracy": 0.5506700277328491, + "num_tokens": 5724707387.0, + "step": 11200 + }, + { + "epoch": 3.0289345592212005, + "grad_norm": 0.437416672706604, + "learning_rate": 8.393331508442312e-06, + "loss": 1.112, + "mean_token_accuracy": 0.7082074880599976, + "num_tokens": 5725140365.0, + "step": 11201 + }, + { + "epoch": 3.02920497566252, + "grad_norm": 1.306600570678711, + "learning_rate": 8.391822629555475e-06, + "loss": 1.7349, + "mean_token_accuracy": 0.5815302729606628, + "num_tokens": 5725664460.0, + "step": 11202 + }, + { + "epoch": 3.0294753921038398, + "grad_norm": 1.3550162315368652, + "learning_rate": 8.390313830695183e-06, + "loss": 1.7581, + "mean_token_accuracy": 0.5817265510559082, + "num_tokens": 5726188705.0, + "step": 11203 + }, + { + "epoch": 3.0297458085451594, + "grad_norm": 1.1673551797866821, + "learning_rate": 8.388805111907739e-06, + "loss": 1.7527, + "mean_token_accuracy": 0.5914759635925293, + "num_tokens": 5726712836.0, + "step": 11204 + }, + { + "epoch": 3.030016224986479, + "grad_norm": 0.9988052248954773, + "learning_rate": 8.387296473239435e-06, + "loss": 1.864, + "mean_token_accuracy": 0.5631387233734131, + "num_tokens": 5727237059.0, + "step": 11205 + }, + { + "epoch": 3.0302866414277987, + "grad_norm": 1.2959800958633423, + "learning_rate": 8.385787914736556e-06, + "loss": 1.8074, + "mean_token_accuracy": 0.5730704665184021, + "num_tokens": 5727761283.0, + "step": 11206 + }, + { + "epoch": 3.0305570578691183, + "grad_norm": 1.154317021369934, + "learning_rate": 8.384279436445394e-06, + "loss": 1.9571, + "mean_token_accuracy": 0.5562376976013184, + "num_tokens": 5728285566.0, + "step": 11207 + }, + { + "epoch": 3.030827474310438, + "grad_norm": 1.07622492313385, + "learning_rate": 8.382771038412234e-06, + "loss": 1.7954, + "mean_token_accuracy": 0.573714017868042, + "num_tokens": 5728809752.0, + "step": 11208 + }, + { + "epoch": 3.0310978907517576, + "grad_norm": 1.0255504846572876, + "learning_rate": 8.38126272068335e-06, + "loss": 1.7142, + "mean_token_accuracy": 0.6128509044647217, + "num_tokens": 5729333860.0, + "step": 11209 + }, + { + "epoch": 3.0313683071930773, + "grad_norm": 1.0745912790298462, + "learning_rate": 8.37975448330503e-06, + "loss": 1.9415, + "mean_token_accuracy": 0.5608307719230652, + "num_tokens": 5729858069.0, + "step": 11210 + }, + { + "epoch": 3.031638723634397, + "grad_norm": 0.9077897071838379, + "learning_rate": 8.378246326323547e-06, + "loss": 1.8121, + "mean_token_accuracy": 0.5875601768493652, + "num_tokens": 5730382289.0, + "step": 11211 + }, + { + "epoch": 3.0319091400757165, + "grad_norm": 1.2076958417892456, + "learning_rate": 8.376738249785174e-06, + "loss": 1.8481, + "mean_token_accuracy": 0.5907073020935059, + "num_tokens": 5730906452.0, + "step": 11212 + }, + { + "epoch": 3.032179556517036, + "grad_norm": 0.9436899423599243, + "learning_rate": 8.375230253736191e-06, + "loss": 1.7802, + "mean_token_accuracy": 0.5946053266525269, + "num_tokens": 5731430678.0, + "step": 11213 + }, + { + "epoch": 3.032449972958356, + "grad_norm": 0.8966681361198425, + "learning_rate": 8.373722338222862e-06, + "loss": 1.9114, + "mean_token_accuracy": 0.5695003271102905, + "num_tokens": 5731954857.0, + "step": 11214 + }, + { + "epoch": 3.0327203893996755, + "grad_norm": 1.0381956100463867, + "learning_rate": 8.372214503291452e-06, + "loss": 1.9755, + "mean_token_accuracy": 0.5576948523521423, + "num_tokens": 5732478987.0, + "step": 11215 + }, + { + "epoch": 3.032990805840995, + "grad_norm": 1.0362465381622314, + "learning_rate": 8.370706748988234e-06, + "loss": 1.8227, + "mean_token_accuracy": 0.5759445428848267, + "num_tokens": 5733003202.0, + "step": 11216 + }, + { + "epoch": 3.0332612222823148, + "grad_norm": 1.2771854400634766, + "learning_rate": 8.36919907535946e-06, + "loss": 1.9173, + "mean_token_accuracy": 0.542451024055481, + "num_tokens": 5733521616.0, + "step": 11217 + }, + { + "epoch": 3.0335316387236344, + "grad_norm": 1.128544807434082, + "learning_rate": 8.367691482451395e-06, + "loss": 1.8921, + "mean_token_accuracy": 0.5635757446289062, + "num_tokens": 5734045762.0, + "step": 11218 + }, + { + "epoch": 3.033802055164954, + "grad_norm": 1.2106138467788696, + "learning_rate": 8.3661839703103e-06, + "loss": 1.8829, + "mean_token_accuracy": 0.5901858806610107, + "num_tokens": 5734550813.0, + "step": 11219 + }, + { + "epoch": 3.0340724716062737, + "grad_norm": 1.1859627962112427, + "learning_rate": 8.364676538982423e-06, + "loss": 1.896, + "mean_token_accuracy": 0.5712234973907471, + "num_tokens": 5735075031.0, + "step": 11220 + }, + { + "epoch": 3.0343428880475933, + "grad_norm": 0.400556743144989, + "learning_rate": 8.36316918851402e-06, + "loss": 1.0615, + "mean_token_accuracy": 0.7174285054206848, + "num_tokens": 5735599295.0, + "step": 11221 + }, + { + "epoch": 3.034613304488913, + "grad_norm": 1.0635323524475098, + "learning_rate": 8.361661918951345e-06, + "loss": 1.9326, + "mean_token_accuracy": 0.5613890886306763, + "num_tokens": 5736123467.0, + "step": 11222 + }, + { + "epoch": 3.0348837209302326, + "grad_norm": 1.0899332761764526, + "learning_rate": 8.360154730340634e-06, + "loss": 1.8262, + "mean_token_accuracy": 0.5891648530960083, + "num_tokens": 5736647082.0, + "step": 11223 + }, + { + "epoch": 3.0351541373715523, + "grad_norm": 0.9974488615989685, + "learning_rate": 8.358647622728147e-06, + "loss": 1.9132, + "mean_token_accuracy": 0.5650202035903931, + "num_tokens": 5737158577.0, + "step": 11224 + }, + { + "epoch": 3.035424553812872, + "grad_norm": 1.0084407329559326, + "learning_rate": 8.357140596160114e-06, + "loss": 1.8023, + "mean_token_accuracy": 0.602721095085144, + "num_tokens": 5737677438.0, + "step": 11225 + }, + { + "epoch": 3.0356949702541915, + "grad_norm": 0.9604286551475525, + "learning_rate": 8.355633650682777e-06, + "loss": 1.7307, + "mean_token_accuracy": 0.601824164390564, + "num_tokens": 5738201654.0, + "step": 11226 + }, + { + "epoch": 3.035965386695511, + "grad_norm": 1.2920200824737549, + "learning_rate": 8.354126786342382e-06, + "loss": 1.916, + "mean_token_accuracy": 0.5627239942550659, + "num_tokens": 5738725863.0, + "step": 11227 + }, + { + "epoch": 3.036235803136831, + "grad_norm": 0.8735387921333313, + "learning_rate": 8.352620003185157e-06, + "loss": 1.7756, + "mean_token_accuracy": 0.5672568082809448, + "num_tokens": 5739250059.0, + "step": 11228 + }, + { + "epoch": 3.0365062195781505, + "grad_norm": 1.3135857582092285, + "learning_rate": 8.351113301257335e-06, + "loss": 1.9054, + "mean_token_accuracy": 0.5808316469192505, + "num_tokens": 5739774222.0, + "step": 11229 + }, + { + "epoch": 3.03677663601947, + "grad_norm": 1.014355182647705, + "learning_rate": 8.349606680605148e-06, + "loss": 1.7705, + "mean_token_accuracy": 0.5897456407546997, + "num_tokens": 5740276174.0, + "step": 11230 + }, + { + "epoch": 3.0370470524607898, + "grad_norm": 0.9400622844696045, + "learning_rate": 8.34810014127482e-06, + "loss": 1.8708, + "mean_token_accuracy": 0.5775877833366394, + "num_tokens": 5740800348.0, + "step": 11231 + }, + { + "epoch": 3.0373174689021094, + "grad_norm": 1.1168988943099976, + "learning_rate": 8.346593683312578e-06, + "loss": 1.8403, + "mean_token_accuracy": 0.5749785900115967, + "num_tokens": 5741324505.0, + "step": 11232 + }, + { + "epoch": 3.037587885343429, + "grad_norm": 1.0132455825805664, + "learning_rate": 8.345087306764646e-06, + "loss": 1.7772, + "mean_token_accuracy": 0.5799569487571716, + "num_tokens": 5741848750.0, + "step": 11233 + }, + { + "epoch": 3.0378583017847487, + "grad_norm": 0.824503481388092, + "learning_rate": 8.343581011677244e-06, + "loss": 1.8089, + "mean_token_accuracy": 0.5811797380447388, + "num_tokens": 5742328697.0, + "step": 11234 + }, + { + "epoch": 3.0381287182260683, + "grad_norm": 1.1237831115722656, + "learning_rate": 8.342074798096586e-06, + "loss": 1.8071, + "mean_token_accuracy": 0.5778307914733887, + "num_tokens": 5742816392.0, + "step": 11235 + }, + { + "epoch": 3.038399134667388, + "grad_norm": 1.165349006652832, + "learning_rate": 8.340568666068893e-06, + "loss": 1.9197, + "mean_token_accuracy": 0.5456728935241699, + "num_tokens": 5743340519.0, + "step": 11236 + }, + { + "epoch": 3.0386695511087076, + "grad_norm": 1.0874865055084229, + "learning_rate": 8.339062615640374e-06, + "loss": 1.8432, + "mean_token_accuracy": 0.562390923500061, + "num_tokens": 5743864654.0, + "step": 11237 + }, + { + "epoch": 3.038939967550027, + "grad_norm": 1.1025111675262451, + "learning_rate": 8.337556646857236e-06, + "loss": 1.9057, + "mean_token_accuracy": 0.5679992437362671, + "num_tokens": 5744388834.0, + "step": 11238 + }, + { + "epoch": 3.0392103839913465, + "grad_norm": 1.0916974544525146, + "learning_rate": 8.336050759765692e-06, + "loss": 1.833, + "mean_token_accuracy": 0.5863778591156006, + "num_tokens": 5744912968.0, + "step": 11239 + }, + { + "epoch": 3.039480800432666, + "grad_norm": 1.0520362854003906, + "learning_rate": 8.334544954411947e-06, + "loss": 1.8549, + "mean_token_accuracy": 0.5575248599052429, + "num_tokens": 5745437003.0, + "step": 11240 + }, + { + "epoch": 3.0397512168739858, + "grad_norm": 0.5527485013008118, + "learning_rate": 8.333039230842197e-06, + "loss": 1.1272, + "mean_token_accuracy": 0.705653727054596, + "num_tokens": 5745921372.0, + "step": 11241 + }, + { + "epoch": 3.0400216333153054, + "grad_norm": 1.4836920499801636, + "learning_rate": 8.33153358910265e-06, + "loss": 1.8851, + "mean_token_accuracy": 0.5858058333396912, + "num_tokens": 5746432362.0, + "step": 11242 + }, + { + "epoch": 3.040292049756625, + "grad_norm": 1.058254361152649, + "learning_rate": 8.3300280292395e-06, + "loss": 1.7542, + "mean_token_accuracy": 0.5781537294387817, + "num_tokens": 5746956531.0, + "step": 11243 + }, + { + "epoch": 3.0405624661979447, + "grad_norm": 1.008428931236267, + "learning_rate": 8.32852255129894e-06, + "loss": 1.86, + "mean_token_accuracy": 0.5727019906044006, + "num_tokens": 5747480774.0, + "step": 11244 + }, + { + "epoch": 3.0408328826392643, + "grad_norm": 0.9748631119728088, + "learning_rate": 8.327017155327162e-06, + "loss": 1.8521, + "mean_token_accuracy": 0.5784336924552917, + "num_tokens": 5748004988.0, + "step": 11245 + }, + { + "epoch": 3.041103299080584, + "grad_norm": 1.2176278829574585, + "learning_rate": 8.32551184137036e-06, + "loss": 1.859, + "mean_token_accuracy": 0.5733698606491089, + "num_tokens": 5748525425.0, + "step": 11246 + }, + { + "epoch": 3.0413737155219036, + "grad_norm": 1.2221983671188354, + "learning_rate": 8.324006609474722e-06, + "loss": 1.9241, + "mean_token_accuracy": 0.5621515512466431, + "num_tokens": 5749049651.0, + "step": 11247 + }, + { + "epoch": 3.0416441319632233, + "grad_norm": 1.2713414430618286, + "learning_rate": 8.322501459686432e-06, + "loss": 1.9628, + "mean_token_accuracy": 0.5650014877319336, + "num_tokens": 5749532994.0, + "step": 11248 + }, + { + "epoch": 3.041914548404543, + "grad_norm": 1.3685299158096313, + "learning_rate": 8.320996392051666e-06, + "loss": 1.9154, + "mean_token_accuracy": 0.5614193081855774, + "num_tokens": 5750012248.0, + "step": 11249 + }, + { + "epoch": 3.0421849648458625, + "grad_norm": 1.1109236478805542, + "learning_rate": 8.319491406616614e-06, + "loss": 1.9181, + "mean_token_accuracy": 0.5619877576828003, + "num_tokens": 5750536488.0, + "step": 11250 + }, + { + "epoch": 3.042455381287182, + "grad_norm": 1.0386511087417603, + "learning_rate": 8.317986503427446e-06, + "loss": 1.8171, + "mean_token_accuracy": 0.576061487197876, + "num_tokens": 5751060752.0, + "step": 11251 + }, + { + "epoch": 3.042725797728502, + "grad_norm": 1.0811893939971924, + "learning_rate": 8.31648168253034e-06, + "loss": 1.8973, + "mean_token_accuracy": 0.5700158476829529, + "num_tokens": 5751585010.0, + "step": 11252 + }, + { + "epoch": 3.0429962141698215, + "grad_norm": 1.0534108877182007, + "learning_rate": 8.314976943971467e-06, + "loss": 1.8333, + "mean_token_accuracy": 0.5482238531112671, + "num_tokens": 5752109035.0, + "step": 11253 + }, + { + "epoch": 3.043266630611141, + "grad_norm": 1.0743234157562256, + "learning_rate": 8.313472287797e-06, + "loss": 1.8901, + "mean_token_accuracy": 0.5662446022033691, + "num_tokens": 5752633280.0, + "step": 11254 + }, + { + "epoch": 3.0435370470524608, + "grad_norm": 1.0746228694915771, + "learning_rate": 8.3119677140531e-06, + "loss": 1.7665, + "mean_token_accuracy": 0.6062965989112854, + "num_tokens": 5753157397.0, + "step": 11255 + }, + { + "epoch": 3.0438074634937804, + "grad_norm": 1.079628348350525, + "learning_rate": 8.310463222785938e-06, + "loss": 1.9115, + "mean_token_accuracy": 0.5724807977676392, + "num_tokens": 5753646593.0, + "step": 11256 + }, + { + "epoch": 3.0440778799351, + "grad_norm": 1.1967482566833496, + "learning_rate": 8.308958814041676e-06, + "loss": 1.8978, + "mean_token_accuracy": 0.5683592557907104, + "num_tokens": 5754138175.0, + "step": 11257 + }, + { + "epoch": 3.0443482963764197, + "grad_norm": 1.1884583234786987, + "learning_rate": 8.307454487866467e-06, + "loss": 1.8799, + "mean_token_accuracy": 0.5672870874404907, + "num_tokens": 5754625742.0, + "step": 11258 + }, + { + "epoch": 3.0446187128177393, + "grad_norm": 1.151911735534668, + "learning_rate": 8.305950244306475e-06, + "loss": 1.9205, + "mean_token_accuracy": 0.566132664680481, + "num_tokens": 5755149942.0, + "step": 11259 + }, + { + "epoch": 3.044889129259059, + "grad_norm": 1.0476996898651123, + "learning_rate": 8.304446083407851e-06, + "loss": 1.8407, + "mean_token_accuracy": 0.5836880207061768, + "num_tokens": 5755617560.0, + "step": 11260 + }, + { + "epoch": 3.0451595457003786, + "grad_norm": 0.4292455315589905, + "learning_rate": 8.302942005216747e-06, + "loss": 1.0661, + "mean_token_accuracy": 0.7039669752120972, + "num_tokens": 5756109413.0, + "step": 11261 + }, + { + "epoch": 3.0454299621416983, + "grad_norm": 1.5788131952285767, + "learning_rate": 8.301438009779316e-06, + "loss": 1.8447, + "mean_token_accuracy": 0.5666971206665039, + "num_tokens": 5756633574.0, + "step": 11262 + }, + { + "epoch": 3.045700378583018, + "grad_norm": 1.4874008893966675, + "learning_rate": 8.299934097141704e-06, + "loss": 1.8374, + "mean_token_accuracy": 0.5846387147903442, + "num_tokens": 5757157797.0, + "step": 11263 + }, + { + "epoch": 3.0459707950243375, + "grad_norm": 1.2000781297683716, + "learning_rate": 8.29843026735005e-06, + "loss": 1.8604, + "mean_token_accuracy": 0.5726957321166992, + "num_tokens": 5757673339.0, + "step": 11264 + }, + { + "epoch": 3.046241211465657, + "grad_norm": 1.5628721714019775, + "learning_rate": 8.296926520450505e-06, + "loss": 1.8765, + "mean_token_accuracy": 0.5833927392959595, + "num_tokens": 5758125105.0, + "step": 11265 + }, + { + "epoch": 3.046511627906977, + "grad_norm": 1.4797673225402832, + "learning_rate": 8.295422856489201e-06, + "loss": 1.7403, + "mean_token_accuracy": 0.5771458148956299, + "num_tokens": 5758649284.0, + "step": 11266 + }, + { + "epoch": 3.0467820443482965, + "grad_norm": 1.416871428489685, + "learning_rate": 8.293919275512276e-06, + "loss": 1.8624, + "mean_token_accuracy": 0.5771614909172058, + "num_tokens": 5759173481.0, + "step": 11267 + }, + { + "epoch": 3.047052460789616, + "grad_norm": 1.0355783700942993, + "learning_rate": 8.292415777565868e-06, + "loss": 1.9971, + "mean_token_accuracy": 0.5612765550613403, + "num_tokens": 5759697743.0, + "step": 11268 + }, + { + "epoch": 3.0473228772309358, + "grad_norm": 1.4454352855682373, + "learning_rate": 8.290912362696105e-06, + "loss": 1.8477, + "mean_token_accuracy": 0.6036850810050964, + "num_tokens": 5760157177.0, + "step": 11269 + }, + { + "epoch": 3.0475932936722554, + "grad_norm": 1.4470083713531494, + "learning_rate": 8.28940903094912e-06, + "loss": 1.8804, + "mean_token_accuracy": 0.5585268139839172, + "num_tokens": 5760681404.0, + "step": 11270 + }, + { + "epoch": 3.047863710113575, + "grad_norm": 1.2100486755371094, + "learning_rate": 8.287905782371038e-06, + "loss": 1.9351, + "mean_token_accuracy": 0.5712714791297913, + "num_tokens": 5761205576.0, + "step": 11271 + }, + { + "epoch": 3.0481341265548947, + "grad_norm": 1.0460323095321655, + "learning_rate": 8.286402617007975e-06, + "loss": 1.8495, + "mean_token_accuracy": 0.5613617897033691, + "num_tokens": 5761729641.0, + "step": 11272 + }, + { + "epoch": 3.0484045429962143, + "grad_norm": 1.152869462966919, + "learning_rate": 8.284899534906068e-06, + "loss": 1.8294, + "mean_token_accuracy": 0.5640854835510254, + "num_tokens": 5762253916.0, + "step": 11273 + }, + { + "epoch": 3.048674959437534, + "grad_norm": 1.0547746419906616, + "learning_rate": 8.283396536111425e-06, + "loss": 1.8626, + "mean_token_accuracy": 0.592180609703064, + "num_tokens": 5762724310.0, + "step": 11274 + }, + { + "epoch": 3.0489453758788536, + "grad_norm": 0.9188313484191895, + "learning_rate": 8.281893620670164e-06, + "loss": 1.7888, + "mean_token_accuracy": 0.5781598091125488, + "num_tokens": 5763190621.0, + "step": 11275 + }, + { + "epoch": 3.0492157923201733, + "grad_norm": 0.9377020597457886, + "learning_rate": 8.280390788628401e-06, + "loss": 1.9157, + "mean_token_accuracy": 0.5575153231620789, + "num_tokens": 5763714851.0, + "step": 11276 + }, + { + "epoch": 3.049486208761493, + "grad_norm": 1.158215880393982, + "learning_rate": 8.27888804003225e-06, + "loss": 1.864, + "mean_token_accuracy": 0.5560845136642456, + "num_tokens": 5764239024.0, + "step": 11277 + }, + { + "epoch": 3.0497566252028125, + "grad_norm": 1.7835901975631714, + "learning_rate": 8.27738537492781e-06, + "loss": 1.7285, + "mean_token_accuracy": 0.5931023955345154, + "num_tokens": 5764707713.0, + "step": 11278 + }, + { + "epoch": 3.0500270416441317, + "grad_norm": 1.1244444847106934, + "learning_rate": 8.275882793361198e-06, + "loss": 2.0534, + "mean_token_accuracy": 0.5109311938285828, + "num_tokens": 5765231928.0, + "step": 11279 + }, + { + "epoch": 3.0502974580854514, + "grad_norm": 1.0314346551895142, + "learning_rate": 8.27438029537851e-06, + "loss": 1.896, + "mean_token_accuracy": 0.5625737905502319, + "num_tokens": 5765756141.0, + "step": 11280 + }, + { + "epoch": 3.050567874526771, + "grad_norm": 0.5905487537384033, + "learning_rate": 8.272877881025852e-06, + "loss": 0.9658, + "mean_token_accuracy": 0.7474863529205322, + "num_tokens": 5766280390.0, + "step": 11281 + }, + { + "epoch": 3.0508382909680907, + "grad_norm": 1.0861661434173584, + "learning_rate": 8.271375550349319e-06, + "loss": 1.8176, + "mean_token_accuracy": 0.5894464254379272, + "num_tokens": 5766768963.0, + "step": 11282 + }, + { + "epoch": 3.0511087074094103, + "grad_norm": 1.1042588949203491, + "learning_rate": 8.269873303395011e-06, + "loss": 1.8396, + "mean_token_accuracy": 0.5838617086410522, + "num_tokens": 5767227738.0, + "step": 11283 + }, + { + "epoch": 3.05137912385073, + "grad_norm": 1.0938148498535156, + "learning_rate": 8.268371140209016e-06, + "loss": 1.8427, + "mean_token_accuracy": 0.5652848482131958, + "num_tokens": 5767751924.0, + "step": 11284 + }, + { + "epoch": 3.0516495402920496, + "grad_norm": 1.035548448562622, + "learning_rate": 8.26686906083743e-06, + "loss": 1.7107, + "mean_token_accuracy": 0.5725300908088684, + "num_tokens": 5768276094.0, + "step": 11285 + }, + { + "epoch": 3.0519199567333692, + "grad_norm": 0.9234438538551331, + "learning_rate": 8.265367065326339e-06, + "loss": 1.8485, + "mean_token_accuracy": 0.5677754878997803, + "num_tokens": 5768800253.0, + "step": 11286 + }, + { + "epoch": 3.052190373174689, + "grad_norm": 0.9970924854278564, + "learning_rate": 8.263865153721824e-06, + "loss": 1.4832, + "mean_token_accuracy": 0.655657947063446, + "num_tokens": 5769270394.0, + "step": 11287 + }, + { + "epoch": 3.0524607896160085, + "grad_norm": 1.1654844284057617, + "learning_rate": 8.262363326069976e-06, + "loss": 1.8836, + "mean_token_accuracy": 0.5666333436965942, + "num_tokens": 5769794662.0, + "step": 11288 + }, + { + "epoch": 3.052731206057328, + "grad_norm": 1.0934596061706543, + "learning_rate": 8.260861582416872e-06, + "loss": 1.8748, + "mean_token_accuracy": 0.584060549736023, + "num_tokens": 5770229739.0, + "step": 11289 + }, + { + "epoch": 3.053001622498648, + "grad_norm": 1.085802435874939, + "learning_rate": 8.25935992280859e-06, + "loss": 1.8502, + "mean_token_accuracy": 0.5661237239837646, + "num_tokens": 5770706487.0, + "step": 11290 + }, + { + "epoch": 3.0532720389399675, + "grad_norm": 0.9563043713569641, + "learning_rate": 8.257858347291206e-06, + "loss": 1.9151, + "mean_token_accuracy": 0.5734131336212158, + "num_tokens": 5771230668.0, + "step": 11291 + }, + { + "epoch": 3.053542455381287, + "grad_norm": 0.9439797401428223, + "learning_rate": 8.256356855910788e-06, + "loss": 1.8242, + "mean_token_accuracy": 0.5806704163551331, + "num_tokens": 5771754887.0, + "step": 11292 + }, + { + "epoch": 3.0538128718226067, + "grad_norm": 0.90803062915802, + "learning_rate": 8.254855448713415e-06, + "loss": 1.6498, + "mean_token_accuracy": 0.6351074576377869, + "num_tokens": 5772181760.0, + "step": 11293 + }, + { + "epoch": 3.0540832882639264, + "grad_norm": 0.9347644448280334, + "learning_rate": 8.25335412574515e-06, + "loss": 1.866, + "mean_token_accuracy": 0.5757811069488525, + "num_tokens": 5772663997.0, + "step": 11294 + }, + { + "epoch": 3.054353704705246, + "grad_norm": 1.097786784172058, + "learning_rate": 8.251852887052054e-06, + "loss": 1.7758, + "mean_token_accuracy": 0.5906780958175659, + "num_tokens": 5773188200.0, + "step": 11295 + }, + { + "epoch": 3.0546241211465657, + "grad_norm": 0.9974490404129028, + "learning_rate": 8.250351732680195e-06, + "loss": 1.8966, + "mean_token_accuracy": 0.5541962385177612, + "num_tokens": 5773712469.0, + "step": 11296 + }, + { + "epoch": 3.0548945375878853, + "grad_norm": 0.96115642786026, + "learning_rate": 8.248850662675631e-06, + "loss": 1.9056, + "mean_token_accuracy": 0.5586232542991638, + "num_tokens": 5774236641.0, + "step": 11297 + }, + { + "epoch": 3.055164954029205, + "grad_norm": 1.1924347877502441, + "learning_rate": 8.247349677084418e-06, + "loss": 1.9736, + "mean_token_accuracy": 0.5693490505218506, + "num_tokens": 5774760913.0, + "step": 11298 + }, + { + "epoch": 3.0554353704705246, + "grad_norm": 1.2514375448226929, + "learning_rate": 8.245848775952613e-06, + "loss": 1.8598, + "mean_token_accuracy": 0.5630198121070862, + "num_tokens": 5775228722.0, + "step": 11299 + }, + { + "epoch": 3.0557057869118442, + "grad_norm": 0.923317551612854, + "learning_rate": 8.244347959326269e-06, + "loss": 1.7551, + "mean_token_accuracy": 0.5798388719558716, + "num_tokens": 5775752938.0, + "step": 11300 + }, + { + "epoch": 3.055976203353164, + "grad_norm": 0.38901421427726746, + "learning_rate": 8.242847227251429e-06, + "loss": 1.1562, + "mean_token_accuracy": 0.6874523758888245, + "num_tokens": 5776277206.0, + "step": 11301 + }, + { + "epoch": 3.0562466197944835, + "grad_norm": 1.4781378507614136, + "learning_rate": 8.24134657977415e-06, + "loss": 1.9066, + "mean_token_accuracy": 0.5693790316581726, + "num_tokens": 5776801335.0, + "step": 11302 + }, + { + "epoch": 3.056517036235803, + "grad_norm": 1.4560216665267944, + "learning_rate": 8.239846016940467e-06, + "loss": 1.7917, + "mean_token_accuracy": 0.5842217803001404, + "num_tokens": 5777325514.0, + "step": 11303 + }, + { + "epoch": 3.056787452677123, + "grad_norm": 1.0103394985198975, + "learning_rate": 8.238345538796425e-06, + "loss": 1.8733, + "mean_token_accuracy": 0.5720769166946411, + "num_tokens": 5777812831.0, + "step": 11304 + }, + { + "epoch": 3.0570578691184425, + "grad_norm": 1.1181762218475342, + "learning_rate": 8.236845145388064e-06, + "loss": 1.8845, + "mean_token_accuracy": 0.540382444858551, + "num_tokens": 5778336989.0, + "step": 11305 + }, + { + "epoch": 3.057328285559762, + "grad_norm": 1.33780837059021, + "learning_rate": 8.235344836761424e-06, + "loss": 1.8157, + "mean_token_accuracy": 0.579825222492218, + "num_tokens": 5778861266.0, + "step": 11306 + }, + { + "epoch": 3.0575987020010817, + "grad_norm": 1.4722816944122314, + "learning_rate": 8.233844612962525e-06, + "loss": 2.0251, + "mean_token_accuracy": 0.5492760539054871, + "num_tokens": 5779385444.0, + "step": 11307 + }, + { + "epoch": 3.0578691184424014, + "grad_norm": 1.1099220514297485, + "learning_rate": 8.232344474037414e-06, + "loss": 1.8038, + "mean_token_accuracy": 0.5778427124023438, + "num_tokens": 5779909722.0, + "step": 11308 + }, + { + "epoch": 3.058139534883721, + "grad_norm": 1.1423557996749878, + "learning_rate": 8.230844420032115e-06, + "loss": 1.6353, + "mean_token_accuracy": 0.5869925618171692, + "num_tokens": 5780433845.0, + "step": 11309 + }, + { + "epoch": 3.0584099513250407, + "grad_norm": 1.1902587413787842, + "learning_rate": 8.229344450992648e-06, + "loss": 1.8209, + "mean_token_accuracy": 0.590532660484314, + "num_tokens": 5780893297.0, + "step": 11310 + }, + { + "epoch": 3.0586803677663603, + "grad_norm": 1.1126322746276855, + "learning_rate": 8.227844566965049e-06, + "loss": 1.8266, + "mean_token_accuracy": 0.5794076323509216, + "num_tokens": 5781399046.0, + "step": 11311 + }, + { + "epoch": 3.05895078420768, + "grad_norm": 0.9873098134994507, + "learning_rate": 8.226344767995327e-06, + "loss": 1.8252, + "mean_token_accuracy": 0.5705928802490234, + "num_tokens": 5781923169.0, + "step": 11312 + }, + { + "epoch": 3.0592212006489996, + "grad_norm": 1.1871185302734375, + "learning_rate": 8.2248450541295e-06, + "loss": 1.812, + "mean_token_accuracy": 0.5854871273040771, + "num_tokens": 5782447452.0, + "step": 11313 + }, + { + "epoch": 3.0594916170903192, + "grad_norm": 0.9903535842895508, + "learning_rate": 8.223345425413593e-06, + "loss": 1.9184, + "mean_token_accuracy": 0.5527260303497314, + "num_tokens": 5782971631.0, + "step": 11314 + }, + { + "epoch": 3.059762033531639, + "grad_norm": 1.0692940950393677, + "learning_rate": 8.221845881893611e-06, + "loss": 1.8452, + "mean_token_accuracy": 0.5698422789573669, + "num_tokens": 5783472421.0, + "step": 11315 + }, + { + "epoch": 3.0600324499729585, + "grad_norm": 0.9511592984199524, + "learning_rate": 8.22034642361557e-06, + "loss": 1.7581, + "mean_token_accuracy": 0.5968737602233887, + "num_tokens": 5783996648.0, + "step": 11316 + }, + { + "epoch": 3.060302866414278, + "grad_norm": 1.0768290758132935, + "learning_rate": 8.218847050625476e-06, + "loss": 1.8671, + "mean_token_accuracy": 0.5582761764526367, + "num_tokens": 5784520917.0, + "step": 11317 + }, + { + "epoch": 3.060573282855598, + "grad_norm": 0.9829150438308716, + "learning_rate": 8.217347762969327e-06, + "loss": 1.8296, + "mean_token_accuracy": 0.5701241493225098, + "num_tokens": 5785045088.0, + "step": 11318 + }, + { + "epoch": 3.0608436992969175, + "grad_norm": 1.0855650901794434, + "learning_rate": 8.215848560693137e-06, + "loss": 1.8773, + "mean_token_accuracy": 0.5764856338500977, + "num_tokens": 5785569254.0, + "step": 11319 + }, + { + "epoch": 3.0611141157382367, + "grad_norm": 1.1121573448181152, + "learning_rate": 8.214349443842904e-06, + "loss": 1.916, + "mean_token_accuracy": 0.5737616419792175, + "num_tokens": 5786093537.0, + "step": 11320 + }, + { + "epoch": 3.0613845321795563, + "grad_norm": 0.37552931904792786, + "learning_rate": 8.212850412464615e-06, + "loss": 1.0664, + "mean_token_accuracy": 0.7042816281318665, + "num_tokens": 5786617823.0, + "step": 11321 + }, + { + "epoch": 3.061654948620876, + "grad_norm": 1.5590171813964844, + "learning_rate": 8.211351466604279e-06, + "loss": 1.9032, + "mean_token_accuracy": 0.551476001739502, + "num_tokens": 5787142062.0, + "step": 11322 + }, + { + "epoch": 3.0619253650621956, + "grad_norm": 1.3084710836410522, + "learning_rate": 8.209852606307875e-06, + "loss": 1.7738, + "mean_token_accuracy": 0.6024280786514282, + "num_tokens": 5787666287.0, + "step": 11323 + }, + { + "epoch": 3.0621957815035152, + "grad_norm": 1.0283640623092651, + "learning_rate": 8.2083538316214e-06, + "loss": 1.8146, + "mean_token_accuracy": 0.5796807408332825, + "num_tokens": 5788190437.0, + "step": 11324 + }, + { + "epoch": 3.062466197944835, + "grad_norm": 1.0579111576080322, + "learning_rate": 8.206855142590839e-06, + "loss": 1.8914, + "mean_token_accuracy": 0.5736268162727356, + "num_tokens": 5788680351.0, + "step": 11325 + }, + { + "epoch": 3.0627366143861545, + "grad_norm": 1.3063714504241943, + "learning_rate": 8.20535653926218e-06, + "loss": 1.7149, + "mean_token_accuracy": 0.5986384153366089, + "num_tokens": 5789204480.0, + "step": 11326 + }, + { + "epoch": 3.063007030827474, + "grad_norm": 1.300528645515442, + "learning_rate": 8.203858021681395e-06, + "loss": 1.9561, + "mean_token_accuracy": 0.5618895888328552, + "num_tokens": 5789728734.0, + "step": 11327 + }, + { + "epoch": 3.063277447268794, + "grad_norm": 1.1334196329116821, + "learning_rate": 8.202359589894474e-06, + "loss": 1.852, + "mean_token_accuracy": 0.566281795501709, + "num_tokens": 5790252973.0, + "step": 11328 + }, + { + "epoch": 3.0635478637101135, + "grad_norm": 1.0407674312591553, + "learning_rate": 8.200861243947388e-06, + "loss": 1.8231, + "mean_token_accuracy": 0.5721240639686584, + "num_tokens": 5790777146.0, + "step": 11329 + }, + { + "epoch": 3.063818280151433, + "grad_norm": 1.0289534330368042, + "learning_rate": 8.199362983886103e-06, + "loss": 1.8131, + "mean_token_accuracy": 0.5747684240341187, + "num_tokens": 5791301334.0, + "step": 11330 + }, + { + "epoch": 3.0640886965927527, + "grad_norm": 1.1063191890716553, + "learning_rate": 8.197864809756604e-06, + "loss": 1.7861, + "mean_token_accuracy": 0.5864706635475159, + "num_tokens": 5791825527.0, + "step": 11331 + }, + { + "epoch": 3.0643591130340724, + "grad_norm": 1.1890314817428589, + "learning_rate": 8.196366721604854e-06, + "loss": 1.742, + "mean_token_accuracy": 0.5785408616065979, + "num_tokens": 5792349722.0, + "step": 11332 + }, + { + "epoch": 3.064629529475392, + "grad_norm": 1.2738711833953857, + "learning_rate": 8.194868719476813e-06, + "loss": 1.8351, + "mean_token_accuracy": 0.5765213370323181, + "num_tokens": 5792873880.0, + "step": 11333 + }, + { + "epoch": 3.0648999459167117, + "grad_norm": 1.1598542928695679, + "learning_rate": 8.193370803418455e-06, + "loss": 1.8063, + "mean_token_accuracy": 0.5804434418678284, + "num_tokens": 5793338942.0, + "step": 11334 + }, + { + "epoch": 3.0651703623580313, + "grad_norm": 1.3366564512252808, + "learning_rate": 8.191872973475734e-06, + "loss": 1.8717, + "mean_token_accuracy": 0.5821325778961182, + "num_tokens": 5793830228.0, + "step": 11335 + }, + { + "epoch": 3.065440778799351, + "grad_norm": 1.0467579364776611, + "learning_rate": 8.190375229694602e-06, + "loss": 1.7813, + "mean_token_accuracy": 0.5869870185852051, + "num_tokens": 5794344395.0, + "step": 11336 + }, + { + "epoch": 3.0657111952406706, + "grad_norm": 1.034710168838501, + "learning_rate": 8.188877572121026e-06, + "loss": 1.9427, + "mean_token_accuracy": 0.5475216507911682, + "num_tokens": 5794868583.0, + "step": 11337 + }, + { + "epoch": 3.0659816116819902, + "grad_norm": 1.0986307859420776, + "learning_rate": 8.18738000080095e-06, + "loss": 1.7467, + "mean_token_accuracy": 0.5960221290588379, + "num_tokens": 5795392837.0, + "step": 11338 + }, + { + "epoch": 3.06625202812331, + "grad_norm": 1.0913896560668945, + "learning_rate": 8.185882515780328e-06, + "loss": 1.9222, + "mean_token_accuracy": 0.5596323013305664, + "num_tokens": 5795893425.0, + "step": 11339 + }, + { + "epoch": 3.0665224445646295, + "grad_norm": 1.0798249244689941, + "learning_rate": 8.184385117105108e-06, + "loss": 1.8974, + "mean_token_accuracy": 0.5518978834152222, + "num_tokens": 5796417521.0, + "step": 11340 + }, + { + "epoch": 3.066792861005949, + "grad_norm": 0.37864434719085693, + "learning_rate": 8.182887804821228e-06, + "loss": 1.159, + "mean_token_accuracy": 0.6993324756622314, + "num_tokens": 5796881846.0, + "step": 11341 + }, + { + "epoch": 3.067063277447269, + "grad_norm": 1.4928754568099976, + "learning_rate": 8.181390578974643e-06, + "loss": 1.8598, + "mean_token_accuracy": 0.5733701586723328, + "num_tokens": 5797405888.0, + "step": 11342 + }, + { + "epoch": 3.0673336938885885, + "grad_norm": 1.5852608680725098, + "learning_rate": 8.179893439611282e-06, + "loss": 1.8795, + "mean_token_accuracy": 0.5768651366233826, + "num_tokens": 5797900670.0, + "step": 11343 + }, + { + "epoch": 3.067604110329908, + "grad_norm": 1.1921026706695557, + "learning_rate": 8.178396386777077e-06, + "loss": 1.922, + "mean_token_accuracy": 0.5567623376846313, + "num_tokens": 5798424945.0, + "step": 11344 + }, + { + "epoch": 3.0678745267712277, + "grad_norm": 1.1420844793319702, + "learning_rate": 8.176899420517974e-06, + "loss": 1.931, + "mean_token_accuracy": 0.5563439726829529, + "num_tokens": 5798949207.0, + "step": 11345 + }, + { + "epoch": 3.0681449432125474, + "grad_norm": 1.1160279512405396, + "learning_rate": 8.175402540879897e-06, + "loss": 1.6357, + "mean_token_accuracy": 0.6226629614830017, + "num_tokens": 5799473476.0, + "step": 11346 + }, + { + "epoch": 3.068415359653867, + "grad_norm": 1.4559599161148071, + "learning_rate": 8.173905747908776e-06, + "loss": 1.8102, + "mean_token_accuracy": 0.5806660652160645, + "num_tokens": 5799997660.0, + "step": 11347 + }, + { + "epoch": 3.0686857760951867, + "grad_norm": 1.0823781490325928, + "learning_rate": 8.172409041650542e-06, + "loss": 1.9297, + "mean_token_accuracy": 0.5673906803131104, + "num_tokens": 5800521848.0, + "step": 11348 + }, + { + "epoch": 3.0689561925365063, + "grad_norm": 1.3722233772277832, + "learning_rate": 8.170912422151112e-06, + "loss": 1.7969, + "mean_token_accuracy": 0.5519787073135376, + "num_tokens": 5801045941.0, + "step": 11349 + }, + { + "epoch": 3.069226608977826, + "grad_norm": 1.4406583309173584, + "learning_rate": 8.169415889456404e-06, + "loss": 1.8545, + "mean_token_accuracy": 0.5598702430725098, + "num_tokens": 5801566551.0, + "step": 11350 + }, + { + "epoch": 3.0694970254191456, + "grad_norm": 1.1754649877548218, + "learning_rate": 8.167919443612346e-06, + "loss": 1.8448, + "mean_token_accuracy": 0.5622347593307495, + "num_tokens": 5802046396.0, + "step": 11351 + }, + { + "epoch": 3.0697674418604652, + "grad_norm": 1.200871229171753, + "learning_rate": 8.166423084664848e-06, + "loss": 1.9156, + "mean_token_accuracy": 0.5706731081008911, + "num_tokens": 5802570656.0, + "step": 11352 + }, + { + "epoch": 3.070037858301785, + "grad_norm": 1.2885240316390991, + "learning_rate": 8.16492681265982e-06, + "loss": 1.7892, + "mean_token_accuracy": 0.5825052857398987, + "num_tokens": 5803036239.0, + "step": 11353 + }, + { + "epoch": 3.0703082747431045, + "grad_norm": 1.2056175470352173, + "learning_rate": 8.163430627643177e-06, + "loss": 1.7957, + "mean_token_accuracy": 0.5903673768043518, + "num_tokens": 5803560497.0, + "step": 11354 + }, + { + "epoch": 3.070578691184424, + "grad_norm": 1.046247124671936, + "learning_rate": 8.161934529660824e-06, + "loss": 1.8416, + "mean_token_accuracy": 0.576080322265625, + "num_tokens": 5804075762.0, + "step": 11355 + }, + { + "epoch": 3.070849107625744, + "grad_norm": 1.2077981233596802, + "learning_rate": 8.160438518758662e-06, + "loss": 1.8526, + "mean_token_accuracy": 0.5756974220275879, + "num_tokens": 5804596550.0, + "step": 11356 + }, + { + "epoch": 3.0711195240670635, + "grad_norm": 1.4452701807022095, + "learning_rate": 8.158942594982602e-06, + "loss": 1.9127, + "mean_token_accuracy": 0.559679388999939, + "num_tokens": 5805067887.0, + "step": 11357 + }, + { + "epoch": 3.071389940508383, + "grad_norm": 1.1382535696029663, + "learning_rate": 8.157446758378537e-06, + "loss": 1.8476, + "mean_token_accuracy": 0.5711715221405029, + "num_tokens": 5805592116.0, + "step": 11358 + }, + { + "epoch": 3.0716603569497027, + "grad_norm": 1.4099267721176147, + "learning_rate": 8.155951008992361e-06, + "loss": 1.8608, + "mean_token_accuracy": 0.5860071182250977, + "num_tokens": 5806058718.0, + "step": 11359 + }, + { + "epoch": 3.0719307733910224, + "grad_norm": 1.1708366870880127, + "learning_rate": 8.154455346869977e-06, + "loss": 1.8606, + "mean_token_accuracy": 0.5794782042503357, + "num_tokens": 5806582994.0, + "step": 11360 + }, + { + "epoch": 3.0722011898323416, + "grad_norm": 0.4454900920391083, + "learning_rate": 8.152959772057264e-06, + "loss": 1.1134, + "mean_token_accuracy": 0.7093133926391602, + "num_tokens": 5807094865.0, + "step": 11361 + }, + { + "epoch": 3.0724716062736612, + "grad_norm": 1.30613374710083, + "learning_rate": 8.151464284600126e-06, + "loss": 1.8635, + "mean_token_accuracy": 0.5362696051597595, + "num_tokens": 5807619016.0, + "step": 11362 + }, + { + "epoch": 3.072742022714981, + "grad_norm": 1.5740950107574463, + "learning_rate": 8.149968884544439e-06, + "loss": 1.874, + "mean_token_accuracy": 0.5718438625335693, + "num_tokens": 5808143296.0, + "step": 11363 + }, + { + "epoch": 3.0730124391563005, + "grad_norm": 1.2775710821151733, + "learning_rate": 8.148473571936085e-06, + "loss": 1.8846, + "mean_token_accuracy": 0.5721100568771362, + "num_tokens": 5808650678.0, + "step": 11364 + }, + { + "epoch": 3.07328285559762, + "grad_norm": 1.011741280555725, + "learning_rate": 8.14697834682095e-06, + "loss": 1.8928, + "mean_token_accuracy": 0.5653191804885864, + "num_tokens": 5809127987.0, + "step": 11365 + }, + { + "epoch": 3.07355327203894, + "grad_norm": 1.2688024044036865, + "learning_rate": 8.14548320924491e-06, + "loss": 1.9505, + "mean_token_accuracy": 0.5431989431381226, + "num_tokens": 5809652114.0, + "step": 11366 + }, + { + "epoch": 3.0738236884802594, + "grad_norm": 1.6193286180496216, + "learning_rate": 8.143988159253837e-06, + "loss": 1.8903, + "mean_token_accuracy": 0.570942759513855, + "num_tokens": 5810176241.0, + "step": 11367 + }, + { + "epoch": 3.074094104921579, + "grad_norm": 1.1189748048782349, + "learning_rate": 8.142493196893608e-06, + "loss": 1.9186, + "mean_token_accuracy": 0.5764113664627075, + "num_tokens": 5810696007.0, + "step": 11368 + }, + { + "epoch": 3.0743645213628987, + "grad_norm": 0.9820084571838379, + "learning_rate": 8.140998322210091e-06, + "loss": 1.8412, + "mean_token_accuracy": 0.5872420072555542, + "num_tokens": 5811220244.0, + "step": 11369 + }, + { + "epoch": 3.0746349378042184, + "grad_norm": 1.0289849042892456, + "learning_rate": 8.139503535249151e-06, + "loss": 1.9229, + "mean_token_accuracy": 0.5584723949432373, + "num_tokens": 5811744443.0, + "step": 11370 + }, + { + "epoch": 3.074905354245538, + "grad_norm": 1.17972731590271, + "learning_rate": 8.138008836056658e-06, + "loss": 1.8191, + "mean_token_accuracy": 0.588529646396637, + "num_tokens": 5812268656.0, + "step": 11371 + }, + { + "epoch": 3.0751757706868577, + "grad_norm": 1.1684273481369019, + "learning_rate": 8.136514224678472e-06, + "loss": 1.9775, + "mean_token_accuracy": 0.5561926960945129, + "num_tokens": 5812775027.0, + "step": 11372 + }, + { + "epoch": 3.0754461871281773, + "grad_norm": 1.0134516954421997, + "learning_rate": 8.135019701160444e-06, + "loss": 1.8268, + "mean_token_accuracy": 0.5848069190979004, + "num_tokens": 5813299289.0, + "step": 11373 + }, + { + "epoch": 3.075716603569497, + "grad_norm": 0.9615053534507751, + "learning_rate": 8.133525265548442e-06, + "loss": 1.8369, + "mean_token_accuracy": 0.588779091835022, + "num_tokens": 5813763484.0, + "step": 11374 + }, + { + "epoch": 3.0759870200108166, + "grad_norm": 1.0144143104553223, + "learning_rate": 8.132030917888314e-06, + "loss": 1.8262, + "mean_token_accuracy": 0.5730876326560974, + "num_tokens": 5814287762.0, + "step": 11375 + }, + { + "epoch": 3.0762574364521362, + "grad_norm": 1.1370373964309692, + "learning_rate": 8.130536658225906e-06, + "loss": 1.8989, + "mean_token_accuracy": 0.5645800232887268, + "num_tokens": 5814812000.0, + "step": 11376 + }, + { + "epoch": 3.076527852893456, + "grad_norm": 1.1653324365615845, + "learning_rate": 8.129042486607082e-06, + "loss": 1.8055, + "mean_token_accuracy": 0.564530611038208, + "num_tokens": 5815336134.0, + "step": 11377 + }, + { + "epoch": 3.0767982693347755, + "grad_norm": 1.1513475179672241, + "learning_rate": 8.127548403077671e-06, + "loss": 1.9691, + "mean_token_accuracy": 0.5411942005157471, + "num_tokens": 5815860390.0, + "step": 11378 + }, + { + "epoch": 3.077068685776095, + "grad_norm": 1.291558027267456, + "learning_rate": 8.126054407683521e-06, + "loss": 1.8936, + "mean_token_accuracy": 0.5651880502700806, + "num_tokens": 5816384517.0, + "step": 11379 + }, + { + "epoch": 3.077339102217415, + "grad_norm": 1.3783786296844482, + "learning_rate": 8.124560500470478e-06, + "loss": 1.7901, + "mean_token_accuracy": 0.5949410200119019, + "num_tokens": 5816846439.0, + "step": 11380 + }, + { + "epoch": 3.0776095186587344, + "grad_norm": 0.4023644030094147, + "learning_rate": 8.123066681484375e-06, + "loss": 1.1594, + "mean_token_accuracy": 0.7004703283309937, + "num_tokens": 5817344419.0, + "step": 11381 + }, + { + "epoch": 3.077879935100054, + "grad_norm": 1.3563735485076904, + "learning_rate": 8.121572950771041e-06, + "loss": 1.8556, + "mean_token_accuracy": 0.5848518013954163, + "num_tokens": 5817868668.0, + "step": 11382 + }, + { + "epoch": 3.0781503515413737, + "grad_norm": 1.4637761116027832, + "learning_rate": 8.120079308376318e-06, + "loss": 1.875, + "mean_token_accuracy": 0.5809754729270935, + "num_tokens": 5818392926.0, + "step": 11383 + }, + { + "epoch": 3.0784207679826934, + "grad_norm": 1.0566425323486328, + "learning_rate": 8.118585754346033e-06, + "loss": 1.8467, + "mean_token_accuracy": 0.5698281526565552, + "num_tokens": 5818917151.0, + "step": 11384 + }, + { + "epoch": 3.078691184424013, + "grad_norm": 1.0560299158096313, + "learning_rate": 8.117092288726005e-06, + "loss": 1.8181, + "mean_token_accuracy": 0.5772947669029236, + "num_tokens": 5819441397.0, + "step": 11385 + }, + { + "epoch": 3.0789616008653327, + "grad_norm": 1.1304116249084473, + "learning_rate": 8.11559891156207e-06, + "loss": 1.8652, + "mean_token_accuracy": 0.5580174922943115, + "num_tokens": 5819965566.0, + "step": 11386 + }, + { + "epoch": 3.0792320173066523, + "grad_norm": 1.2642090320587158, + "learning_rate": 8.114105622900036e-06, + "loss": 1.8965, + "mean_token_accuracy": 0.5639146566390991, + "num_tokens": 5820489740.0, + "step": 11387 + }, + { + "epoch": 3.079502433747972, + "grad_norm": 1.0613383054733276, + "learning_rate": 8.112612422785735e-06, + "loss": 1.8815, + "mean_token_accuracy": 0.573318362236023, + "num_tokens": 5821006502.0, + "step": 11388 + }, + { + "epoch": 3.0797728501892916, + "grad_norm": 1.1564466953277588, + "learning_rate": 8.111119311264972e-06, + "loss": 1.8845, + "mean_token_accuracy": 0.5874168872833252, + "num_tokens": 5821484707.0, + "step": 11389 + }, + { + "epoch": 3.0800432666306112, + "grad_norm": 1.10036301612854, + "learning_rate": 8.109626288383565e-06, + "loss": 1.8161, + "mean_token_accuracy": 0.5828951597213745, + "num_tokens": 5821961858.0, + "step": 11390 + }, + { + "epoch": 3.080313683071931, + "grad_norm": 1.212014079093933, + "learning_rate": 8.108133354187326e-06, + "loss": 1.8847, + "mean_token_accuracy": 0.5502665638923645, + "num_tokens": 5822486010.0, + "step": 11391 + }, + { + "epoch": 3.0805840995132505, + "grad_norm": 1.0826597213745117, + "learning_rate": 8.10664050872206e-06, + "loss": 1.8678, + "mean_token_accuracy": 0.5788500308990479, + "num_tokens": 5822954326.0, + "step": 11392 + }, + { + "epoch": 3.08085451595457, + "grad_norm": 1.0232230424880981, + "learning_rate": 8.105147752033567e-06, + "loss": 1.9327, + "mean_token_accuracy": 0.5506900548934937, + "num_tokens": 5823478599.0, + "step": 11393 + }, + { + "epoch": 3.08112493239589, + "grad_norm": 1.1884863376617432, + "learning_rate": 8.103655084167658e-06, + "loss": 1.8997, + "mean_token_accuracy": 0.577576220035553, + "num_tokens": 5823981086.0, + "step": 11394 + }, + { + "epoch": 3.0813953488372094, + "grad_norm": 1.0335215330123901, + "learning_rate": 8.102162505170127e-06, + "loss": 1.8615, + "mean_token_accuracy": 0.5829629898071289, + "num_tokens": 5824469975.0, + "step": 11395 + }, + { + "epoch": 3.081665765278529, + "grad_norm": 1.0770267248153687, + "learning_rate": 8.100670015086773e-06, + "loss": 1.7981, + "mean_token_accuracy": 0.6062659025192261, + "num_tokens": 5824929946.0, + "step": 11396 + }, + { + "epoch": 3.0819361817198487, + "grad_norm": 1.0412800312042236, + "learning_rate": 8.099177613963391e-06, + "loss": 1.7996, + "mean_token_accuracy": 0.5839314460754395, + "num_tokens": 5825427743.0, + "step": 11397 + }, + { + "epoch": 3.0822065981611684, + "grad_norm": 1.1041311025619507, + "learning_rate": 8.09768530184577e-06, + "loss": 1.8634, + "mean_token_accuracy": 0.570953905582428, + "num_tokens": 5825952008.0, + "step": 11398 + }, + { + "epoch": 3.082477014602488, + "grad_norm": 1.0352039337158203, + "learning_rate": 8.096193078779695e-06, + "loss": 1.8409, + "mean_token_accuracy": 0.5881311893463135, + "num_tokens": 5826458861.0, + "step": 11399 + }, + { + "epoch": 3.0827474310438077, + "grad_norm": 0.9275801777839661, + "learning_rate": 8.094700944810962e-06, + "loss": 1.9091, + "mean_token_accuracy": 0.5495049357414246, + "num_tokens": 5826983089.0, + "step": 11400 + }, + { + "epoch": 3.0830178474851273, + "grad_norm": 0.4281267523765564, + "learning_rate": 8.093208899985345e-06, + "loss": 1.1524, + "mean_token_accuracy": 0.6813184022903442, + "num_tokens": 5827496035.0, + "step": 11401 + }, + { + "epoch": 3.0832882639264465, + "grad_norm": 1.2884271144866943, + "learning_rate": 8.091716944348626e-06, + "loss": 1.851, + "mean_token_accuracy": 0.5770748853683472, + "num_tokens": 5828020280.0, + "step": 11402 + }, + { + "epoch": 3.083558680367766, + "grad_norm": 1.1592670679092407, + "learning_rate": 8.090225077946585e-06, + "loss": 1.8846, + "mean_token_accuracy": 0.5686877369880676, + "num_tokens": 5828544400.0, + "step": 11403 + }, + { + "epoch": 3.083829096809086, + "grad_norm": 1.257613182067871, + "learning_rate": 8.088733300824998e-06, + "loss": 1.9047, + "mean_token_accuracy": 0.5842328071594238, + "num_tokens": 5829027049.0, + "step": 11404 + }, + { + "epoch": 3.0840995132504054, + "grad_norm": 0.9727849364280701, + "learning_rate": 8.087241613029631e-06, + "loss": 1.7457, + "mean_token_accuracy": 0.5917184352874756, + "num_tokens": 5829551258.0, + "step": 11405 + }, + { + "epoch": 3.084369929691725, + "grad_norm": 1.3310909271240234, + "learning_rate": 8.08575001460626e-06, + "loss": 1.8541, + "mean_token_accuracy": 0.5746970176696777, + "num_tokens": 5830075356.0, + "step": 11406 + }, + { + "epoch": 3.0846403461330447, + "grad_norm": 1.2168906927108765, + "learning_rate": 8.084258505600648e-06, + "loss": 1.8774, + "mean_token_accuracy": 0.5793753266334534, + "num_tokens": 5830599532.0, + "step": 11407 + }, + { + "epoch": 3.0849107625743644, + "grad_norm": 1.0278992652893066, + "learning_rate": 8.082767086058559e-06, + "loss": 1.8831, + "mean_token_accuracy": 0.5760126113891602, + "num_tokens": 5831123772.0, + "step": 11408 + }, + { + "epoch": 3.085181179015684, + "grad_norm": 1.1643182039260864, + "learning_rate": 8.081275756025752e-06, + "loss": 1.8635, + "mean_token_accuracy": 0.5815255641937256, + "num_tokens": 5831610065.0, + "step": 11409 + }, + { + "epoch": 3.0854515954570036, + "grad_norm": 1.1807507276535034, + "learning_rate": 8.07978451554799e-06, + "loss": 1.9432, + "mean_token_accuracy": 0.562856912612915, + "num_tokens": 5832134286.0, + "step": 11410 + }, + { + "epoch": 3.0857220118983233, + "grad_norm": 1.0873583555221558, + "learning_rate": 8.07829336467103e-06, + "loss": 1.7878, + "mean_token_accuracy": 0.5709983706474304, + "num_tokens": 5832658492.0, + "step": 11411 + }, + { + "epoch": 3.085992428339643, + "grad_norm": 1.0328494310379028, + "learning_rate": 8.076802303440621e-06, + "loss": 1.7893, + "mean_token_accuracy": 0.599717378616333, + "num_tokens": 5833153552.0, + "step": 11412 + }, + { + "epoch": 3.0862628447809626, + "grad_norm": 1.2176368236541748, + "learning_rate": 8.07531133190251e-06, + "loss": 1.895, + "mean_token_accuracy": 0.5524699687957764, + "num_tokens": 5833677813.0, + "step": 11413 + }, + { + "epoch": 3.086533261222282, + "grad_norm": 1.1104484796524048, + "learning_rate": 8.073820450102455e-06, + "loss": 1.7903, + "mean_token_accuracy": 0.5670861005783081, + "num_tokens": 5834202092.0, + "step": 11414 + }, + { + "epoch": 3.086803677663602, + "grad_norm": 1.080761194229126, + "learning_rate": 8.072329658086189e-06, + "loss": 2.0056, + "mean_token_accuracy": 0.5342206954956055, + "num_tokens": 5834726374.0, + "step": 11415 + }, + { + "epoch": 3.0870740941049215, + "grad_norm": 1.0422059297561646, + "learning_rate": 8.07083895589946e-06, + "loss": 1.89, + "mean_token_accuracy": 0.5739234685897827, + "num_tokens": 5835239867.0, + "step": 11416 + }, + { + "epoch": 3.087344510546241, + "grad_norm": 1.229609727859497, + "learning_rate": 8.069348343588004e-06, + "loss": 1.7881, + "mean_token_accuracy": 0.5977355241775513, + "num_tokens": 5835684767.0, + "step": 11417 + }, + { + "epoch": 3.087614926987561, + "grad_norm": 1.4187692403793335, + "learning_rate": 8.067857821197565e-06, + "loss": 1.9484, + "mean_token_accuracy": 0.5682690143585205, + "num_tokens": 5836204788.0, + "step": 11418 + }, + { + "epoch": 3.0878853434288804, + "grad_norm": 1.3000575304031372, + "learning_rate": 8.066367388773863e-06, + "loss": 1.8759, + "mean_token_accuracy": 0.5588200092315674, + "num_tokens": 5836728994.0, + "step": 11419 + }, + { + "epoch": 3.0881557598702, + "grad_norm": 1.189170241355896, + "learning_rate": 8.064877046362643e-06, + "loss": 1.9287, + "mean_token_accuracy": 0.5622488260269165, + "num_tokens": 5837243249.0, + "step": 11420 + }, + { + "epoch": 3.0884261763115197, + "grad_norm": 0.4167768061161041, + "learning_rate": 8.063386794009626e-06, + "loss": 1.1571, + "mean_token_accuracy": 0.6662851572036743, + "num_tokens": 5837767460.0, + "step": 11421 + }, + { + "epoch": 3.0886965927528394, + "grad_norm": 1.5413800477981567, + "learning_rate": 8.061896631760536e-06, + "loss": 1.8528, + "mean_token_accuracy": 0.578917384147644, + "num_tokens": 5838254824.0, + "step": 11422 + }, + { + "epoch": 3.088967009194159, + "grad_norm": 1.2827085256576538, + "learning_rate": 8.060406559661095e-06, + "loss": 1.9136, + "mean_token_accuracy": 0.5597366094589233, + "num_tokens": 5838778902.0, + "step": 11423 + }, + { + "epoch": 3.0892374256354787, + "grad_norm": 1.1562165021896362, + "learning_rate": 8.058916577757032e-06, + "loss": 1.8418, + "mean_token_accuracy": 0.5650351047515869, + "num_tokens": 5839297409.0, + "step": 11424 + }, + { + "epoch": 3.0895078420767983, + "grad_norm": 0.9927552938461304, + "learning_rate": 8.057426686094053e-06, + "loss": 1.6387, + "mean_token_accuracy": 0.6197824478149414, + "num_tokens": 5839821685.0, + "step": 11425 + }, + { + "epoch": 3.089778258518118, + "grad_norm": 1.1386466026306152, + "learning_rate": 8.05593688471788e-06, + "loss": 1.8785, + "mean_token_accuracy": 0.5446937084197998, + "num_tokens": 5840288323.0, + "step": 11426 + }, + { + "epoch": 3.0900486749594376, + "grad_norm": 1.0824358463287354, + "learning_rate": 8.05444717367422e-06, + "loss": 1.7009, + "mean_token_accuracy": 0.5906075239181519, + "num_tokens": 5840788657.0, + "step": 11427 + }, + { + "epoch": 3.0903190914007572, + "grad_norm": 0.9130866527557373, + "learning_rate": 8.052957553008782e-06, + "loss": 1.829, + "mean_token_accuracy": 0.5784375667572021, + "num_tokens": 5841312880.0, + "step": 11428 + }, + { + "epoch": 3.090589507842077, + "grad_norm": 0.9499028921127319, + "learning_rate": 8.051468022767275e-06, + "loss": 1.8323, + "mean_token_accuracy": 0.5810331702232361, + "num_tokens": 5841837049.0, + "step": 11429 + }, + { + "epoch": 3.0908599242833965, + "grad_norm": 1.2121068239212036, + "learning_rate": 8.049978582995397e-06, + "loss": 1.7541, + "mean_token_accuracy": 0.5831975936889648, + "num_tokens": 5842360994.0, + "step": 11430 + }, + { + "epoch": 3.091130340724716, + "grad_norm": 0.9415381550788879, + "learning_rate": 8.048489233738853e-06, + "loss": 1.9185, + "mean_token_accuracy": 0.559258222579956, + "num_tokens": 5842885217.0, + "step": 11431 + }, + { + "epoch": 3.091400757166036, + "grad_norm": 1.0289177894592285, + "learning_rate": 8.04699997504334e-06, + "loss": 1.8867, + "mean_token_accuracy": 0.5881085991859436, + "num_tokens": 5843344788.0, + "step": 11432 + }, + { + "epoch": 3.0916711736073554, + "grad_norm": 1.3395565748214722, + "learning_rate": 8.045510806954548e-06, + "loss": 1.8014, + "mean_token_accuracy": 0.566609263420105, + "num_tokens": 5843832934.0, + "step": 11433 + }, + { + "epoch": 3.091941590048675, + "grad_norm": 1.1040440797805786, + "learning_rate": 8.044021729518178e-06, + "loss": 1.9507, + "mean_token_accuracy": 0.5639638900756836, + "num_tokens": 5844357164.0, + "step": 11434 + }, + { + "epoch": 3.0922120064899947, + "grad_norm": 1.026442289352417, + "learning_rate": 8.042532742779916e-06, + "loss": 1.7751, + "mean_token_accuracy": 0.6007654070854187, + "num_tokens": 5844840989.0, + "step": 11435 + }, + { + "epoch": 3.0924824229313144, + "grad_norm": 1.1169153451919556, + "learning_rate": 8.041043846785442e-06, + "loss": 1.8505, + "mean_token_accuracy": 0.5737478733062744, + "num_tokens": 5845365265.0, + "step": 11436 + }, + { + "epoch": 3.092752839372634, + "grad_norm": 1.1120654344558716, + "learning_rate": 8.03955504158045e-06, + "loss": 1.9375, + "mean_token_accuracy": 0.568790853023529, + "num_tokens": 5845889459.0, + "step": 11437 + }, + { + "epoch": 3.0930232558139537, + "grad_norm": 1.3344157934188843, + "learning_rate": 8.03806632721061e-06, + "loss": 1.8841, + "mean_token_accuracy": 0.5834729075431824, + "num_tokens": 5846344394.0, + "step": 11438 + }, + { + "epoch": 3.0932936722552733, + "grad_norm": 1.237106204032898, + "learning_rate": 8.036577703721604e-06, + "loss": 1.8812, + "mean_token_accuracy": 0.5650538206100464, + "num_tokens": 5846868578.0, + "step": 11439 + }, + { + "epoch": 3.093564088696593, + "grad_norm": 1.2389299869537354, + "learning_rate": 8.035089171159114e-06, + "loss": 1.8727, + "mean_token_accuracy": 0.5775173902511597, + "num_tokens": 5847392755.0, + "step": 11440 + }, + { + "epoch": 3.0938345051379126, + "grad_norm": 0.4421243369579315, + "learning_rate": 8.033600729568807e-06, + "loss": 1.0274, + "mean_token_accuracy": 0.724983811378479, + "num_tokens": 5847909342.0, + "step": 11441 + }, + { + "epoch": 3.0941049215792322, + "grad_norm": 1.8733640909194946, + "learning_rate": 8.032112378996349e-06, + "loss": 1.8543, + "mean_token_accuracy": 0.5852093696594238, + "num_tokens": 5848433621.0, + "step": 11442 + }, + { + "epoch": 3.0943753380205514, + "grad_norm": 1.5616681575775146, + "learning_rate": 8.030624119487416e-06, + "loss": 1.8271, + "mean_token_accuracy": 0.594357430934906, + "num_tokens": 5848916234.0, + "step": 11443 + }, + { + "epoch": 3.094645754461871, + "grad_norm": 1.1566805839538574, + "learning_rate": 8.029135951087663e-06, + "loss": 1.7826, + "mean_token_accuracy": 0.5702356100082397, + "num_tokens": 5849440389.0, + "step": 11444 + }, + { + "epoch": 3.0949161709031907, + "grad_norm": 1.381395936012268, + "learning_rate": 8.027647873842756e-06, + "loss": 1.8822, + "mean_token_accuracy": 0.5684414505958557, + "num_tokens": 5849964671.0, + "step": 11445 + }, + { + "epoch": 3.0951865873445104, + "grad_norm": 1.4847939014434814, + "learning_rate": 8.026159887798357e-06, + "loss": 2.0287, + "mean_token_accuracy": 0.531517505645752, + "num_tokens": 5850488948.0, + "step": 11446 + }, + { + "epoch": 3.09545700378583, + "grad_norm": 1.4707801342010498, + "learning_rate": 8.02467199300012e-06, + "loss": 1.9036, + "mean_token_accuracy": 0.5681425333023071, + "num_tokens": 5851013052.0, + "step": 11447 + }, + { + "epoch": 3.0957274202271496, + "grad_norm": 1.6717662811279297, + "learning_rate": 8.023184189493689e-06, + "loss": 1.8152, + "mean_token_accuracy": 0.5803956985473633, + "num_tokens": 5851537057.0, + "step": 11448 + }, + { + "epoch": 3.0959978366684693, + "grad_norm": 1.3175654411315918, + "learning_rate": 8.021696477324727e-06, + "loss": 1.9408, + "mean_token_accuracy": 0.5527699589729309, + "num_tokens": 5852061246.0, + "step": 11449 + }, + { + "epoch": 3.096268253109789, + "grad_norm": 1.6933902502059937, + "learning_rate": 8.020208856538873e-06, + "loss": 1.845, + "mean_token_accuracy": 0.578513503074646, + "num_tokens": 5852573695.0, + "step": 11450 + }, + { + "epoch": 3.0965386695511086, + "grad_norm": 1.3500604629516602, + "learning_rate": 8.018721327181776e-06, + "loss": 1.7354, + "mean_token_accuracy": 0.5907350778579712, + "num_tokens": 5853097906.0, + "step": 11451 + }, + { + "epoch": 3.096809085992428, + "grad_norm": 1.2134562730789185, + "learning_rate": 8.017233889299073e-06, + "loss": 1.8524, + "mean_token_accuracy": 0.5601844787597656, + "num_tokens": 5853591486.0, + "step": 11452 + }, + { + "epoch": 3.097079502433748, + "grad_norm": 1.282581090927124, + "learning_rate": 8.015746542936409e-06, + "loss": 1.9072, + "mean_token_accuracy": 0.5795819163322449, + "num_tokens": 5854086691.0, + "step": 11453 + }, + { + "epoch": 3.0973499188750675, + "grad_norm": 1.6536504030227661, + "learning_rate": 8.014259288139414e-06, + "loss": 1.8697, + "mean_token_accuracy": 0.5695239305496216, + "num_tokens": 5854610968.0, + "step": 11454 + }, + { + "epoch": 3.097620335316387, + "grad_norm": 1.676833987236023, + "learning_rate": 8.01277212495373e-06, + "loss": 1.7894, + "mean_token_accuracy": 0.5802192687988281, + "num_tokens": 5855135183.0, + "step": 11455 + }, + { + "epoch": 3.097890751757707, + "grad_norm": 1.3500090837478638, + "learning_rate": 8.011285053424975e-06, + "loss": 1.7845, + "mean_token_accuracy": 0.5887256860733032, + "num_tokens": 5855659184.0, + "step": 11456 + }, + { + "epoch": 3.0981611681990264, + "grad_norm": 1.1522119045257568, + "learning_rate": 8.00979807359879e-06, + "loss": 1.6706, + "mean_token_accuracy": 0.6015869379043579, + "num_tokens": 5856183367.0, + "step": 11457 + }, + { + "epoch": 3.098431584640346, + "grad_norm": 1.1588367223739624, + "learning_rate": 8.008311185520793e-06, + "loss": 1.8378, + "mean_token_accuracy": 0.5685117244720459, + "num_tokens": 5856707614.0, + "step": 11458 + }, + { + "epoch": 3.0987020010816657, + "grad_norm": 1.4406198263168335, + "learning_rate": 8.006824389236603e-06, + "loss": 1.9551, + "mean_token_accuracy": 0.5434194207191467, + "num_tokens": 5857231698.0, + "step": 11459 + }, + { + "epoch": 3.0989724175229854, + "grad_norm": 1.1340974569320679, + "learning_rate": 8.005337684791846e-06, + "loss": 1.8334, + "mean_token_accuracy": 0.5640910863876343, + "num_tokens": 5857755983.0, + "step": 11460 + }, + { + "epoch": 3.099242833964305, + "grad_norm": 0.3973872661590576, + "learning_rate": 8.003851072232135e-06, + "loss": 1.1756, + "mean_token_accuracy": 0.684944748878479, + "num_tokens": 5858224152.0, + "step": 11461 + }, + { + "epoch": 3.0995132504056246, + "grad_norm": 1.3755663633346558, + "learning_rate": 8.002364551603081e-06, + "loss": 1.9236, + "mean_token_accuracy": 0.5585472583770752, + "num_tokens": 5858748301.0, + "step": 11462 + }, + { + "epoch": 3.0997836668469443, + "grad_norm": 1.337103247642517, + "learning_rate": 8.000878122950305e-06, + "loss": 1.7955, + "mean_token_accuracy": 0.579590380191803, + "num_tokens": 5859272470.0, + "step": 11463 + }, + { + "epoch": 3.100054083288264, + "grad_norm": 1.0630789995193481, + "learning_rate": 7.999391786319403e-06, + "loss": 1.9149, + "mean_token_accuracy": 0.5574461221694946, + "num_tokens": 5859796566.0, + "step": 11464 + }, + { + "epoch": 3.1003244997295836, + "grad_norm": 0.8961224555969238, + "learning_rate": 7.997905541755985e-06, + "loss": 1.6639, + "mean_token_accuracy": 0.6133193373680115, + "num_tokens": 5860288959.0, + "step": 11465 + }, + { + "epoch": 3.100594916170903, + "grad_norm": 1.2100582122802734, + "learning_rate": 7.996419389305654e-06, + "loss": 2.0158, + "mean_token_accuracy": 0.5408853888511658, + "num_tokens": 5860813238.0, + "step": 11466 + }, + { + "epoch": 3.100865332612223, + "grad_norm": 0.9479857087135315, + "learning_rate": 7.994933329014011e-06, + "loss": 1.8365, + "mean_token_accuracy": 0.6243178844451904, + "num_tokens": 5861211755.0, + "step": 11467 + }, + { + "epoch": 3.1011357490535425, + "grad_norm": 0.9021061658859253, + "learning_rate": 7.99344736092665e-06, + "loss": 1.6715, + "mean_token_accuracy": 0.5854305028915405, + "num_tokens": 5861718350.0, + "step": 11468 + }, + { + "epoch": 3.101406165494862, + "grad_norm": 1.0370426177978516, + "learning_rate": 7.991961485089163e-06, + "loss": 1.8152, + "mean_token_accuracy": 0.5802385807037354, + "num_tokens": 5862242534.0, + "step": 11469 + }, + { + "epoch": 3.101676581936182, + "grad_norm": 1.1361258029937744, + "learning_rate": 7.990475701547148e-06, + "loss": 1.8018, + "mean_token_accuracy": 0.5754681825637817, + "num_tokens": 5862766753.0, + "step": 11470 + }, + { + "epoch": 3.1019469983775014, + "grad_norm": 1.0316427946090698, + "learning_rate": 7.988990010346188e-06, + "loss": 1.8494, + "mean_token_accuracy": 0.5646628141403198, + "num_tokens": 5863290919.0, + "step": 11471 + }, + { + "epoch": 3.102217414818821, + "grad_norm": 0.9905170202255249, + "learning_rate": 7.987504411531867e-06, + "loss": 1.8839, + "mean_token_accuracy": 0.5740576982498169, + "num_tokens": 5863763204.0, + "step": 11472 + }, + { + "epoch": 3.1024878312601407, + "grad_norm": 1.0093488693237305, + "learning_rate": 7.98601890514977e-06, + "loss": 1.9001, + "mean_token_accuracy": 0.5716893672943115, + "num_tokens": 5864287433.0, + "step": 11473 + }, + { + "epoch": 3.1027582477014604, + "grad_norm": 1.1175979375839233, + "learning_rate": 7.984533491245477e-06, + "loss": 1.8468, + "mean_token_accuracy": 0.5628270506858826, + "num_tokens": 5864811626.0, + "step": 11474 + }, + { + "epoch": 3.10302866414278, + "grad_norm": 1.0009251832962036, + "learning_rate": 7.983048169864568e-06, + "loss": 1.9428, + "mean_token_accuracy": 0.5447671413421631, + "num_tokens": 5865335849.0, + "step": 11475 + }, + { + "epoch": 3.1032990805840996, + "grad_norm": 1.1125797033309937, + "learning_rate": 7.981562941052613e-06, + "loss": 1.916, + "mean_token_accuracy": 0.5738750696182251, + "num_tokens": 5865860050.0, + "step": 11476 + }, + { + "epoch": 3.1035694970254193, + "grad_norm": 1.1051859855651855, + "learning_rate": 7.980077804855178e-06, + "loss": 1.8227, + "mean_token_accuracy": 0.5704107284545898, + "num_tokens": 5866384194.0, + "step": 11477 + }, + { + "epoch": 3.103839913466739, + "grad_norm": 0.9906623959541321, + "learning_rate": 7.978592761317841e-06, + "loss": 1.9225, + "mean_token_accuracy": 0.5495564937591553, + "num_tokens": 5866908360.0, + "step": 11478 + }, + { + "epoch": 3.1041103299080586, + "grad_norm": 0.9437917470932007, + "learning_rate": 7.97710781048616e-06, + "loss": 1.8759, + "mean_token_accuracy": 0.5659855008125305, + "num_tokens": 5867432498.0, + "step": 11479 + }, + { + "epoch": 3.104380746349378, + "grad_norm": 1.1136996746063232, + "learning_rate": 7.975622952405703e-06, + "loss": 1.8702, + "mean_token_accuracy": 0.5705851316452026, + "num_tokens": 5867956777.0, + "step": 11480 + }, + { + "epoch": 3.104651162790698, + "grad_norm": 0.5106080770492554, + "learning_rate": 7.974138187122032e-06, + "loss": 1.0709, + "mean_token_accuracy": 0.7190675735473633, + "num_tokens": 5868458215.0, + "step": 11481 + }, + { + "epoch": 3.1049215792320175, + "grad_norm": 1.2595292329788208, + "learning_rate": 7.972653514680692e-06, + "loss": 1.84, + "mean_token_accuracy": 0.5754573345184326, + "num_tokens": 5868982341.0, + "step": 11482 + }, + { + "epoch": 3.105191995673337, + "grad_norm": 1.367861032485962, + "learning_rate": 7.971168935127252e-06, + "loss": 2.0144, + "mean_token_accuracy": 0.5418399572372437, + "num_tokens": 5869506613.0, + "step": 11483 + }, + { + "epoch": 3.1054624121146563, + "grad_norm": 1.0396281480789185, + "learning_rate": 7.969684448507253e-06, + "loss": 1.8778, + "mean_token_accuracy": 0.5590922236442566, + "num_tokens": 5870030865.0, + "step": 11484 + }, + { + "epoch": 3.105732828555976, + "grad_norm": 0.8372063636779785, + "learning_rate": 7.968200054866247e-06, + "loss": 1.8554, + "mean_token_accuracy": 0.5776488780975342, + "num_tokens": 5870555037.0, + "step": 11485 + }, + { + "epoch": 3.1060032449972956, + "grad_norm": 1.2302476167678833, + "learning_rate": 7.96671575424978e-06, + "loss": 1.925, + "mean_token_accuracy": 0.5540259480476379, + "num_tokens": 5871079249.0, + "step": 11486 + }, + { + "epoch": 3.1062736614386153, + "grad_norm": 1.269910216331482, + "learning_rate": 7.965231546703395e-06, + "loss": 1.8654, + "mean_token_accuracy": 0.5738270282745361, + "num_tokens": 5871603521.0, + "step": 11487 + }, + { + "epoch": 3.106544077879935, + "grad_norm": 1.1460639238357544, + "learning_rate": 7.963747432272627e-06, + "loss": 1.7618, + "mean_token_accuracy": 0.5957852602005005, + "num_tokens": 5872067639.0, + "step": 11488 + }, + { + "epoch": 3.1068144943212546, + "grad_norm": 1.2537003755569458, + "learning_rate": 7.96226341100302e-06, + "loss": 1.8718, + "mean_token_accuracy": 0.5674456357955933, + "num_tokens": 5872591716.0, + "step": 11489 + }, + { + "epoch": 3.107084910762574, + "grad_norm": 1.4627689123153687, + "learning_rate": 7.960779482940107e-06, + "loss": 1.8693, + "mean_token_accuracy": 0.5641582608222961, + "num_tokens": 5873013603.0, + "step": 11490 + }, + { + "epoch": 3.107355327203894, + "grad_norm": 1.1723659038543701, + "learning_rate": 7.959295648129411e-06, + "loss": 1.9238, + "mean_token_accuracy": 0.5590342879295349, + "num_tokens": 5873537880.0, + "step": 11491 + }, + { + "epoch": 3.1076257436452135, + "grad_norm": 0.9466039538383484, + "learning_rate": 7.957811906616472e-06, + "loss": 1.7589, + "mean_token_accuracy": 0.5842950344085693, + "num_tokens": 5874036887.0, + "step": 11492 + }, + { + "epoch": 3.107896160086533, + "grad_norm": 0.9718641042709351, + "learning_rate": 7.956328258446808e-06, + "loss": 1.8607, + "mean_token_accuracy": 0.5745254755020142, + "num_tokens": 5874561142.0, + "step": 11493 + }, + { + "epoch": 3.1081665765278528, + "grad_norm": 1.495646357536316, + "learning_rate": 7.954844703665945e-06, + "loss": 1.7803, + "mean_token_accuracy": 0.6001828908920288, + "num_tokens": 5875024739.0, + "step": 11494 + }, + { + "epoch": 3.1084369929691724, + "grad_norm": 0.9967360496520996, + "learning_rate": 7.9533612423194e-06, + "loss": 1.834, + "mean_token_accuracy": 0.5745322704315186, + "num_tokens": 5875541157.0, + "step": 11495 + }, + { + "epoch": 3.108707409410492, + "grad_norm": 1.2501189708709717, + "learning_rate": 7.951877874452694e-06, + "loss": 1.71, + "mean_token_accuracy": 0.6023678779602051, + "num_tokens": 5876040634.0, + "step": 11496 + }, + { + "epoch": 3.1089778258518117, + "grad_norm": 1.2515368461608887, + "learning_rate": 7.950394600111335e-06, + "loss": 1.8015, + "mean_token_accuracy": 0.576850950717926, + "num_tokens": 5876564859.0, + "step": 11497 + }, + { + "epoch": 3.1092482422931313, + "grad_norm": 1.2165712118148804, + "learning_rate": 7.948911419340841e-06, + "loss": 1.8923, + "mean_token_accuracy": 0.563696026802063, + "num_tokens": 5877089024.0, + "step": 11498 + }, + { + "epoch": 3.109518658734451, + "grad_norm": 1.0052684545516968, + "learning_rate": 7.94742833218672e-06, + "loss": 1.9223, + "mean_token_accuracy": 0.5574342608451843, + "num_tokens": 5877613146.0, + "step": 11499 + }, + { + "epoch": 3.1097890751757706, + "grad_norm": 1.0584614276885986, + "learning_rate": 7.945945338694467e-06, + "loss": 1.9258, + "mean_token_accuracy": 0.5712181329727173, + "num_tokens": 5878113471.0, + "step": 11500 + }, + { + "epoch": 3.1100594916170903, + "grad_norm": 0.37986892461776733, + "learning_rate": 7.944462438909598e-06, + "loss": 1.1077, + "mean_token_accuracy": 0.6999174356460571, + "num_tokens": 5878637679.0, + "step": 11501 + }, + { + "epoch": 3.11032990805841, + "grad_norm": 1.3653936386108398, + "learning_rate": 7.942979632877605e-06, + "loss": 1.8213, + "mean_token_accuracy": 0.5665544271469116, + "num_tokens": 5879161954.0, + "step": 11502 + }, + { + "epoch": 3.1106003244997296, + "grad_norm": 1.1998145580291748, + "learning_rate": 7.941496920643987e-06, + "loss": 1.7248, + "mean_token_accuracy": 0.5845227241516113, + "num_tokens": 5879686224.0, + "step": 11503 + }, + { + "epoch": 3.110870740941049, + "grad_norm": 0.8675436973571777, + "learning_rate": 7.940014302254243e-06, + "loss": 1.8476, + "mean_token_accuracy": 0.5672395825386047, + "num_tokens": 5880210454.0, + "step": 11504 + }, + { + "epoch": 3.111141157382369, + "grad_norm": 0.9129371047019958, + "learning_rate": 7.93853177775385e-06, + "loss": 1.7112, + "mean_token_accuracy": 0.6084048748016357, + "num_tokens": 5880694973.0, + "step": 11505 + }, + { + "epoch": 3.1114115738236885, + "grad_norm": 1.084952473640442, + "learning_rate": 7.937049347188313e-06, + "loss": 1.9008, + "mean_token_accuracy": 0.560560405254364, + "num_tokens": 5881219242.0, + "step": 11506 + }, + { + "epoch": 3.111681990265008, + "grad_norm": 1.151436686515808, + "learning_rate": 7.935567010603106e-06, + "loss": 1.9099, + "mean_token_accuracy": 0.5675240755081177, + "num_tokens": 5881703546.0, + "step": 11507 + }, + { + "epoch": 3.111952406706328, + "grad_norm": 1.0839848518371582, + "learning_rate": 7.934084768043715e-06, + "loss": 1.7738, + "mean_token_accuracy": 0.5842794179916382, + "num_tokens": 5882227809.0, + "step": 11508 + }, + { + "epoch": 3.1122228231476474, + "grad_norm": 0.958977997303009, + "learning_rate": 7.93260261955562e-06, + "loss": 1.9802, + "mean_token_accuracy": 0.5440126657485962, + "num_tokens": 5882751955.0, + "step": 11509 + }, + { + "epoch": 3.112493239588967, + "grad_norm": 0.9633509516716003, + "learning_rate": 7.9311205651843e-06, + "loss": 1.8445, + "mean_token_accuracy": 0.5698580741882324, + "num_tokens": 5883276131.0, + "step": 11510 + }, + { + "epoch": 3.1127636560302867, + "grad_norm": 0.9081708788871765, + "learning_rate": 7.92963860497522e-06, + "loss": 1.8872, + "mean_token_accuracy": 0.5766491889953613, + "num_tokens": 5883788958.0, + "step": 11511 + }, + { + "epoch": 3.1130340724716064, + "grad_norm": 1.0812065601348877, + "learning_rate": 7.928156738973865e-06, + "loss": 1.9476, + "mean_token_accuracy": 0.5622428059577942, + "num_tokens": 5884313226.0, + "step": 11512 + }, + { + "epoch": 3.113304488912926, + "grad_norm": 1.2153007984161377, + "learning_rate": 7.92667496722569e-06, + "loss": 1.8762, + "mean_token_accuracy": 0.5671142935752869, + "num_tokens": 5884837414.0, + "step": 11513 + }, + { + "epoch": 3.1135749053542456, + "grad_norm": 1.0099129676818848, + "learning_rate": 7.925193289776162e-06, + "loss": 1.8206, + "mean_token_accuracy": 0.56981360912323, + "num_tokens": 5885361650.0, + "step": 11514 + }, + { + "epoch": 3.1138453217955653, + "grad_norm": 1.0844353437423706, + "learning_rate": 7.923711706670752e-06, + "loss": 1.8618, + "mean_token_accuracy": 0.5387970805168152, + "num_tokens": 5885885857.0, + "step": 11515 + }, + { + "epoch": 3.114115738236885, + "grad_norm": 1.0799891948699951, + "learning_rate": 7.922230217954912e-06, + "loss": 1.7954, + "mean_token_accuracy": 0.5825456380844116, + "num_tokens": 5886374313.0, + "step": 11516 + }, + { + "epoch": 3.1143861546782046, + "grad_norm": 1.1295981407165527, + "learning_rate": 7.920748823674098e-06, + "loss": 1.8851, + "mean_token_accuracy": 0.5621289610862732, + "num_tokens": 5886898548.0, + "step": 11517 + }, + { + "epoch": 3.114656571119524, + "grad_norm": 0.9706839919090271, + "learning_rate": 7.919267523873768e-06, + "loss": 1.7915, + "mean_token_accuracy": 0.5691711902618408, + "num_tokens": 5887422709.0, + "step": 11518 + }, + { + "epoch": 3.114926987560844, + "grad_norm": 1.0109323263168335, + "learning_rate": 7.91778631859937e-06, + "loss": 1.7909, + "mean_token_accuracy": 0.5661799907684326, + "num_tokens": 5887929438.0, + "step": 11519 + }, + { + "epoch": 3.1151974040021635, + "grad_norm": 1.1846760511398315, + "learning_rate": 7.91630520789635e-06, + "loss": 1.8926, + "mean_token_accuracy": 0.5477539896965027, + "num_tokens": 5888453716.0, + "step": 11520 + }, + { + "epoch": 3.115467820443483, + "grad_norm": 0.4637124240398407, + "learning_rate": 7.914824191810158e-06, + "loss": 1.0962, + "mean_token_accuracy": 0.7078486680984497, + "num_tokens": 5888973929.0, + "step": 11521 + }, + { + "epoch": 3.115738236884803, + "grad_norm": 1.562429666519165, + "learning_rate": 7.91334327038623e-06, + "loss": 1.8829, + "mean_token_accuracy": 0.5729456543922424, + "num_tokens": 5889498100.0, + "step": 11522 + }, + { + "epoch": 3.1160086533261224, + "grad_norm": 1.1956804990768433, + "learning_rate": 7.911862443670007e-06, + "loss": 1.7359, + "mean_token_accuracy": 0.6163691878318787, + "num_tokens": 5890022355.0, + "step": 11523 + }, + { + "epoch": 3.116279069767442, + "grad_norm": 0.8779099583625793, + "learning_rate": 7.91038171170693e-06, + "loss": 1.8531, + "mean_token_accuracy": 0.571670413017273, + "num_tokens": 5890546617.0, + "step": 11524 + }, + { + "epoch": 3.1165494862087613, + "grad_norm": 1.1604063510894775, + "learning_rate": 7.90890107454242e-06, + "loss": 1.8583, + "mean_token_accuracy": 0.5760689973831177, + "num_tokens": 5891070894.0, + "step": 11525 + }, + { + "epoch": 3.1168199026500814, + "grad_norm": 1.158458948135376, + "learning_rate": 7.907420532221924e-06, + "loss": 1.9818, + "mean_token_accuracy": 0.5435531139373779, + "num_tokens": 5891558045.0, + "step": 11526 + }, + { + "epoch": 3.1170903190914006, + "grad_norm": 0.9623655080795288, + "learning_rate": 7.90594008479086e-06, + "loss": 1.8911, + "mean_token_accuracy": 0.5667171478271484, + "num_tokens": 5892082269.0, + "step": 11527 + }, + { + "epoch": 3.11736073553272, + "grad_norm": 0.9291481375694275, + "learning_rate": 7.904459732294647e-06, + "loss": 1.8379, + "mean_token_accuracy": 0.5812413692474365, + "num_tokens": 5892573381.0, + "step": 11528 + }, + { + "epoch": 3.11763115197404, + "grad_norm": 1.191317081451416, + "learning_rate": 7.902979474778717e-06, + "loss": 1.9275, + "mean_token_accuracy": 0.5649979710578918, + "num_tokens": 5893097602.0, + "step": 11529 + }, + { + "epoch": 3.1179015684153595, + "grad_norm": 1.3215433359146118, + "learning_rate": 7.901499312288486e-06, + "loss": 1.8738, + "mean_token_accuracy": 0.5631162524223328, + "num_tokens": 5893509627.0, + "step": 11530 + }, + { + "epoch": 3.118171984856679, + "grad_norm": 0.9657028317451477, + "learning_rate": 7.900019244869364e-06, + "loss": 1.7558, + "mean_token_accuracy": 0.5912954807281494, + "num_tokens": 5894033891.0, + "step": 11531 + }, + { + "epoch": 3.1184424012979988, + "grad_norm": 1.0573723316192627, + "learning_rate": 7.89853927256677e-06, + "loss": 1.82, + "mean_token_accuracy": 0.5797531008720398, + "num_tokens": 5894522825.0, + "step": 11532 + }, + { + "epoch": 3.1187128177393184, + "grad_norm": 1.0137733221054077, + "learning_rate": 7.897059395426114e-06, + "loss": 1.7444, + "mean_token_accuracy": 0.5775926113128662, + "num_tokens": 5895047078.0, + "step": 11533 + }, + { + "epoch": 3.118983234180638, + "grad_norm": 1.020761251449585, + "learning_rate": 7.895579613492796e-06, + "loss": 1.7635, + "mean_token_accuracy": 0.5814554691314697, + "num_tokens": 5895571243.0, + "step": 11534 + }, + { + "epoch": 3.1192536506219577, + "grad_norm": 1.0222489833831787, + "learning_rate": 7.894099926812231e-06, + "loss": 1.8831, + "mean_token_accuracy": 0.5510485768318176, + "num_tokens": 5896092217.0, + "step": 11535 + }, + { + "epoch": 3.1195240670632773, + "grad_norm": 1.1327518224716187, + "learning_rate": 7.892620335429812e-06, + "loss": 1.9479, + "mean_token_accuracy": 0.5586122870445251, + "num_tokens": 5896616501.0, + "step": 11536 + }, + { + "epoch": 3.119794483504597, + "grad_norm": 1.1370631456375122, + "learning_rate": 7.891140839390936e-06, + "loss": 1.7523, + "mean_token_accuracy": 0.6183319091796875, + "num_tokens": 5897130044.0, + "step": 11537 + }, + { + "epoch": 3.1200648999459166, + "grad_norm": 0.9964814186096191, + "learning_rate": 7.889661438741005e-06, + "loss": 1.87, + "mean_token_accuracy": 0.5732876062393188, + "num_tokens": 5897654268.0, + "step": 11538 + }, + { + "epoch": 3.1203353163872363, + "grad_norm": 1.0170576572418213, + "learning_rate": 7.888182133525409e-06, + "loss": 1.6595, + "mean_token_accuracy": 0.5926549434661865, + "num_tokens": 5898178485.0, + "step": 11539 + }, + { + "epoch": 3.120605732828556, + "grad_norm": 1.1210623979568481, + "learning_rate": 7.88670292378953e-06, + "loss": 1.9002, + "mean_token_accuracy": 0.5438498854637146, + "num_tokens": 5898702510.0, + "step": 11540 + }, + { + "epoch": 3.1208761492698756, + "grad_norm": 0.3631386458873749, + "learning_rate": 7.885223809578768e-06, + "loss": 1.1127, + "mean_token_accuracy": 0.6986749768257141, + "num_tokens": 5899226715.0, + "step": 11541 + }, + { + "epoch": 3.121146565711195, + "grad_norm": 1.3282357454299927, + "learning_rate": 7.8837447909385e-06, + "loss": 1.8548, + "mean_token_accuracy": 0.5778326988220215, + "num_tokens": 5899750794.0, + "step": 11542 + }, + { + "epoch": 3.121416982152515, + "grad_norm": 1.0387815237045288, + "learning_rate": 7.882265867914098e-06, + "loss": 1.8391, + "mean_token_accuracy": 0.5659695863723755, + "num_tokens": 5900275068.0, + "step": 11543 + }, + { + "epoch": 3.1216873985938345, + "grad_norm": 1.0467184782028198, + "learning_rate": 7.880787040550954e-06, + "loss": 1.8409, + "mean_token_accuracy": 0.5538081526756287, + "num_tokens": 5900799323.0, + "step": 11544 + }, + { + "epoch": 3.121957815035154, + "grad_norm": 1.145164966583252, + "learning_rate": 7.879308308894438e-06, + "loss": 1.7618, + "mean_token_accuracy": 0.579535722732544, + "num_tokens": 5901297682.0, + "step": 11545 + }, + { + "epoch": 3.1222282314764738, + "grad_norm": 1.1940224170684814, + "learning_rate": 7.877829672989913e-06, + "loss": 1.8659, + "mean_token_accuracy": 0.5713827013969421, + "num_tokens": 5901821889.0, + "step": 11546 + }, + { + "epoch": 3.1224986479177934, + "grad_norm": 1.0757852792739868, + "learning_rate": 7.876351132882763e-06, + "loss": 1.8646, + "mean_token_accuracy": 0.5638653039932251, + "num_tokens": 5902346057.0, + "step": 11547 + }, + { + "epoch": 3.122769064359113, + "grad_norm": 1.0516602993011475, + "learning_rate": 7.87487268861834e-06, + "loss": 1.9476, + "mean_token_accuracy": 0.5658508539199829, + "num_tokens": 5902870149.0, + "step": 11548 + }, + { + "epoch": 3.1230394808004327, + "grad_norm": 1.137856364250183, + "learning_rate": 7.87339434024202e-06, + "loss": 1.8774, + "mean_token_accuracy": 0.5566468834877014, + "num_tokens": 5903394377.0, + "step": 11549 + }, + { + "epoch": 3.1233098972417523, + "grad_norm": 1.2235143184661865, + "learning_rate": 7.871916087799151e-06, + "loss": 1.7377, + "mean_token_accuracy": 0.5677502155303955, + "num_tokens": 5903881477.0, + "step": 11550 + }, + { + "epoch": 3.123580313683072, + "grad_norm": 1.090156078338623, + "learning_rate": 7.8704379313351e-06, + "loss": 1.8337, + "mean_token_accuracy": 0.5741986036300659, + "num_tokens": 5904405743.0, + "step": 11551 + }, + { + "epoch": 3.1238507301243916, + "grad_norm": 0.9079066514968872, + "learning_rate": 7.868959870895212e-06, + "loss": 1.7636, + "mean_token_accuracy": 0.5765292644500732, + "num_tokens": 5904929971.0, + "step": 11552 + }, + { + "epoch": 3.1241211465657113, + "grad_norm": 0.9204006195068359, + "learning_rate": 7.867481906524847e-06, + "loss": 1.8291, + "mean_token_accuracy": 0.5691360235214233, + "num_tokens": 5905454257.0, + "step": 11553 + }, + { + "epoch": 3.124391563007031, + "grad_norm": 1.0293738842010498, + "learning_rate": 7.866004038269347e-06, + "loss": 1.9436, + "mean_token_accuracy": 0.5390048027038574, + "num_tokens": 5905978455.0, + "step": 11554 + }, + { + "epoch": 3.1246619794483506, + "grad_norm": 0.9570052623748779, + "learning_rate": 7.864526266174061e-06, + "loss": 1.7701, + "mean_token_accuracy": 0.5704426765441895, + "num_tokens": 5906502671.0, + "step": 11555 + }, + { + "epoch": 3.12493239588967, + "grad_norm": 0.89946448802948, + "learning_rate": 7.86304859028433e-06, + "loss": 1.8067, + "mean_token_accuracy": 0.5890578627586365, + "num_tokens": 5906931463.0, + "step": 11556 + }, + { + "epoch": 3.12520281233099, + "grad_norm": 0.9695659279823303, + "learning_rate": 7.861571010645492e-06, + "loss": 1.7277, + "mean_token_accuracy": 0.5836818218231201, + "num_tokens": 5907455738.0, + "step": 11557 + }, + { + "epoch": 3.1254732287723095, + "grad_norm": 1.0627654790878296, + "learning_rate": 7.860093527302884e-06, + "loss": 1.8207, + "mean_token_accuracy": 0.5727602243423462, + "num_tokens": 5907979969.0, + "step": 11558 + }, + { + "epoch": 3.125743645213629, + "grad_norm": 1.0984307527542114, + "learning_rate": 7.858616140301845e-06, + "loss": 1.8661, + "mean_token_accuracy": 0.5577847361564636, + "num_tokens": 5908504143.0, + "step": 11559 + }, + { + "epoch": 3.1260140616549488, + "grad_norm": 1.0884230136871338, + "learning_rate": 7.857138849687693e-06, + "loss": 1.7491, + "mean_token_accuracy": 0.5917695164680481, + "num_tokens": 5909028227.0, + "step": 11560 + }, + { + "epoch": 3.1262844780962684, + "grad_norm": 0.39874032139778137, + "learning_rate": 7.85566165550577e-06, + "loss": 1.1629, + "mean_token_accuracy": 0.6935127377510071, + "num_tokens": 5909552495.0, + "step": 11561 + }, + { + "epoch": 3.126554894537588, + "grad_norm": 1.3616931438446045, + "learning_rate": 7.854184557801397e-06, + "loss": 1.8741, + "mean_token_accuracy": 0.5622361898422241, + "num_tokens": 5910036700.0, + "step": 11562 + }, + { + "epoch": 3.1268253109789077, + "grad_norm": 1.1449034214019775, + "learning_rate": 7.852707556619886e-06, + "loss": 1.7917, + "mean_token_accuracy": 0.578802227973938, + "num_tokens": 5910560968.0, + "step": 11563 + }, + { + "epoch": 3.1270957274202273, + "grad_norm": 0.9946082234382629, + "learning_rate": 7.851230652006568e-06, + "loss": 1.8711, + "mean_token_accuracy": 0.5734061002731323, + "num_tokens": 5911085200.0, + "step": 11564 + }, + { + "epoch": 3.127366143861547, + "grad_norm": 1.172264575958252, + "learning_rate": 7.84975384400675e-06, + "loss": 1.8055, + "mean_token_accuracy": 0.5739775896072388, + "num_tokens": 5911598983.0, + "step": 11565 + }, + { + "epoch": 3.127636560302866, + "grad_norm": 1.2541842460632324, + "learning_rate": 7.848277132665747e-06, + "loss": 1.9161, + "mean_token_accuracy": 0.5454229116439819, + "num_tokens": 5912123261.0, + "step": 11566 + }, + { + "epoch": 3.1279069767441863, + "grad_norm": 1.023118495941162, + "learning_rate": 7.846800518028875e-06, + "loss": 1.9121, + "mean_token_accuracy": 0.5716922283172607, + "num_tokens": 5912647513.0, + "step": 11567 + }, + { + "epoch": 3.1281773931855055, + "grad_norm": 1.2015973329544067, + "learning_rate": 7.845324000141434e-06, + "loss": 1.8176, + "mean_token_accuracy": 0.5789301991462708, + "num_tokens": 5913171650.0, + "step": 11568 + }, + { + "epoch": 3.128447809626825, + "grad_norm": 1.122903823852539, + "learning_rate": 7.843847579048729e-06, + "loss": 1.8209, + "mean_token_accuracy": 0.5823377370834351, + "num_tokens": 5913639738.0, + "step": 11569 + }, + { + "epoch": 3.1287182260681448, + "grad_norm": 0.937499463558197, + "learning_rate": 7.842371254796064e-06, + "loss": 1.796, + "mean_token_accuracy": 0.575351357460022, + "num_tokens": 5914163858.0, + "step": 11570 + }, + { + "epoch": 3.1289886425094644, + "grad_norm": 1.017551064491272, + "learning_rate": 7.84089502742873e-06, + "loss": 1.7551, + "mean_token_accuracy": 0.5786263346672058, + "num_tokens": 5914688068.0, + "step": 11571 + }, + { + "epoch": 3.129259058950784, + "grad_norm": 1.4282807111740112, + "learning_rate": 7.839418896992032e-06, + "loss": 1.9378, + "mean_token_accuracy": 0.5430638790130615, + "num_tokens": 5915212253.0, + "step": 11572 + }, + { + "epoch": 3.1295294753921037, + "grad_norm": 1.0818719863891602, + "learning_rate": 7.837942863531257e-06, + "loss": 1.8834, + "mean_token_accuracy": 0.5766184329986572, + "num_tokens": 5915736428.0, + "step": 11573 + }, + { + "epoch": 3.1297998918334233, + "grad_norm": 0.9132322072982788, + "learning_rate": 7.836466927091688e-06, + "loss": 1.944, + "mean_token_accuracy": 0.5392158627510071, + "num_tokens": 5916260715.0, + "step": 11574 + }, + { + "epoch": 3.130070308274743, + "grad_norm": 1.0772805213928223, + "learning_rate": 7.834991087718623e-06, + "loss": 1.9519, + "mean_token_accuracy": 0.5534949898719788, + "num_tokens": 5916784978.0, + "step": 11575 + }, + { + "epoch": 3.1303407247160626, + "grad_norm": 0.9834199547767639, + "learning_rate": 7.833515345457339e-06, + "loss": 1.8283, + "mean_token_accuracy": 0.5829730033874512, + "num_tokens": 5917309119.0, + "step": 11576 + }, + { + "epoch": 3.1306111411573823, + "grad_norm": 1.0855637788772583, + "learning_rate": 7.832039700353111e-06, + "loss": 1.8868, + "mean_token_accuracy": 0.5827822685241699, + "num_tokens": 5917784412.0, + "step": 11577 + }, + { + "epoch": 3.130881557598702, + "grad_norm": 1.1235500574111938, + "learning_rate": 7.830564152451228e-06, + "loss": 1.8403, + "mean_token_accuracy": 0.5647115707397461, + "num_tokens": 5918308581.0, + "step": 11578 + }, + { + "epoch": 3.1311519740400215, + "grad_norm": 0.9413535594940186, + "learning_rate": 7.829088701796954e-06, + "loss": 1.5996, + "mean_token_accuracy": 0.6358362436294556, + "num_tokens": 5918832710.0, + "step": 11579 + }, + { + "epoch": 3.131422390481341, + "grad_norm": 1.104904055595398, + "learning_rate": 7.827613348435563e-06, + "loss": 1.8919, + "mean_token_accuracy": 0.5650032758712769, + "num_tokens": 5919356953.0, + "step": 11580 + }, + { + "epoch": 3.131692806922661, + "grad_norm": 0.37729087471961975, + "learning_rate": 7.826138092412325e-06, + "loss": 1.1093, + "mean_token_accuracy": 0.7105821371078491, + "num_tokens": 5919832825.0, + "step": 11581 + }, + { + "epoch": 3.1319632233639805, + "grad_norm": 1.5590258836746216, + "learning_rate": 7.824662933772507e-06, + "loss": 1.7969, + "mean_token_accuracy": 0.5784837007522583, + "num_tokens": 5920356913.0, + "step": 11582 + }, + { + "epoch": 3.1322336398053, + "grad_norm": 1.4053802490234375, + "learning_rate": 7.823187872561363e-06, + "loss": 1.8835, + "mean_token_accuracy": 0.5741593837738037, + "num_tokens": 5920881076.0, + "step": 11583 + }, + { + "epoch": 3.1325040562466198, + "grad_norm": 0.9704470038414001, + "learning_rate": 7.82171290882416e-06, + "loss": 1.6273, + "mean_token_accuracy": 0.6282389163970947, + "num_tokens": 5921405300.0, + "step": 11584 + }, + { + "epoch": 3.1327744726879394, + "grad_norm": 1.1547956466674805, + "learning_rate": 7.820238042606153e-06, + "loss": 1.7973, + "mean_token_accuracy": 0.5617077350616455, + "num_tokens": 5921929487.0, + "step": 11585 + }, + { + "epoch": 3.133044889129259, + "grad_norm": 1.170801043510437, + "learning_rate": 7.818763273952589e-06, + "loss": 1.8891, + "mean_token_accuracy": 0.5700297355651855, + "num_tokens": 5922431851.0, + "step": 11586 + }, + { + "epoch": 3.1333153055705787, + "grad_norm": 1.163275957107544, + "learning_rate": 7.817288602908729e-06, + "loss": 1.8613, + "mean_token_accuracy": 0.5614830255508423, + "num_tokens": 5922902291.0, + "step": 11587 + }, + { + "epoch": 3.1335857220118983, + "grad_norm": 1.1490850448608398, + "learning_rate": 7.815814029519812e-06, + "loss": 1.8075, + "mean_token_accuracy": 0.5827358961105347, + "num_tokens": 5923426449.0, + "step": 11588 + }, + { + "epoch": 3.133856138453218, + "grad_norm": 1.0167442560195923, + "learning_rate": 7.814339553831082e-06, + "loss": 1.841, + "mean_token_accuracy": 0.5693038105964661, + "num_tokens": 5923950679.0, + "step": 11589 + }, + { + "epoch": 3.1341265548945376, + "grad_norm": 1.0761789083480835, + "learning_rate": 7.812865175887788e-06, + "loss": 1.8346, + "mean_token_accuracy": 0.5798293948173523, + "num_tokens": 5924448426.0, + "step": 11590 + }, + { + "epoch": 3.1343969713358573, + "grad_norm": 1.1900047063827515, + "learning_rate": 7.811390895735162e-06, + "loss": 1.6296, + "mean_token_accuracy": 0.6105462312698364, + "num_tokens": 5924964846.0, + "step": 11591 + }, + { + "epoch": 3.134667387777177, + "grad_norm": 1.4069336652755737, + "learning_rate": 7.809916713418438e-06, + "loss": 1.863, + "mean_token_accuracy": 0.5504387617111206, + "num_tokens": 5925489032.0, + "step": 11592 + }, + { + "epoch": 3.1349378042184965, + "grad_norm": 1.1139014959335327, + "learning_rate": 7.808442628982851e-06, + "loss": 1.8196, + "mean_token_accuracy": 0.5746320486068726, + "num_tokens": 5926003621.0, + "step": 11593 + }, + { + "epoch": 3.135208220659816, + "grad_norm": 1.2469147443771362, + "learning_rate": 7.806968642473629e-06, + "loss": 1.9195, + "mean_token_accuracy": 0.5551051497459412, + "num_tokens": 5926483020.0, + "step": 11594 + }, + { + "epoch": 3.135478637101136, + "grad_norm": 1.1003901958465576, + "learning_rate": 7.805494753936e-06, + "loss": 1.8538, + "mean_token_accuracy": 0.5661462545394897, + "num_tokens": 5926989537.0, + "step": 11595 + }, + { + "epoch": 3.1357490535424555, + "grad_norm": 1.094791054725647, + "learning_rate": 7.804020963415191e-06, + "loss": 1.8905, + "mean_token_accuracy": 0.5574922561645508, + "num_tokens": 5927513770.0, + "step": 11596 + }, + { + "epoch": 3.136019469983775, + "grad_norm": 1.3274766206741333, + "learning_rate": 7.802547270956411e-06, + "loss": 1.7758, + "mean_token_accuracy": 0.5744588971138, + "num_tokens": 5928037900.0, + "step": 11597 + }, + { + "epoch": 3.1362898864250948, + "grad_norm": 1.1247799396514893, + "learning_rate": 7.801073676604889e-06, + "loss": 1.9394, + "mean_token_accuracy": 0.5283994674682617, + "num_tokens": 5928561968.0, + "step": 11598 + }, + { + "epoch": 3.1365603028664144, + "grad_norm": 1.4717377424240112, + "learning_rate": 7.799600180405833e-06, + "loss": 1.9638, + "mean_token_accuracy": 0.5723252892494202, + "num_tokens": 5929042531.0, + "step": 11599 + }, + { + "epoch": 3.136830719307734, + "grad_norm": 1.0888128280639648, + "learning_rate": 7.798126782404449e-06, + "loss": 1.9082, + "mean_token_accuracy": 0.5483459830284119, + "num_tokens": 5929566683.0, + "step": 11600 + }, + { + "epoch": 3.1371011357490537, + "grad_norm": 0.41720566153526306, + "learning_rate": 7.796653482645957e-06, + "loss": 1.006, + "mean_token_accuracy": 0.7340401411056519, + "num_tokens": 5930090886.0, + "step": 11601 + }, + { + "epoch": 3.1373715521903733, + "grad_norm": 1.516391634941101, + "learning_rate": 7.795180281175558e-06, + "loss": 1.8727, + "mean_token_accuracy": 0.5631990432739258, + "num_tokens": 5930565173.0, + "step": 11602 + }, + { + "epoch": 3.137641968631693, + "grad_norm": 1.29611074924469, + "learning_rate": 7.793707178038447e-06, + "loss": 1.8539, + "mean_token_accuracy": 0.5661337971687317, + "num_tokens": 5931089417.0, + "step": 11603 + }, + { + "epoch": 3.1379123850730126, + "grad_norm": 1.2923030853271484, + "learning_rate": 7.792234173279835e-06, + "loss": 1.8585, + "mean_token_accuracy": 0.5736112594604492, + "num_tokens": 5931556026.0, + "step": 11604 + }, + { + "epoch": 3.1381828015143323, + "grad_norm": 1.0396335124969482, + "learning_rate": 7.79076126694491e-06, + "loss": 1.9068, + "mean_token_accuracy": 0.5537958145141602, + "num_tokens": 5932080189.0, + "step": 11605 + }, + { + "epoch": 3.138453217955652, + "grad_norm": 1.1719835996627808, + "learning_rate": 7.789288459078865e-06, + "loss": 1.9211, + "mean_token_accuracy": 0.5625104308128357, + "num_tokens": 5932604330.0, + "step": 11606 + }, + { + "epoch": 3.138723634396971, + "grad_norm": 1.0270947217941284, + "learning_rate": 7.787815749726895e-06, + "loss": 1.8147, + "mean_token_accuracy": 0.5829821825027466, + "num_tokens": 5933128580.0, + "step": 11607 + }, + { + "epoch": 3.138994050838291, + "grad_norm": 0.9310479164123535, + "learning_rate": 7.786343138934182e-06, + "loss": 1.8004, + "mean_token_accuracy": 0.5701943039894104, + "num_tokens": 5933652864.0, + "step": 11608 + }, + { + "epoch": 3.1392644672796104, + "grad_norm": 1.2865898609161377, + "learning_rate": 7.784870626745912e-06, + "loss": 1.8229, + "mean_token_accuracy": 0.5818867683410645, + "num_tokens": 5934176970.0, + "step": 11609 + }, + { + "epoch": 3.13953488372093, + "grad_norm": 1.0587577819824219, + "learning_rate": 7.78339821320727e-06, + "loss": 2.0172, + "mean_token_accuracy": 0.535212516784668, + "num_tokens": 5934701002.0, + "step": 11610 + }, + { + "epoch": 3.1398053001622497, + "grad_norm": 0.9628158211708069, + "learning_rate": 7.781925898363428e-06, + "loss": 1.8385, + "mean_token_accuracy": 0.5837223529815674, + "num_tokens": 5935225270.0, + "step": 11611 + }, + { + "epoch": 3.1400757166035693, + "grad_norm": 1.0635846853256226, + "learning_rate": 7.780453682259561e-06, + "loss": 1.7944, + "mean_token_accuracy": 0.5769689083099365, + "num_tokens": 5935749519.0, + "step": 11612 + }, + { + "epoch": 3.140346133044889, + "grad_norm": 0.9173077940940857, + "learning_rate": 7.778981564940846e-06, + "loss": 1.7894, + "mean_token_accuracy": 0.5771729946136475, + "num_tokens": 5936255699.0, + "step": 11613 + }, + { + "epoch": 3.1406165494862086, + "grad_norm": 1.1689976453781128, + "learning_rate": 7.777509546452446e-06, + "loss": 1.8748, + "mean_token_accuracy": 0.5700523853302002, + "num_tokens": 5936779957.0, + "step": 11614 + }, + { + "epoch": 3.1408869659275283, + "grad_norm": 1.0967463254928589, + "learning_rate": 7.776037626839532e-06, + "loss": 1.7379, + "mean_token_accuracy": 0.5793418288230896, + "num_tokens": 5937304155.0, + "step": 11615 + }, + { + "epoch": 3.141157382368848, + "grad_norm": 1.1515411138534546, + "learning_rate": 7.774565806147266e-06, + "loss": 1.8094, + "mean_token_accuracy": 0.580077052116394, + "num_tokens": 5937828370.0, + "step": 11616 + }, + { + "epoch": 3.1414277988101675, + "grad_norm": 0.9574901461601257, + "learning_rate": 7.773094084420803e-06, + "loss": 1.8045, + "mean_token_accuracy": 0.5833314657211304, + "num_tokens": 5938352619.0, + "step": 11617 + }, + { + "epoch": 3.141698215251487, + "grad_norm": 0.9552162289619446, + "learning_rate": 7.771622461705303e-06, + "loss": 1.7856, + "mean_token_accuracy": 0.5754746198654175, + "num_tokens": 5938816311.0, + "step": 11618 + }, + { + "epoch": 3.141968631692807, + "grad_norm": 1.2033989429473877, + "learning_rate": 7.770150938045926e-06, + "loss": 1.8097, + "mean_token_accuracy": 0.5792274475097656, + "num_tokens": 5939340432.0, + "step": 11619 + }, + { + "epoch": 3.1422390481341265, + "grad_norm": 1.4116483926773071, + "learning_rate": 7.768679513487812e-06, + "loss": 1.879, + "mean_token_accuracy": 0.5785501003265381, + "num_tokens": 5939864683.0, + "step": 11620 + }, + { + "epoch": 3.142509464575446, + "grad_norm": 0.39709651470184326, + "learning_rate": 7.767208188076111e-06, + "loss": 0.999, + "mean_token_accuracy": 0.7201277017593384, + "num_tokens": 5940388919.0, + "step": 11621 + }, + { + "epoch": 3.1427798810167658, + "grad_norm": 1.2667816877365112, + "learning_rate": 7.765736961855974e-06, + "loss": 1.8252, + "mean_token_accuracy": 0.5472087860107422, + "num_tokens": 5940913196.0, + "step": 11622 + }, + { + "epoch": 3.1430502974580854, + "grad_norm": 1.18528413772583, + "learning_rate": 7.764265834872534e-06, + "loss": 1.8488, + "mean_token_accuracy": 0.5781642198562622, + "num_tokens": 5941437450.0, + "step": 11623 + }, + { + "epoch": 3.143320713899405, + "grad_norm": 1.1882877349853516, + "learning_rate": 7.76279480717094e-06, + "loss": 1.8467, + "mean_token_accuracy": 0.5788674354553223, + "num_tokens": 5941955945.0, + "step": 11624 + }, + { + "epoch": 3.1435911303407247, + "grad_norm": 1.0631009340286255, + "learning_rate": 7.761323878796318e-06, + "loss": 1.7993, + "mean_token_accuracy": 0.597298800945282, + "num_tokens": 5942462636.0, + "step": 11625 + }, + { + "epoch": 3.1438615467820443, + "grad_norm": 1.1364378929138184, + "learning_rate": 7.759853049793802e-06, + "loss": 1.8771, + "mean_token_accuracy": 0.5729061365127563, + "num_tokens": 5942986913.0, + "step": 11626 + }, + { + "epoch": 3.144131963223364, + "grad_norm": 1.216671347618103, + "learning_rate": 7.758382320208526e-06, + "loss": 1.8634, + "mean_token_accuracy": 0.560641884803772, + "num_tokens": 5943511196.0, + "step": 11627 + }, + { + "epoch": 3.1444023796646836, + "grad_norm": 1.1939005851745605, + "learning_rate": 7.756911690085613e-06, + "loss": 1.8392, + "mean_token_accuracy": 0.5846465826034546, + "num_tokens": 5943985074.0, + "step": 11628 + }, + { + "epoch": 3.1446727961060033, + "grad_norm": 1.0831186771392822, + "learning_rate": 7.755441159470186e-06, + "loss": 1.8766, + "mean_token_accuracy": 0.5661659836769104, + "num_tokens": 5944509252.0, + "step": 11629 + }, + { + "epoch": 3.144943212547323, + "grad_norm": 1.1277827024459839, + "learning_rate": 7.753970728407366e-06, + "loss": 1.6808, + "mean_token_accuracy": 0.5981605052947998, + "num_tokens": 5945033522.0, + "step": 11630 + }, + { + "epoch": 3.1452136289886425, + "grad_norm": 1.0877212285995483, + "learning_rate": 7.75250039694227e-06, + "loss": 1.9099, + "mean_token_accuracy": 0.569465160369873, + "num_tokens": 5945557691.0, + "step": 11631 + }, + { + "epoch": 3.145484045429962, + "grad_norm": 1.1830509901046753, + "learning_rate": 7.751030165120011e-06, + "loss": 1.8027, + "mean_token_accuracy": 0.5836446285247803, + "num_tokens": 5946029244.0, + "step": 11632 + }, + { + "epoch": 3.145754461871282, + "grad_norm": 1.2099753618240356, + "learning_rate": 7.749560032985704e-06, + "loss": 1.7901, + "mean_token_accuracy": 0.5834892988204956, + "num_tokens": 5946500228.0, + "step": 11633 + }, + { + "epoch": 3.1460248783126015, + "grad_norm": 1.079940915107727, + "learning_rate": 7.748090000584452e-06, + "loss": 1.9513, + "mean_token_accuracy": 0.5639750957489014, + "num_tokens": 5947024166.0, + "step": 11634 + }, + { + "epoch": 3.146295294753921, + "grad_norm": 1.0158488750457764, + "learning_rate": 7.746620067961361e-06, + "loss": 1.8402, + "mean_token_accuracy": 0.57323157787323, + "num_tokens": 5947548380.0, + "step": 11635 + }, + { + "epoch": 3.1465657111952408, + "grad_norm": 1.2512489557266235, + "learning_rate": 7.745150235161533e-06, + "loss": 1.8872, + "mean_token_accuracy": 0.5738312005996704, + "num_tokens": 5948072586.0, + "step": 11636 + }, + { + "epoch": 3.1468361276365604, + "grad_norm": 1.0970311164855957, + "learning_rate": 7.743680502230073e-06, + "loss": 1.8524, + "mean_token_accuracy": 0.5674132108688354, + "num_tokens": 5948568439.0, + "step": 11637 + }, + { + "epoch": 3.14710654407788, + "grad_norm": 1.0589447021484375, + "learning_rate": 7.742210869212065e-06, + "loss": 1.8809, + "mean_token_accuracy": 0.5534669160842896, + "num_tokens": 5949092585.0, + "step": 11638 + }, + { + "epoch": 3.1473769605191997, + "grad_norm": 1.162172555923462, + "learning_rate": 7.740741336152614e-06, + "loss": 1.8902, + "mean_token_accuracy": 0.5836098790168762, + "num_tokens": 5949611140.0, + "step": 11639 + }, + { + "epoch": 3.1476473769605193, + "grad_norm": 0.9696404933929443, + "learning_rate": 7.739271903096799e-06, + "loss": 1.8529, + "mean_token_accuracy": 0.5742872357368469, + "num_tokens": 5950122709.0, + "step": 11640 + }, + { + "epoch": 3.147917793401839, + "grad_norm": 0.4498618245124817, + "learning_rate": 7.737802570089714e-06, + "loss": 1.1035, + "mean_token_accuracy": 0.6801144480705261, + "num_tokens": 5950646933.0, + "step": 11641 + }, + { + "epoch": 3.1481882098431586, + "grad_norm": 1.520943522453308, + "learning_rate": 7.736333337176441e-06, + "loss": 1.8177, + "mean_token_accuracy": 0.5814624428749084, + "num_tokens": 5951171167.0, + "step": 11642 + }, + { + "epoch": 3.1484586262844783, + "grad_norm": 1.27323317527771, + "learning_rate": 7.734864204402053e-06, + "loss": 1.7632, + "mean_token_accuracy": 0.5884738564491272, + "num_tokens": 5951663142.0, + "step": 11643 + }, + { + "epoch": 3.148729042725798, + "grad_norm": 1.1248619556427002, + "learning_rate": 7.733395171811637e-06, + "loss": 1.7255, + "mean_token_accuracy": 0.5768966674804688, + "num_tokens": 5952187248.0, + "step": 11644 + }, + { + "epoch": 3.1489994591671175, + "grad_norm": 1.148779273033142, + "learning_rate": 7.731926239450265e-06, + "loss": 1.9708, + "mean_token_accuracy": 0.5577188730239868, + "num_tokens": 5952711522.0, + "step": 11645 + }, + { + "epoch": 3.149269875608437, + "grad_norm": 1.1738190650939941, + "learning_rate": 7.730457407363004e-06, + "loss": 1.7445, + "mean_token_accuracy": 0.5944949388504028, + "num_tokens": 5953235796.0, + "step": 11646 + }, + { + "epoch": 3.149540292049757, + "grad_norm": 1.167465090751648, + "learning_rate": 7.728988675594926e-06, + "loss": 1.7661, + "mean_token_accuracy": 0.5736933946609497, + "num_tokens": 5953759998.0, + "step": 11647 + }, + { + "epoch": 3.149810708491076, + "grad_norm": 0.9684596657752991, + "learning_rate": 7.727520044191096e-06, + "loss": 1.7439, + "mean_token_accuracy": 0.5911574363708496, + "num_tokens": 5954284093.0, + "step": 11648 + }, + { + "epoch": 3.150081124932396, + "grad_norm": 1.0910662412643433, + "learning_rate": 7.726051513196573e-06, + "loss": 1.846, + "mean_token_accuracy": 0.5596914887428284, + "num_tokens": 5954808214.0, + "step": 11649 + }, + { + "epoch": 3.1503515413737153, + "grad_norm": 1.3865118026733398, + "learning_rate": 7.724583082656418e-06, + "loss": 1.9203, + "mean_token_accuracy": 0.5563381910324097, + "num_tokens": 5955332469.0, + "step": 11650 + }, + { + "epoch": 3.150621957815035, + "grad_norm": 1.3154234886169434, + "learning_rate": 7.72311475261569e-06, + "loss": 1.8041, + "mean_token_accuracy": 0.5746248364448547, + "num_tokens": 5955798291.0, + "step": 11651 + }, + { + "epoch": 3.1508923742563546, + "grad_norm": 1.133988618850708, + "learning_rate": 7.72164652311943e-06, + "loss": 1.9406, + "mean_token_accuracy": 0.5299876928329468, + "num_tokens": 5956322456.0, + "step": 11652 + }, + { + "epoch": 3.1511627906976742, + "grad_norm": 1.0556845664978027, + "learning_rate": 7.720178394212702e-06, + "loss": 1.7869, + "mean_token_accuracy": 0.57232266664505, + "num_tokens": 5956846562.0, + "step": 11653 + }, + { + "epoch": 3.151433207138994, + "grad_norm": 1.4494074583053589, + "learning_rate": 7.71871036594055e-06, + "loss": 1.7975, + "mean_token_accuracy": 0.5948778390884399, + "num_tokens": 5957370525.0, + "step": 11654 + }, + { + "epoch": 3.1517036235803135, + "grad_norm": 0.966592013835907, + "learning_rate": 7.717242438348005e-06, + "loss": 1.8303, + "mean_token_accuracy": 0.5902293920516968, + "num_tokens": 5957858332.0, + "step": 11655 + }, + { + "epoch": 3.151974040021633, + "grad_norm": 1.144305944442749, + "learning_rate": 7.71577461148012e-06, + "loss": 1.7693, + "mean_token_accuracy": 0.6152334213256836, + "num_tokens": 5958313620.0, + "step": 11656 + }, + { + "epoch": 3.152244456462953, + "grad_norm": 1.2031807899475098, + "learning_rate": 7.714306885381928e-06, + "loss": 1.919, + "mean_token_accuracy": 0.5555813908576965, + "num_tokens": 5958837848.0, + "step": 11657 + }, + { + "epoch": 3.1525148729042725, + "grad_norm": 1.1071816682815552, + "learning_rate": 7.712839260098461e-06, + "loss": 1.8906, + "mean_token_accuracy": 0.5651355981826782, + "num_tokens": 5959310501.0, + "step": 11658 + }, + { + "epoch": 3.152785289345592, + "grad_norm": 0.9305201768875122, + "learning_rate": 7.711371735674755e-06, + "loss": 1.793, + "mean_token_accuracy": 0.5913939476013184, + "num_tokens": 5959770855.0, + "step": 11659 + }, + { + "epoch": 3.1530557057869117, + "grad_norm": 1.1951870918273926, + "learning_rate": 7.709904312155835e-06, + "loss": 1.9369, + "mean_token_accuracy": 0.5540257692337036, + "num_tokens": 5960274527.0, + "step": 11660 + }, + { + "epoch": 3.1533261222282314, + "grad_norm": 0.4540731906890869, + "learning_rate": 7.708436989586724e-06, + "loss": 1.2411, + "mean_token_accuracy": 0.670674741268158, + "num_tokens": 5960798679.0, + "step": 11661 + }, + { + "epoch": 3.153596538669551, + "grad_norm": 1.433591365814209, + "learning_rate": 7.706969768012452e-06, + "loss": 1.9069, + "mean_token_accuracy": 0.5731699466705322, + "num_tokens": 5961222500.0, + "step": 11662 + }, + { + "epoch": 3.1538669551108707, + "grad_norm": 1.3303865194320679, + "learning_rate": 7.705502647478024e-06, + "loss": 1.7847, + "mean_token_accuracy": 0.5836242437362671, + "num_tokens": 5961702934.0, + "step": 11663 + }, + { + "epoch": 3.1541373715521903, + "grad_norm": 1.0106838941574097, + "learning_rate": 7.70403562802847e-06, + "loss": 1.7897, + "mean_token_accuracy": 0.587804913520813, + "num_tokens": 5962227154.0, + "step": 11664 + }, + { + "epoch": 3.15440778799351, + "grad_norm": 1.126721978187561, + "learning_rate": 7.702568709708792e-06, + "loss": 1.8063, + "mean_token_accuracy": 0.5849186182022095, + "num_tokens": 5962751296.0, + "step": 11665 + }, + { + "epoch": 3.1546782044348296, + "grad_norm": 1.068302035331726, + "learning_rate": 7.701101892564003e-06, + "loss": 1.8056, + "mean_token_accuracy": 0.577545166015625, + "num_tokens": 5963275448.0, + "step": 11666 + }, + { + "epoch": 3.1549486208761492, + "grad_norm": 1.2628202438354492, + "learning_rate": 7.699635176639108e-06, + "loss": 1.9563, + "mean_token_accuracy": 0.5718846321105957, + "num_tokens": 5963791722.0, + "step": 11667 + }, + { + "epoch": 3.155219037317469, + "grad_norm": 1.261215090751648, + "learning_rate": 7.698168561979118e-06, + "loss": 1.8526, + "mean_token_accuracy": 0.57869553565979, + "num_tokens": 5964315936.0, + "step": 11668 + }, + { + "epoch": 3.1554894537587885, + "grad_norm": 1.0222951173782349, + "learning_rate": 7.69670204862902e-06, + "loss": 1.874, + "mean_token_accuracy": 0.5801268815994263, + "num_tokens": 5964840174.0, + "step": 11669 + }, + { + "epoch": 3.155759870200108, + "grad_norm": 1.0138421058654785, + "learning_rate": 7.695235636633822e-06, + "loss": 1.8258, + "mean_token_accuracy": 0.5878517031669617, + "num_tokens": 5965364413.0, + "step": 11670 + }, + { + "epoch": 3.156030286641428, + "grad_norm": 1.183007836341858, + "learning_rate": 7.69376932603851e-06, + "loss": 1.8333, + "mean_token_accuracy": 0.57666015625, + "num_tokens": 5965888671.0, + "step": 11671 + }, + { + "epoch": 3.1563007030827475, + "grad_norm": 0.9711063504219055, + "learning_rate": 7.692303116888078e-06, + "loss": 1.8693, + "mean_token_accuracy": 0.5600080490112305, + "num_tokens": 5966412821.0, + "step": 11672 + }, + { + "epoch": 3.156571119524067, + "grad_norm": 1.054707407951355, + "learning_rate": 7.690837009227515e-06, + "loss": 1.946, + "mean_token_accuracy": 0.5553158521652222, + "num_tokens": 5966937003.0, + "step": 11673 + }, + { + "epoch": 3.1568415359653867, + "grad_norm": 1.1300419569015503, + "learning_rate": 7.689371003101806e-06, + "loss": 1.91, + "mean_token_accuracy": 0.5602147579193115, + "num_tokens": 5967461283.0, + "step": 11674 + }, + { + "epoch": 3.1571119524067064, + "grad_norm": 0.9387942552566528, + "learning_rate": 7.687905098555927e-06, + "loss": 1.8302, + "mean_token_accuracy": 0.5817953944206238, + "num_tokens": 5967964621.0, + "step": 11675 + }, + { + "epoch": 3.157382368848026, + "grad_norm": 0.9463624954223633, + "learning_rate": 7.686439295634863e-06, + "loss": 1.8781, + "mean_token_accuracy": 0.5642526149749756, + "num_tokens": 5968488861.0, + "step": 11676 + }, + { + "epoch": 3.1576527852893457, + "grad_norm": 0.9935664534568787, + "learning_rate": 7.684973594383585e-06, + "loss": 1.8369, + "mean_token_accuracy": 0.5688726305961609, + "num_tokens": 5969013020.0, + "step": 11677 + }, + { + "epoch": 3.1579232017306653, + "grad_norm": 1.0167405605316162, + "learning_rate": 7.683507994847064e-06, + "loss": 1.9637, + "mean_token_accuracy": 0.5601285696029663, + "num_tokens": 5969537249.0, + "step": 11678 + }, + { + "epoch": 3.158193618171985, + "grad_norm": 1.1633355617523193, + "learning_rate": 7.682042497070272e-06, + "loss": 1.7428, + "mean_token_accuracy": 0.6083845496177673, + "num_tokens": 5970022311.0, + "step": 11679 + }, + { + "epoch": 3.1584640346133046, + "grad_norm": 1.1934188604354858, + "learning_rate": 7.680577101098174e-06, + "loss": 1.7552, + "mean_token_accuracy": 0.5808994174003601, + "num_tokens": 5970546446.0, + "step": 11680 + }, + { + "epoch": 3.1587344510546242, + "grad_norm": 0.49971553683280945, + "learning_rate": 7.67911180697573e-06, + "loss": 1.1833, + "mean_token_accuracy": 0.6856050491333008, + "num_tokens": 5971070715.0, + "step": 11681 + }, + { + "epoch": 3.159004867495944, + "grad_norm": 1.0579187870025635, + "learning_rate": 7.677646614747904e-06, + "loss": 1.835, + "mean_token_accuracy": 0.5810595154762268, + "num_tokens": 5971594995.0, + "step": 11682 + }, + { + "epoch": 3.1592752839372635, + "grad_norm": 1.069983720779419, + "learning_rate": 7.676181524459654e-06, + "loss": 1.8613, + "mean_token_accuracy": 0.5779929757118225, + "num_tokens": 5972119251.0, + "step": 11683 + }, + { + "epoch": 3.159545700378583, + "grad_norm": 0.9293950796127319, + "learning_rate": 7.674716536155925e-06, + "loss": 1.8932, + "mean_token_accuracy": 0.5626360177993774, + "num_tokens": 5972643502.0, + "step": 11684 + }, + { + "epoch": 3.159816116819903, + "grad_norm": 1.1251388788223267, + "learning_rate": 7.67325164988167e-06, + "loss": 1.7984, + "mean_token_accuracy": 0.5975708961486816, + "num_tokens": 5973167669.0, + "step": 11685 + }, + { + "epoch": 3.1600865332612225, + "grad_norm": 1.0780670642852783, + "learning_rate": 7.671786865681841e-06, + "loss": 1.9269, + "mean_token_accuracy": 0.5515897274017334, + "num_tokens": 5973691913.0, + "step": 11686 + }, + { + "epoch": 3.160356949702542, + "grad_norm": 0.9489275217056274, + "learning_rate": 7.670322183601378e-06, + "loss": 1.8709, + "mean_token_accuracy": 0.5850780010223389, + "num_tokens": 5974216115.0, + "step": 11687 + }, + { + "epoch": 3.1606273661438617, + "grad_norm": 1.1720950603485107, + "learning_rate": 7.668857603685224e-06, + "loss": 1.7968, + "mean_token_accuracy": 0.5728114247322083, + "num_tokens": 5974706371.0, + "step": 11688 + }, + { + "epoch": 3.160897782585181, + "grad_norm": 1.1230390071868896, + "learning_rate": 7.66739312597831e-06, + "loss": 1.8283, + "mean_token_accuracy": 0.5555828809738159, + "num_tokens": 5975230604.0, + "step": 11689 + }, + { + "epoch": 3.161168199026501, + "grad_norm": 0.9791556596755981, + "learning_rate": 7.66592875052558e-06, + "loss": 1.9237, + "mean_token_accuracy": 0.5360544919967651, + "num_tokens": 5975754831.0, + "step": 11690 + }, + { + "epoch": 3.1614386154678202, + "grad_norm": 1.0171505212783813, + "learning_rate": 7.66446447737196e-06, + "loss": 1.8345, + "mean_token_accuracy": 0.5628821849822998, + "num_tokens": 5976279075.0, + "step": 11691 + }, + { + "epoch": 3.16170903190914, + "grad_norm": 1.1742814779281616, + "learning_rate": 7.663000306562378e-06, + "loss": 1.7672, + "mean_token_accuracy": 0.5720136165618896, + "num_tokens": 5976803332.0, + "step": 11692 + }, + { + "epoch": 3.1619794483504595, + "grad_norm": 1.0402655601501465, + "learning_rate": 7.661536238141761e-06, + "loss": 1.8505, + "mean_token_accuracy": 0.5734419822692871, + "num_tokens": 5977327431.0, + "step": 11693 + }, + { + "epoch": 3.162249864791779, + "grad_norm": 1.2128585577011108, + "learning_rate": 7.66007227215503e-06, + "loss": 1.9088, + "mean_token_accuracy": 0.5802233219146729, + "num_tokens": 5977814358.0, + "step": 11694 + }, + { + "epoch": 3.162520281233099, + "grad_norm": 1.0132031440734863, + "learning_rate": 7.658608408647102e-06, + "loss": 1.779, + "mean_token_accuracy": 0.5922832489013672, + "num_tokens": 5978338558.0, + "step": 11695 + }, + { + "epoch": 3.1627906976744184, + "grad_norm": 0.9754440188407898, + "learning_rate": 7.657144647662895e-06, + "loss": 1.7833, + "mean_token_accuracy": 0.5750866532325745, + "num_tokens": 5978862678.0, + "step": 11696 + }, + { + "epoch": 3.163061114115738, + "grad_norm": 0.9084758162498474, + "learning_rate": 7.655680989247325e-06, + "loss": 1.8322, + "mean_token_accuracy": 0.5830098986625671, + "num_tokens": 5979386927.0, + "step": 11697 + }, + { + "epoch": 3.1633315305570577, + "grad_norm": 0.996290385723114, + "learning_rate": 7.654217433445292e-06, + "loss": 1.9332, + "mean_token_accuracy": 0.5620225667953491, + "num_tokens": 5979911122.0, + "step": 11698 + }, + { + "epoch": 3.1636019469983774, + "grad_norm": 1.0848281383514404, + "learning_rate": 7.65275398030171e-06, + "loss": 1.8294, + "mean_token_accuracy": 0.5722026228904724, + "num_tokens": 5980435282.0, + "step": 11699 + }, + { + "epoch": 3.163872363439697, + "grad_norm": 0.9493978023529053, + "learning_rate": 7.651290629861479e-06, + "loss": 1.9423, + "mean_token_accuracy": 0.5608692169189453, + "num_tokens": 5980959563.0, + "step": 11700 + }, + { + "epoch": 3.1641427798810167, + "grad_norm": 0.43603047728538513, + "learning_rate": 7.649827382169497e-06, + "loss": 1.1331, + "mean_token_accuracy": 0.6827681660652161, + "num_tokens": 5981483569.0, + "step": 11701 + }, + { + "epoch": 3.1644131963223363, + "grad_norm": 1.4075350761413574, + "learning_rate": 7.648364237270667e-06, + "loss": 1.8362, + "mean_token_accuracy": 0.5683451890945435, + "num_tokens": 5982007799.0, + "step": 11702 + }, + { + "epoch": 3.164683612763656, + "grad_norm": 1.4501316547393799, + "learning_rate": 7.646901195209877e-06, + "loss": 1.9, + "mean_token_accuracy": 0.5531415939331055, + "num_tokens": 5982471146.0, + "step": 11703 + }, + { + "epoch": 3.1649540292049756, + "grad_norm": 1.0788649320602417, + "learning_rate": 7.645438256032019e-06, + "loss": 1.9108, + "mean_token_accuracy": 0.5612626075744629, + "num_tokens": 5982995411.0, + "step": 11704 + }, + { + "epoch": 3.1652244456462952, + "grad_norm": 1.0952929258346558, + "learning_rate": 7.643975419781982e-06, + "loss": 1.8817, + "mean_token_accuracy": 0.5666521787643433, + "num_tokens": 5983519629.0, + "step": 11705 + }, + { + "epoch": 3.165494862087615, + "grad_norm": 1.066712498664856, + "learning_rate": 7.642512686504647e-06, + "loss": 1.766, + "mean_token_accuracy": 0.6018780469894409, + "num_tokens": 5984043771.0, + "step": 11706 + }, + { + "epoch": 3.1657652785289345, + "grad_norm": 1.0552711486816406, + "learning_rate": 7.641050056244897e-06, + "loss": 1.8414, + "mean_token_accuracy": 0.5760711431503296, + "num_tokens": 5984568044.0, + "step": 11707 + }, + { + "epoch": 3.166035694970254, + "grad_norm": 1.050492763519287, + "learning_rate": 7.639587529047607e-06, + "loss": 1.7286, + "mean_token_accuracy": 0.5999414324760437, + "num_tokens": 5985039633.0, + "step": 11708 + }, + { + "epoch": 3.166306111411574, + "grad_norm": 1.2058335542678833, + "learning_rate": 7.638125104957654e-06, + "loss": 1.8966, + "mean_token_accuracy": 0.5846564173698425, + "num_tokens": 5985505644.0, + "step": 11709 + }, + { + "epoch": 3.1665765278528935, + "grad_norm": 0.996458888053894, + "learning_rate": 7.636662784019912e-06, + "loss": 1.8647, + "mean_token_accuracy": 0.5731279850006104, + "num_tokens": 5986029807.0, + "step": 11710 + }, + { + "epoch": 3.166846944294213, + "grad_norm": 1.0525013208389282, + "learning_rate": 7.635200566279248e-06, + "loss": 1.8669, + "mean_token_accuracy": 0.578251302242279, + "num_tokens": 5986553957.0, + "step": 11711 + }, + { + "epoch": 3.1671173607355327, + "grad_norm": 1.1348912715911865, + "learning_rate": 7.633738451780521e-06, + "loss": 1.7383, + "mean_token_accuracy": 0.6020956039428711, + "num_tokens": 5987078235.0, + "step": 11712 + }, + { + "epoch": 3.1673877771768524, + "grad_norm": 1.0401520729064941, + "learning_rate": 7.632276440568603e-06, + "loss": 1.8051, + "mean_token_accuracy": 0.5730254054069519, + "num_tokens": 5987602413.0, + "step": 11713 + }, + { + "epoch": 3.167658193618172, + "grad_norm": 1.0587472915649414, + "learning_rate": 7.630814532688346e-06, + "loss": 1.8264, + "mean_token_accuracy": 0.5857755541801453, + "num_tokens": 5988126569.0, + "step": 11714 + }, + { + "epoch": 3.1679286100594917, + "grad_norm": 1.0840253829956055, + "learning_rate": 7.629352728184605e-06, + "loss": 1.8526, + "mean_token_accuracy": 0.5602430105209351, + "num_tokens": 5988650803.0, + "step": 11715 + }, + { + "epoch": 3.1681990265008113, + "grad_norm": 1.183016300201416, + "learning_rate": 7.627891027102237e-06, + "loss": 1.9386, + "mean_token_accuracy": 0.5461102724075317, + "num_tokens": 5989174991.0, + "step": 11716 + }, + { + "epoch": 3.168469442942131, + "grad_norm": 1.1220853328704834, + "learning_rate": 7.6264294294860905e-06, + "loss": 1.8378, + "mean_token_accuracy": 0.569071888923645, + "num_tokens": 5989699253.0, + "step": 11717 + }, + { + "epoch": 3.1687398593834506, + "grad_norm": 1.1548113822937012, + "learning_rate": 7.624967935381005e-06, + "loss": 1.959, + "mean_token_accuracy": 0.5484086275100708, + "num_tokens": 5990223414.0, + "step": 11718 + }, + { + "epoch": 3.1690102758247702, + "grad_norm": 1.1037124395370483, + "learning_rate": 7.623506544831834e-06, + "loss": 1.9449, + "mean_token_accuracy": 0.5650456547737122, + "num_tokens": 5990747674.0, + "step": 11719 + }, + { + "epoch": 3.16928069226609, + "grad_norm": 1.1167830228805542, + "learning_rate": 7.622045257883408e-06, + "loss": 1.9287, + "mean_token_accuracy": 0.5578152537345886, + "num_tokens": 5991271720.0, + "step": 11720 + }, + { + "epoch": 3.1695511087074095, + "grad_norm": 0.45597031712532043, + "learning_rate": 7.620584074580568e-06, + "loss": 1.0655, + "mean_token_accuracy": 0.7155027985572815, + "num_tokens": 5991781944.0, + "step": 11721 + }, + { + "epoch": 3.169821525148729, + "grad_norm": 1.632584571838379, + "learning_rate": 7.619122994968149e-06, + "loss": 1.9071, + "mean_token_accuracy": 0.5660412907600403, + "num_tokens": 5992306170.0, + "step": 11722 + }, + { + "epoch": 3.170091941590049, + "grad_norm": 1.6749696731567383, + "learning_rate": 7.617662019090979e-06, + "loss": 1.844, + "mean_token_accuracy": 0.5835385918617249, + "num_tokens": 5992779925.0, + "step": 11723 + }, + { + "epoch": 3.1703623580313685, + "grad_norm": 1.0090898275375366, + "learning_rate": 7.61620114699388e-06, + "loss": 1.8064, + "mean_token_accuracy": 0.5858026742935181, + "num_tokens": 5993304179.0, + "step": 11724 + }, + { + "epoch": 3.170632774472688, + "grad_norm": 1.1069639921188354, + "learning_rate": 7.614740378721688e-06, + "loss": 1.9611, + "mean_token_accuracy": 0.5563446283340454, + "num_tokens": 5993828389.0, + "step": 11725 + }, + { + "epoch": 3.1709031909140077, + "grad_norm": 1.1382249593734741, + "learning_rate": 7.613279714319215e-06, + "loss": 1.8738, + "mean_token_accuracy": 0.5688191056251526, + "num_tokens": 5994352628.0, + "step": 11726 + }, + { + "epoch": 3.1711736073553274, + "grad_norm": 1.3267312049865723, + "learning_rate": 7.611819153831275e-06, + "loss": 1.8143, + "mean_token_accuracy": 0.5685994029045105, + "num_tokens": 5994876897.0, + "step": 11727 + }, + { + "epoch": 3.171444023796647, + "grad_norm": 1.2544348239898682, + "learning_rate": 7.610358697302691e-06, + "loss": 1.9159, + "mean_token_accuracy": 0.5600370168685913, + "num_tokens": 5995387552.0, + "step": 11728 + }, + { + "epoch": 3.1717144402379667, + "grad_norm": 1.064466118812561, + "learning_rate": 7.608898344778271e-06, + "loss": 1.8153, + "mean_token_accuracy": 0.5822006464004517, + "num_tokens": 5995911739.0, + "step": 11729 + }, + { + "epoch": 3.171984856679286, + "grad_norm": 1.0987614393234253, + "learning_rate": 7.60743809630282e-06, + "loss": 1.937, + "mean_token_accuracy": 0.5495486259460449, + "num_tokens": 5996436002.0, + "step": 11730 + }, + { + "epoch": 3.172255273120606, + "grad_norm": 0.9822379946708679, + "learning_rate": 7.605977951921148e-06, + "loss": 1.7645, + "mean_token_accuracy": 0.589049220085144, + "num_tokens": 5996960085.0, + "step": 11731 + }, + { + "epoch": 3.172525689561925, + "grad_norm": 1.2022120952606201, + "learning_rate": 7.604517911678049e-06, + "loss": 1.9513, + "mean_token_accuracy": 0.5569309592247009, + "num_tokens": 5997484356.0, + "step": 11732 + }, + { + "epoch": 3.172796106003245, + "grad_norm": 1.1575922966003418, + "learning_rate": 7.603057975618331e-06, + "loss": 1.9566, + "mean_token_accuracy": 0.552613377571106, + "num_tokens": 5998008585.0, + "step": 11733 + }, + { + "epoch": 3.1730665224445644, + "grad_norm": 1.0557425022125244, + "learning_rate": 7.601598143786783e-06, + "loss": 1.8639, + "mean_token_accuracy": 0.5790233612060547, + "num_tokens": 5998532688.0, + "step": 11734 + }, + { + "epoch": 3.173336938885884, + "grad_norm": 1.0721598863601685, + "learning_rate": 7.600138416228196e-06, + "loss": 1.8091, + "mean_token_accuracy": 0.5961476564407349, + "num_tokens": 5999056959.0, + "step": 11735 + }, + { + "epoch": 3.1736073553272037, + "grad_norm": 1.040017008781433, + "learning_rate": 7.59867879298736e-06, + "loss": 1.7126, + "mean_token_accuracy": 0.5995311737060547, + "num_tokens": 5999551985.0, + "step": 11736 + }, + { + "epoch": 3.1738777717685234, + "grad_norm": 1.0170047283172607, + "learning_rate": 7.597219274109064e-06, + "loss": 1.8362, + "mean_token_accuracy": 0.5667818784713745, + "num_tokens": 6000076239.0, + "step": 11737 + }, + { + "epoch": 3.174148188209843, + "grad_norm": 0.8797300457954407, + "learning_rate": 7.5957598596380855e-06, + "loss": 1.784, + "mean_token_accuracy": 0.579208254814148, + "num_tokens": 6000600251.0, + "step": 11738 + }, + { + "epoch": 3.1744186046511627, + "grad_norm": 1.072086215019226, + "learning_rate": 7.594300549619209e-06, + "loss": 1.9127, + "mean_token_accuracy": 0.5644245147705078, + "num_tokens": 6001122256.0, + "step": 11739 + }, + { + "epoch": 3.1746890210924823, + "grad_norm": 1.0960780382156372, + "learning_rate": 7.592841344097206e-06, + "loss": 2.0012, + "mean_token_accuracy": 0.5649328827857971, + "num_tokens": 6001646499.0, + "step": 11740 + }, + { + "epoch": 3.174959437533802, + "grad_norm": 0.4023268520832062, + "learning_rate": 7.591382243116847e-06, + "loss": 1.1387, + "mean_token_accuracy": 0.6886492371559143, + "num_tokens": 6002170778.0, + "step": 11741 + }, + { + "epoch": 3.1752298539751216, + "grad_norm": 0.9874895215034485, + "learning_rate": 7.58992324672291e-06, + "loss": 1.8151, + "mean_token_accuracy": 0.5853073000907898, + "num_tokens": 6002695004.0, + "step": 11742 + }, + { + "epoch": 3.1755002704164412, + "grad_norm": 0.9857088923454285, + "learning_rate": 7.588464354960152e-06, + "loss": 1.8659, + "mean_token_accuracy": 0.583046019077301, + "num_tokens": 6003219182.0, + "step": 11743 + }, + { + "epoch": 3.175770686857761, + "grad_norm": 0.8650644421577454, + "learning_rate": 7.58700556787334e-06, + "loss": 1.8853, + "mean_token_accuracy": 0.5637847185134888, + "num_tokens": 6003743439.0, + "step": 11744 + }, + { + "epoch": 3.1760411032990805, + "grad_norm": 0.9952272772789001, + "learning_rate": 7.585546885507238e-06, + "loss": 1.8547, + "mean_token_accuracy": 0.5687981843948364, + "num_tokens": 6004267614.0, + "step": 11745 + }, + { + "epoch": 3.1763115197404, + "grad_norm": 0.9583932161331177, + "learning_rate": 7.584088307906596e-06, + "loss": 1.8318, + "mean_token_accuracy": 0.5865049362182617, + "num_tokens": 6004758927.0, + "step": 11746 + }, + { + "epoch": 3.17658193618172, + "grad_norm": 1.0733568668365479, + "learning_rate": 7.5826298351161705e-06, + "loss": 1.8046, + "mean_token_accuracy": 0.5891413688659668, + "num_tokens": 6005269356.0, + "step": 11747 + }, + { + "epoch": 3.1768523526230394, + "grad_norm": 0.8422605991363525, + "learning_rate": 7.581171467180715e-06, + "loss": 1.8705, + "mean_token_accuracy": 0.5794762372970581, + "num_tokens": 6005793632.0, + "step": 11748 + }, + { + "epoch": 3.177122769064359, + "grad_norm": 1.0185322761535645, + "learning_rate": 7.579713204144968e-06, + "loss": 1.8145, + "mean_token_accuracy": 0.5648847818374634, + "num_tokens": 6006317737.0, + "step": 11749 + }, + { + "epoch": 3.1773931855056787, + "grad_norm": 1.155739188194275, + "learning_rate": 7.5782550460536796e-06, + "loss": 1.7774, + "mean_token_accuracy": 0.5809166431427002, + "num_tokens": 6006825150.0, + "step": 11750 + }, + { + "epoch": 3.1776636019469984, + "grad_norm": 0.9339614510536194, + "learning_rate": 7.576796992951592e-06, + "loss": 2.0189, + "mean_token_accuracy": 0.5294713973999023, + "num_tokens": 6007349345.0, + "step": 11751 + }, + { + "epoch": 3.177934018388318, + "grad_norm": 1.0786750316619873, + "learning_rate": 7.575339044883438e-06, + "loss": 1.8479, + "mean_token_accuracy": 0.5827007293701172, + "num_tokens": 6007814153.0, + "step": 11752 + }, + { + "epoch": 3.1782044348296377, + "grad_norm": 1.0660808086395264, + "learning_rate": 7.573881201893951e-06, + "loss": 1.8191, + "mean_token_accuracy": 0.5668187737464905, + "num_tokens": 6008338428.0, + "step": 11753 + }, + { + "epoch": 3.1784748512709573, + "grad_norm": 0.9413601160049438, + "learning_rate": 7.5724234640278675e-06, + "loss": 1.7817, + "mean_token_accuracy": 0.5913306474685669, + "num_tokens": 6008855639.0, + "step": 11754 + }, + { + "epoch": 3.178745267712277, + "grad_norm": 0.914222002029419, + "learning_rate": 7.570965831329909e-06, + "loss": 1.8313, + "mean_token_accuracy": 0.5551074743270874, + "num_tokens": 6009379766.0, + "step": 11755 + }, + { + "epoch": 3.1790156841535966, + "grad_norm": 1.1824630498886108, + "learning_rate": 7.569508303844804e-06, + "loss": 1.9502, + "mean_token_accuracy": 0.5468655824661255, + "num_tokens": 6009866648.0, + "step": 11756 + }, + { + "epoch": 3.1792861005949162, + "grad_norm": 0.9204351305961609, + "learning_rate": 7.568050881617272e-06, + "loss": 1.8638, + "mean_token_accuracy": 0.5790208578109741, + "num_tokens": 6010390812.0, + "step": 11757 + }, + { + "epoch": 3.179556517036236, + "grad_norm": 0.9304022789001465, + "learning_rate": 7.566593564692031e-06, + "loss": 1.7641, + "mean_token_accuracy": 0.597424328327179, + "num_tokens": 6010915036.0, + "step": 11758 + }, + { + "epoch": 3.1798269334775555, + "grad_norm": 1.019801139831543, + "learning_rate": 7.5651363531138e-06, + "loss": 1.8339, + "mean_token_accuracy": 0.5710423588752747, + "num_tokens": 6011439182.0, + "step": 11759 + }, + { + "epoch": 3.180097349918875, + "grad_norm": 0.9414653182029724, + "learning_rate": 7.563679246927287e-06, + "loss": 1.8054, + "mean_token_accuracy": 0.5904885530471802, + "num_tokens": 6011963344.0, + "step": 11760 + }, + { + "epoch": 3.180367766360195, + "grad_norm": 0.43494611978530884, + "learning_rate": 7.562222246177196e-06, + "loss": 1.0422, + "mean_token_accuracy": 0.728236198425293, + "num_tokens": 6012487569.0, + "step": 11761 + }, + { + "epoch": 3.1806381828015144, + "grad_norm": 1.0614651441574097, + "learning_rate": 7.560765350908242e-06, + "loss": 1.846, + "mean_token_accuracy": 0.5835294723510742, + "num_tokens": 6013011783.0, + "step": 11762 + }, + { + "epoch": 3.180908599242834, + "grad_norm": 0.9197864532470703, + "learning_rate": 7.559308561165115e-06, + "loss": 1.805, + "mean_token_accuracy": 0.59014493227005, + "num_tokens": 6013535870.0, + "step": 11763 + }, + { + "epoch": 3.1811790156841537, + "grad_norm": 0.9478070139884949, + "learning_rate": 7.557851876992523e-06, + "loss": 1.8394, + "mean_token_accuracy": 0.5705944299697876, + "num_tokens": 6014060015.0, + "step": 11764 + }, + { + "epoch": 3.1814494321254734, + "grad_norm": 0.939940333366394, + "learning_rate": 7.556395298435157e-06, + "loss": 1.8307, + "mean_token_accuracy": 0.574948251247406, + "num_tokens": 6014584270.0, + "step": 11765 + }, + { + "epoch": 3.181719848566793, + "grad_norm": 0.9812304377555847, + "learning_rate": 7.554938825537712e-06, + "loss": 1.846, + "mean_token_accuracy": 0.5658038854598999, + "num_tokens": 6015108395.0, + "step": 11766 + }, + { + "epoch": 3.1819902650081127, + "grad_norm": 0.9081761837005615, + "learning_rate": 7.5534824583448715e-06, + "loss": 1.852, + "mean_token_accuracy": 0.5750579833984375, + "num_tokens": 6015632593.0, + "step": 11767 + }, + { + "epoch": 3.1822606814494323, + "grad_norm": 1.158389925956726, + "learning_rate": 7.55202619690133e-06, + "loss": 1.978, + "mean_token_accuracy": 0.5605320930480957, + "num_tokens": 6016156847.0, + "step": 11768 + }, + { + "epoch": 3.182531097890752, + "grad_norm": 0.9760165810585022, + "learning_rate": 7.550570041251764e-06, + "loss": 1.8399, + "mean_token_accuracy": 0.5608198642730713, + "num_tokens": 6016681127.0, + "step": 11769 + }, + { + "epoch": 3.1828015143320716, + "grad_norm": 0.8949003219604492, + "learning_rate": 7.549113991440848e-06, + "loss": 1.914, + "mean_token_accuracy": 0.5475919246673584, + "num_tokens": 6017205310.0, + "step": 11770 + }, + { + "epoch": 3.183071930773391, + "grad_norm": 1.1645196676254272, + "learning_rate": 7.547658047513267e-06, + "loss": 1.9655, + "mean_token_accuracy": 0.5815862417221069, + "num_tokens": 6017729526.0, + "step": 11771 + }, + { + "epoch": 3.183342347214711, + "grad_norm": 1.0838302373886108, + "learning_rate": 7.546202209513691e-06, + "loss": 1.8649, + "mean_token_accuracy": 0.5739110708236694, + "num_tokens": 6018253758.0, + "step": 11772 + }, + { + "epoch": 3.18361276365603, + "grad_norm": 0.8988072276115417, + "learning_rate": 7.544746477486784e-06, + "loss": 1.9557, + "mean_token_accuracy": 0.5526565313339233, + "num_tokens": 6018777977.0, + "step": 11773 + }, + { + "epoch": 3.1838831800973497, + "grad_norm": 1.099310278892517, + "learning_rate": 7.543290851477221e-06, + "loss": 1.8709, + "mean_token_accuracy": 0.5671447515487671, + "num_tokens": 6019302236.0, + "step": 11774 + }, + { + "epoch": 3.1841535965386694, + "grad_norm": 1.0557482242584229, + "learning_rate": 7.541835331529656e-06, + "loss": 1.7541, + "mean_token_accuracy": 0.6051827669143677, + "num_tokens": 6019816944.0, + "step": 11775 + }, + { + "epoch": 3.184424012979989, + "grad_norm": 1.046268105506897, + "learning_rate": 7.5403799176887536e-06, + "loss": 1.7613, + "mean_token_accuracy": 0.5625752210617065, + "num_tokens": 6020336605.0, + "step": 11776 + }, + { + "epoch": 3.1846944294213086, + "grad_norm": 1.0120718479156494, + "learning_rate": 7.538924609999172e-06, + "loss": 1.7853, + "mean_token_accuracy": 0.5769312381744385, + "num_tokens": 6020860814.0, + "step": 11777 + }, + { + "epoch": 3.1849648458626283, + "grad_norm": 1.012802004814148, + "learning_rate": 7.537469408505558e-06, + "loss": 1.8965, + "mean_token_accuracy": 0.5614721775054932, + "num_tokens": 6021385063.0, + "step": 11778 + }, + { + "epoch": 3.185235262303948, + "grad_norm": 1.009386658668518, + "learning_rate": 7.536014313252567e-06, + "loss": 1.7648, + "mean_token_accuracy": 0.5900777578353882, + "num_tokens": 6021851811.0, + "step": 11779 + }, + { + "epoch": 3.1855056787452676, + "grad_norm": 1.180132269859314, + "learning_rate": 7.534559324284847e-06, + "loss": 1.822, + "mean_token_accuracy": 0.584032416343689, + "num_tokens": 6022370154.0, + "step": 11780 + }, + { + "epoch": 3.185776095186587, + "grad_norm": 0.4125131070613861, + "learning_rate": 7.533104441647033e-06, + "loss": 1.0766, + "mean_token_accuracy": 0.7095332145690918, + "num_tokens": 6022894230.0, + "step": 11781 + }, + { + "epoch": 3.186046511627907, + "grad_norm": 1.2438198328018188, + "learning_rate": 7.531649665383774e-06, + "loss": 1.8769, + "mean_token_accuracy": 0.5737366676330566, + "num_tokens": 6023418389.0, + "step": 11782 + }, + { + "epoch": 3.1863169280692265, + "grad_norm": 1.209403157234192, + "learning_rate": 7.5301949955397015e-06, + "loss": 1.7957, + "mean_token_accuracy": 0.5806849598884583, + "num_tokens": 6023942652.0, + "step": 11783 + }, + { + "epoch": 3.186587344510546, + "grad_norm": 1.0113072395324707, + "learning_rate": 7.528740432159447e-06, + "loss": 1.8577, + "mean_token_accuracy": 0.572751522064209, + "num_tokens": 6024441473.0, + "step": 11784 + }, + { + "epoch": 3.186857760951866, + "grad_norm": 1.010311484336853, + "learning_rate": 7.52728597528765e-06, + "loss": 1.9472, + "mean_token_accuracy": 0.5590915083885193, + "num_tokens": 6024965702.0, + "step": 11785 + }, + { + "epoch": 3.1871281773931854, + "grad_norm": 1.0124858617782593, + "learning_rate": 7.5258316249689315e-06, + "loss": 1.746, + "mean_token_accuracy": 0.591265082359314, + "num_tokens": 6025489906.0, + "step": 11786 + }, + { + "epoch": 3.187398593834505, + "grad_norm": 1.0674824714660645, + "learning_rate": 7.524377381247909e-06, + "loss": 1.8999, + "mean_token_accuracy": 0.5706346035003662, + "num_tokens": 6026014110.0, + "step": 11787 + }, + { + "epoch": 3.1876690102758247, + "grad_norm": 0.973982036113739, + "learning_rate": 7.522923244169218e-06, + "loss": 1.8628, + "mean_token_accuracy": 0.5772387981414795, + "num_tokens": 6026538290.0, + "step": 11788 + }, + { + "epoch": 3.1879394267171444, + "grad_norm": 0.9155057072639465, + "learning_rate": 7.521469213777466e-06, + "loss": 1.8063, + "mean_token_accuracy": 0.577068567276001, + "num_tokens": 6027033013.0, + "step": 11789 + }, + { + "epoch": 3.188209843158464, + "grad_norm": 0.9409471154212952, + "learning_rate": 7.520015290117262e-06, + "loss": 1.7429, + "mean_token_accuracy": 0.5930798053741455, + "num_tokens": 6027557211.0, + "step": 11790 + }, + { + "epoch": 3.1884802595997837, + "grad_norm": 0.9519196152687073, + "learning_rate": 7.518561473233228e-06, + "loss": 1.73, + "mean_token_accuracy": 0.592309832572937, + "num_tokens": 6028081344.0, + "step": 11791 + }, + { + "epoch": 3.1887506760411033, + "grad_norm": 1.0231069326400757, + "learning_rate": 7.517107763169966e-06, + "loss": 1.8528, + "mean_token_accuracy": 0.5737898349761963, + "num_tokens": 6028605557.0, + "step": 11792 + }, + { + "epoch": 3.189021092482423, + "grad_norm": 1.2648890018463135, + "learning_rate": 7.515654159972076e-06, + "loss": 1.9229, + "mean_token_accuracy": 0.5498798489570618, + "num_tokens": 6029129640.0, + "step": 11793 + }, + { + "epoch": 3.1892915089237426, + "grad_norm": 1.0380395650863647, + "learning_rate": 7.514200663684168e-06, + "loss": 1.8297, + "mean_token_accuracy": 0.5781029462814331, + "num_tokens": 6029605675.0, + "step": 11794 + }, + { + "epoch": 3.1895619253650622, + "grad_norm": 1.0531071424484253, + "learning_rate": 7.512747274350835e-06, + "loss": 1.8044, + "mean_token_accuracy": 0.5755111575126648, + "num_tokens": 6030113603.0, + "step": 11795 + }, + { + "epoch": 3.189832341806382, + "grad_norm": 1.1053049564361572, + "learning_rate": 7.5112939920166685e-06, + "loss": 1.9882, + "mean_token_accuracy": 0.5608538389205933, + "num_tokens": 6030581316.0, + "step": 11796 + }, + { + "epoch": 3.1901027582477015, + "grad_norm": 1.068912148475647, + "learning_rate": 7.509840816726264e-06, + "loss": 1.9225, + "mean_token_accuracy": 0.549066424369812, + "num_tokens": 6031105535.0, + "step": 11797 + }, + { + "epoch": 3.190373174689021, + "grad_norm": 1.0522311925888062, + "learning_rate": 7.508387748524204e-06, + "loss": 1.92, + "mean_token_accuracy": 0.5890600681304932, + "num_tokens": 6031567940.0, + "step": 11798 + }, + { + "epoch": 3.190643591130341, + "grad_norm": 1.0935620069503784, + "learning_rate": 7.506934787455078e-06, + "loss": 1.8556, + "mean_token_accuracy": 0.5863372087478638, + "num_tokens": 6032092219.0, + "step": 11799 + }, + { + "epoch": 3.1909140075716604, + "grad_norm": 1.1444945335388184, + "learning_rate": 7.505481933563466e-06, + "loss": 1.8898, + "mean_token_accuracy": 0.5685540437698364, + "num_tokens": 6032616499.0, + "step": 11800 + }, + { + "epoch": 3.19118442401298, + "grad_norm": 0.42854759097099304, + "learning_rate": 7.504029186893942e-06, + "loss": 1.0683, + "mean_token_accuracy": 0.7254258394241333, + "num_tokens": 6033067887.0, + "step": 11801 + }, + { + "epoch": 3.1914548404542997, + "grad_norm": 1.2507368326187134, + "learning_rate": 7.502576547491085e-06, + "loss": 1.8796, + "mean_token_accuracy": 0.5824877619743347, + "num_tokens": 6033592111.0, + "step": 11802 + }, + { + "epoch": 3.1917252568956194, + "grad_norm": 1.495653748512268, + "learning_rate": 7.501124015399468e-06, + "loss": 1.8447, + "mean_token_accuracy": 0.5697890520095825, + "num_tokens": 6034116347.0, + "step": 11803 + }, + { + "epoch": 3.191995673336939, + "grad_norm": 1.132944941520691, + "learning_rate": 7.499671590663651e-06, + "loss": 1.9314, + "mean_token_accuracy": 0.5534117817878723, + "num_tokens": 6034640548.0, + "step": 11804 + }, + { + "epoch": 3.1922660897782587, + "grad_norm": 1.0427072048187256, + "learning_rate": 7.498219273328207e-06, + "loss": 1.7161, + "mean_token_accuracy": 0.5910577774047852, + "num_tokens": 6035116380.0, + "step": 11805 + }, + { + "epoch": 3.1925365062195783, + "grad_norm": 0.9839488863945007, + "learning_rate": 7.496767063437691e-06, + "loss": 1.7693, + "mean_token_accuracy": 0.5838062763214111, + "num_tokens": 6035640454.0, + "step": 11806 + }, + { + "epoch": 3.192806922660898, + "grad_norm": 1.1502116918563843, + "learning_rate": 7.495314961036666e-06, + "loss": 1.9594, + "mean_token_accuracy": 0.5376536250114441, + "num_tokens": 6036164547.0, + "step": 11807 + }, + { + "epoch": 3.1930773391022176, + "grad_norm": 1.1656568050384521, + "learning_rate": 7.493862966169683e-06, + "loss": 1.9477, + "mean_token_accuracy": 0.5793077349662781, + "num_tokens": 6036688773.0, + "step": 11808 + }, + { + "epoch": 3.1933477555435372, + "grad_norm": 0.9732213616371155, + "learning_rate": 7.4924110788812965e-06, + "loss": 1.7746, + "mean_token_accuracy": 0.6232973337173462, + "num_tokens": 6037148333.0, + "step": 11809 + }, + { + "epoch": 3.193618171984857, + "grad_norm": 1.0164846181869507, + "learning_rate": 7.49095929921605e-06, + "loss": 1.8432, + "mean_token_accuracy": 0.5734460949897766, + "num_tokens": 6037642380.0, + "step": 11810 + }, + { + "epoch": 3.1938885884261765, + "grad_norm": 1.3231866359710693, + "learning_rate": 7.489507627218495e-06, + "loss": 1.7634, + "mean_token_accuracy": 0.5944504737854004, + "num_tokens": 6038166656.0, + "step": 11811 + }, + { + "epoch": 3.1941590048674957, + "grad_norm": 1.1012332439422607, + "learning_rate": 7.488056062933171e-06, + "loss": 1.7742, + "mean_token_accuracy": 0.5872819423675537, + "num_tokens": 6038690790.0, + "step": 11812 + }, + { + "epoch": 3.194429421308816, + "grad_norm": 1.0098148584365845, + "learning_rate": 7.48660460640461e-06, + "loss": 1.815, + "mean_token_accuracy": 0.577498197555542, + "num_tokens": 6039151380.0, + "step": 11813 + }, + { + "epoch": 3.194699837750135, + "grad_norm": 1.1242594718933105, + "learning_rate": 7.485153257677354e-06, + "loss": 1.8517, + "mean_token_accuracy": 0.5825563073158264, + "num_tokens": 6039632859.0, + "step": 11814 + }, + { + "epoch": 3.1949702541914546, + "grad_norm": 0.9917238354682922, + "learning_rate": 7.483702016795935e-06, + "loss": 1.8217, + "mean_token_accuracy": 0.5774169564247131, + "num_tokens": 6040156997.0, + "step": 11815 + }, + { + "epoch": 3.1952406706327743, + "grad_norm": 0.8791786432266235, + "learning_rate": 7.482250883804874e-06, + "loss": 1.7335, + "mean_token_accuracy": 0.5953161716461182, + "num_tokens": 6040681276.0, + "step": 11816 + }, + { + "epoch": 3.195511087074094, + "grad_norm": 1.0789626836776733, + "learning_rate": 7.4807998587487065e-06, + "loss": 1.8725, + "mean_token_accuracy": 0.5767192244529724, + "num_tokens": 6041149813.0, + "step": 11817 + }, + { + "epoch": 3.1957815035154136, + "grad_norm": 1.0764328241348267, + "learning_rate": 7.479348941671947e-06, + "loss": 1.7764, + "mean_token_accuracy": 0.5780385732650757, + "num_tokens": 6041674060.0, + "step": 11818 + }, + { + "epoch": 3.196051919956733, + "grad_norm": 1.1980546712875366, + "learning_rate": 7.477898132619112e-06, + "loss": 1.8701, + "mean_token_accuracy": 0.5600228309631348, + "num_tokens": 6042198339.0, + "step": 11819 + }, + { + "epoch": 3.196322336398053, + "grad_norm": 1.0491927862167358, + "learning_rate": 7.476447431634723e-06, + "loss": 2.0027, + "mean_token_accuracy": 0.5541285276412964, + "num_tokens": 6042722581.0, + "step": 11820 + }, + { + "epoch": 3.1965927528393725, + "grad_norm": 0.3699760138988495, + "learning_rate": 7.474996838763291e-06, + "loss": 1.0761, + "mean_token_accuracy": 0.7029080390930176, + "num_tokens": 6043246669.0, + "step": 11821 + }, + { + "epoch": 3.196863169280692, + "grad_norm": 1.1752251386642456, + "learning_rate": 7.473546354049317e-06, + "loss": 1.8141, + "mean_token_accuracy": 0.5832599401473999, + "num_tokens": 6043770735.0, + "step": 11822 + }, + { + "epoch": 3.197133585722012, + "grad_norm": 1.0611413717269897, + "learning_rate": 7.472095977537315e-06, + "loss": 1.875, + "mean_token_accuracy": 0.5675163865089417, + "num_tokens": 6044294910.0, + "step": 11823 + }, + { + "epoch": 3.1974040021633314, + "grad_norm": 0.9606508612632751, + "learning_rate": 7.470645709271782e-06, + "loss": 1.797, + "mean_token_accuracy": 0.5809786319732666, + "num_tokens": 6044819173.0, + "step": 11824 + }, + { + "epoch": 3.197674418604651, + "grad_norm": 1.0497550964355469, + "learning_rate": 7.469195549297219e-06, + "loss": 1.7017, + "mean_token_accuracy": 0.6129792332649231, + "num_tokens": 6045287219.0, + "step": 11825 + }, + { + "epoch": 3.1979448350459707, + "grad_norm": 1.1012983322143555, + "learning_rate": 7.467745497658121e-06, + "loss": 1.8868, + "mean_token_accuracy": 0.5701574683189392, + "num_tokens": 6045803690.0, + "step": 11826 + }, + { + "epoch": 3.1982152514872904, + "grad_norm": 0.9182665348052979, + "learning_rate": 7.466295554398974e-06, + "loss": 1.7524, + "mean_token_accuracy": 0.5799961090087891, + "num_tokens": 6046327924.0, + "step": 11827 + }, + { + "epoch": 3.19848566792861, + "grad_norm": 0.9512636065483093, + "learning_rate": 7.464845719564275e-06, + "loss": 1.7648, + "mean_token_accuracy": 0.5995182991027832, + "num_tokens": 6046802582.0, + "step": 11828 + }, + { + "epoch": 3.1987560843699296, + "grad_norm": 1.1100515127182007, + "learning_rate": 7.463395993198508e-06, + "loss": 1.7951, + "mean_token_accuracy": 0.6036313772201538, + "num_tokens": 6047264581.0, + "step": 11829 + }, + { + "epoch": 3.1990265008112493, + "grad_norm": 1.0567970275878906, + "learning_rate": 7.4619463753461475e-06, + "loss": 1.8682, + "mean_token_accuracy": 0.5722934007644653, + "num_tokens": 6047788864.0, + "step": 11830 + }, + { + "epoch": 3.199296917252569, + "grad_norm": 1.0813820362091064, + "learning_rate": 7.460496866051679e-06, + "loss": 1.7208, + "mean_token_accuracy": 0.6019391417503357, + "num_tokens": 6048265894.0, + "step": 11831 + }, + { + "epoch": 3.1995673336938886, + "grad_norm": 0.933368444442749, + "learning_rate": 7.459047465359577e-06, + "loss": 1.5288, + "mean_token_accuracy": 0.6271567344665527, + "num_tokens": 6048790154.0, + "step": 11832 + }, + { + "epoch": 3.199837750135208, + "grad_norm": 1.1763554811477661, + "learning_rate": 7.457598173314309e-06, + "loss": 1.8934, + "mean_token_accuracy": 0.5653340220451355, + "num_tokens": 6049314264.0, + "step": 11833 + }, + { + "epoch": 3.200108166576528, + "grad_norm": 1.1505775451660156, + "learning_rate": 7.45614898996035e-06, + "loss": 1.8637, + "mean_token_accuracy": 0.5793060064315796, + "num_tokens": 6049722546.0, + "step": 11834 + }, + { + "epoch": 3.2003785830178475, + "grad_norm": 0.9418684244155884, + "learning_rate": 7.45469991534216e-06, + "loss": 1.8099, + "mean_token_accuracy": 0.5717471837997437, + "num_tokens": 6050237991.0, + "step": 11835 + }, + { + "epoch": 3.200648999459167, + "grad_norm": 1.0473253726959229, + "learning_rate": 7.453250949504201e-06, + "loss": 1.8705, + "mean_token_accuracy": 0.5827596187591553, + "num_tokens": 6050747462.0, + "step": 11836 + }, + { + "epoch": 3.200919415900487, + "grad_norm": 1.011113166809082, + "learning_rate": 7.451802092490936e-06, + "loss": 1.8713, + "mean_token_accuracy": 0.5825043320655823, + "num_tokens": 6051236440.0, + "step": 11837 + }, + { + "epoch": 3.2011898323418064, + "grad_norm": 1.0428310632705688, + "learning_rate": 7.450353344346817e-06, + "loss": 1.8731, + "mean_token_accuracy": 0.5719442367553711, + "num_tokens": 6051760561.0, + "step": 11838 + }, + { + "epoch": 3.201460248783126, + "grad_norm": 1.0153594017028809, + "learning_rate": 7.448904705116293e-06, + "loss": 1.9276, + "mean_token_accuracy": 0.5940819978713989, + "num_tokens": 6052220096.0, + "step": 11839 + }, + { + "epoch": 3.2017306652244457, + "grad_norm": 1.0054044723510742, + "learning_rate": 7.447456174843819e-06, + "loss": 1.8835, + "mean_token_accuracy": 0.5708667039871216, + "num_tokens": 6052744340.0, + "step": 11840 + }, + { + "epoch": 3.2020010816657654, + "grad_norm": 0.35687056183815, + "learning_rate": 7.446007753573836e-06, + "loss": 0.9222, + "mean_token_accuracy": 0.7443234920501709, + "num_tokens": 6053268606.0, + "step": 11841 + }, + { + "epoch": 3.202271498107085, + "grad_norm": 1.3537029027938843, + "learning_rate": 7.444559441350787e-06, + "loss": 1.8371, + "mean_token_accuracy": 0.587009072303772, + "num_tokens": 6053788472.0, + "step": 11842 + }, + { + "epoch": 3.2025419145484046, + "grad_norm": 1.0299603939056396, + "learning_rate": 7.44311123821911e-06, + "loss": 1.787, + "mean_token_accuracy": 0.5915348529815674, + "num_tokens": 6054312487.0, + "step": 11843 + }, + { + "epoch": 3.2028123309897243, + "grad_norm": 1.0698078870773315, + "learning_rate": 7.4416631442232435e-06, + "loss": 1.8699, + "mean_token_accuracy": 0.5743829011917114, + "num_tokens": 6054815242.0, + "step": 11844 + }, + { + "epoch": 3.203082747431044, + "grad_norm": 1.101133108139038, + "learning_rate": 7.440215159407609e-06, + "loss": 1.806, + "mean_token_accuracy": 0.5802603363990784, + "num_tokens": 6055339435.0, + "step": 11845 + }, + { + "epoch": 3.2033531638723636, + "grad_norm": 0.9793105721473694, + "learning_rate": 7.438767283816648e-06, + "loss": 1.7594, + "mean_token_accuracy": 0.5855897665023804, + "num_tokens": 6055823813.0, + "step": 11846 + }, + { + "epoch": 3.203623580313683, + "grad_norm": 0.9840914011001587, + "learning_rate": 7.437319517494774e-06, + "loss": 1.9404, + "mean_token_accuracy": 0.545956552028656, + "num_tokens": 6056348080.0, + "step": 11847 + }, + { + "epoch": 3.203893996755003, + "grad_norm": 1.0419390201568604, + "learning_rate": 7.435871860486421e-06, + "loss": 1.7719, + "mean_token_accuracy": 0.6017953157424927, + "num_tokens": 6056872242.0, + "step": 11848 + }, + { + "epoch": 3.2041644131963225, + "grad_norm": 1.2537298202514648, + "learning_rate": 7.434424312835997e-06, + "loss": 1.9445, + "mean_token_accuracy": 0.5516859292984009, + "num_tokens": 6057396359.0, + "step": 11849 + }, + { + "epoch": 3.204434829637642, + "grad_norm": 1.0351825952529907, + "learning_rate": 7.432976874587921e-06, + "loss": 1.9337, + "mean_token_accuracy": 0.5755380392074585, + "num_tokens": 6057920595.0, + "step": 11850 + }, + { + "epoch": 3.204705246078962, + "grad_norm": 1.0830974578857422, + "learning_rate": 7.431529545786606e-06, + "loss": 1.9207, + "mean_token_accuracy": 0.5738108158111572, + "num_tokens": 6058404717.0, + "step": 11851 + }, + { + "epoch": 3.2049756625202814, + "grad_norm": 1.0925153493881226, + "learning_rate": 7.430082326476458e-06, + "loss": 1.8086, + "mean_token_accuracy": 0.5730562210083008, + "num_tokens": 6058928980.0, + "step": 11852 + }, + { + "epoch": 3.2052460789616006, + "grad_norm": 1.005753755569458, + "learning_rate": 7.4286352167018805e-06, + "loss": 1.8652, + "mean_token_accuracy": 0.5542166233062744, + "num_tokens": 6059453112.0, + "step": 11853 + }, + { + "epoch": 3.2055164954029207, + "grad_norm": 1.1305044889450073, + "learning_rate": 7.42718821650728e-06, + "loss": 1.8429, + "mean_token_accuracy": 0.581847071647644, + "num_tokens": 6059977119.0, + "step": 11854 + }, + { + "epoch": 3.20578691184424, + "grad_norm": 1.3221180438995361, + "learning_rate": 7.425741325937053e-06, + "loss": 1.6498, + "mean_token_accuracy": 0.6193199157714844, + "num_tokens": 6060477051.0, + "step": 11855 + }, + { + "epoch": 3.2060573282855596, + "grad_norm": 1.159024953842163, + "learning_rate": 7.424294545035588e-06, + "loss": 1.872, + "mean_token_accuracy": 0.577850341796875, + "num_tokens": 6060973825.0, + "step": 11856 + }, + { + "epoch": 3.206327744726879, + "grad_norm": 1.2539684772491455, + "learning_rate": 7.422847873847285e-06, + "loss": 1.7577, + "mean_token_accuracy": 0.5510819554328918, + "num_tokens": 6061419697.0, + "step": 11857 + }, + { + "epoch": 3.206598161168199, + "grad_norm": 1.4740405082702637, + "learning_rate": 7.421401312416532e-06, + "loss": 1.8186, + "mean_token_accuracy": 0.6175661087036133, + "num_tokens": 6061875761.0, + "step": 11858 + }, + { + "epoch": 3.2068685776095185, + "grad_norm": 1.4906476736068726, + "learning_rate": 7.419954860787706e-06, + "loss": 1.914, + "mean_token_accuracy": 0.5684860944747925, + "num_tokens": 6062399894.0, + "step": 11859 + }, + { + "epoch": 3.207138994050838, + "grad_norm": 1.2353978157043457, + "learning_rate": 7.418508519005196e-06, + "loss": 1.8694, + "mean_token_accuracy": 0.5756118893623352, + "num_tokens": 6062924173.0, + "step": 11860 + }, + { + "epoch": 3.2074094104921578, + "grad_norm": 0.3915657103061676, + "learning_rate": 7.417062287113379e-06, + "loss": 1.0929, + "mean_token_accuracy": 0.7104442119598389, + "num_tokens": 6063448410.0, + "step": 11861 + }, + { + "epoch": 3.2076798269334774, + "grad_norm": 1.5718598365783691, + "learning_rate": 7.4156161651566205e-06, + "loss": 1.9003, + "mean_token_accuracy": 0.563900351524353, + "num_tokens": 6063953665.0, + "step": 11862 + }, + { + "epoch": 3.207950243374797, + "grad_norm": 1.4414665699005127, + "learning_rate": 7.414170153179303e-06, + "loss": 1.7662, + "mean_token_accuracy": 0.6145191192626953, + "num_tokens": 6064388056.0, + "step": 11863 + }, + { + "epoch": 3.2082206598161167, + "grad_norm": 1.1182957887649536, + "learning_rate": 7.412724251225794e-06, + "loss": 1.8154, + "mean_token_accuracy": 0.5839750170707703, + "num_tokens": 6064882827.0, + "step": 11864 + }, + { + "epoch": 3.2084910762574363, + "grad_norm": 1.0753940343856812, + "learning_rate": 7.4112784593404475e-06, + "loss": 1.9603, + "mean_token_accuracy": 0.5591682195663452, + "num_tokens": 6065390328.0, + "step": 11865 + }, + { + "epoch": 3.208761492698756, + "grad_norm": 1.2841378450393677, + "learning_rate": 7.409832777567638e-06, + "loss": 1.8787, + "mean_token_accuracy": 0.5762627124786377, + "num_tokens": 6065877779.0, + "step": 11866 + }, + { + "epoch": 3.2090319091400756, + "grad_norm": 1.2075680494308472, + "learning_rate": 7.408387205951717e-06, + "loss": 1.8522, + "mean_token_accuracy": 0.5731346607208252, + "num_tokens": 6066378841.0, + "step": 11867 + }, + { + "epoch": 3.2093023255813953, + "grad_norm": 1.0362355709075928, + "learning_rate": 7.4069417445370324e-06, + "loss": 1.8529, + "mean_token_accuracy": 0.5827720165252686, + "num_tokens": 6066886012.0, + "step": 11868 + }, + { + "epoch": 3.209572742022715, + "grad_norm": 0.9781968593597412, + "learning_rate": 7.405496393367946e-06, + "loss": 1.812, + "mean_token_accuracy": 0.5682666897773743, + "num_tokens": 6067410246.0, + "step": 11869 + }, + { + "epoch": 3.2098431584640346, + "grad_norm": 1.407853126525879, + "learning_rate": 7.404051152488798e-06, + "loss": 1.9053, + "mean_token_accuracy": 0.5756886601448059, + "num_tokens": 6067897784.0, + "step": 11870 + }, + { + "epoch": 3.210113574905354, + "grad_norm": 1.3380653858184814, + "learning_rate": 7.402606021943939e-06, + "loss": 1.8603, + "mean_token_accuracy": 0.5765430927276611, + "num_tokens": 6068421969.0, + "step": 11871 + }, + { + "epoch": 3.210383991346674, + "grad_norm": 1.1135703325271606, + "learning_rate": 7.401161001777706e-06, + "loss": 1.9544, + "mean_token_accuracy": 0.5565186738967896, + "num_tokens": 6068870638.0, + "step": 11872 + }, + { + "epoch": 3.2106544077879935, + "grad_norm": 1.1442290544509888, + "learning_rate": 7.3997160920344344e-06, + "loss": 1.5522, + "mean_token_accuracy": 0.6351354122161865, + "num_tokens": 6069394788.0, + "step": 11873 + }, + { + "epoch": 3.210924824229313, + "grad_norm": 1.0309261083602905, + "learning_rate": 7.398271292758465e-06, + "loss": 1.7659, + "mean_token_accuracy": 0.5829311609268188, + "num_tokens": 6069918976.0, + "step": 11874 + }, + { + "epoch": 3.2111952406706328, + "grad_norm": 1.2689160108566284, + "learning_rate": 7.396826603994123e-06, + "loss": 1.8279, + "mean_token_accuracy": 0.577022910118103, + "num_tokens": 6070443249.0, + "step": 11875 + }, + { + "epoch": 3.2114656571119524, + "grad_norm": 1.1840213537216187, + "learning_rate": 7.395382025785733e-06, + "loss": 1.8256, + "mean_token_accuracy": 0.5561480522155762, + "num_tokens": 6070967426.0, + "step": 11876 + }, + { + "epoch": 3.211736073553272, + "grad_norm": 1.4030444622039795, + "learning_rate": 7.3939375581776284e-06, + "loss": 1.8626, + "mean_token_accuracy": 0.581362247467041, + "num_tokens": 6071443047.0, + "step": 11877 + }, + { + "epoch": 3.2120064899945917, + "grad_norm": 1.3619598150253296, + "learning_rate": 7.392493201214122e-06, + "loss": 1.8245, + "mean_token_accuracy": 0.5640882253646851, + "num_tokens": 6071967172.0, + "step": 11878 + }, + { + "epoch": 3.2122769064359114, + "grad_norm": 0.9964812994003296, + "learning_rate": 7.39104895493953e-06, + "loss": 1.875, + "mean_token_accuracy": 0.5687023401260376, + "num_tokens": 6072491397.0, + "step": 11879 + }, + { + "epoch": 3.212547322877231, + "grad_norm": 1.3206467628479004, + "learning_rate": 7.389604819398175e-06, + "loss": 1.8667, + "mean_token_accuracy": 0.5713329315185547, + "num_tokens": 6073015664.0, + "step": 11880 + }, + { + "epoch": 3.2128177393185506, + "grad_norm": 0.37190404534339905, + "learning_rate": 7.3881607946343605e-06, + "loss": 1.1608, + "mean_token_accuracy": 0.6925527453422546, + "num_tokens": 6073539863.0, + "step": 11881 + }, + { + "epoch": 3.2130881557598703, + "grad_norm": 1.533509373664856, + "learning_rate": 7.38671688069239e-06, + "loss": 1.8612, + "mean_token_accuracy": 0.5696719884872437, + "num_tokens": 6074064052.0, + "step": 11882 + }, + { + "epoch": 3.21335857220119, + "grad_norm": 1.1736509799957275, + "learning_rate": 7.385273077616577e-06, + "loss": 1.8863, + "mean_token_accuracy": 0.572493314743042, + "num_tokens": 6074570221.0, + "step": 11883 + }, + { + "epoch": 3.2136289886425096, + "grad_norm": 1.1039907932281494, + "learning_rate": 7.383829385451213e-06, + "loss": 1.8424, + "mean_token_accuracy": 0.570549726486206, + "num_tokens": 6075054924.0, + "step": 11884 + }, + { + "epoch": 3.213899405083829, + "grad_norm": 1.0368050336837769, + "learning_rate": 7.382385804240599e-06, + "loss": 1.8373, + "mean_token_accuracy": 0.5779432654380798, + "num_tokens": 6075579205.0, + "step": 11885 + }, + { + "epoch": 3.214169821525149, + "grad_norm": 1.0635685920715332, + "learning_rate": 7.3809423340290265e-06, + "loss": 1.8165, + "mean_token_accuracy": 0.5770871043205261, + "num_tokens": 6076103238.0, + "step": 11886 + }, + { + "epoch": 3.2144402379664685, + "grad_norm": 1.418692708015442, + "learning_rate": 7.379498974860788e-06, + "loss": 1.867, + "mean_token_accuracy": 0.5706992149353027, + "num_tokens": 6076627495.0, + "step": 11887 + }, + { + "epoch": 3.214710654407788, + "grad_norm": 1.2024461030960083, + "learning_rate": 7.378055726780164e-06, + "loss": 1.6753, + "mean_token_accuracy": 0.5815627574920654, + "num_tokens": 6077151743.0, + "step": 11888 + }, + { + "epoch": 3.214981070849108, + "grad_norm": 1.0988909006118774, + "learning_rate": 7.376612589831446e-06, + "loss": 1.8453, + "mean_token_accuracy": 0.5829571485519409, + "num_tokens": 6077666901.0, + "step": 11889 + }, + { + "epoch": 3.2152514872904274, + "grad_norm": 1.0982669591903687, + "learning_rate": 7.375169564058908e-06, + "loss": 1.8896, + "mean_token_accuracy": 0.564804196357727, + "num_tokens": 6078191114.0, + "step": 11890 + }, + { + "epoch": 3.215521903731747, + "grad_norm": 1.23646080493927, + "learning_rate": 7.373726649506827e-06, + "loss": 1.8104, + "mean_token_accuracy": 0.5773035287857056, + "num_tokens": 6078715223.0, + "step": 11891 + }, + { + "epoch": 3.2157923201730667, + "grad_norm": 0.9849449992179871, + "learning_rate": 7.372283846219477e-06, + "loss": 1.8729, + "mean_token_accuracy": 0.5631634593009949, + "num_tokens": 6079239451.0, + "step": 11892 + }, + { + "epoch": 3.2160627366143864, + "grad_norm": 0.9570782780647278, + "learning_rate": 7.370841154241126e-06, + "loss": 1.885, + "mean_token_accuracy": 0.5684627294540405, + "num_tokens": 6079763739.0, + "step": 11893 + }, + { + "epoch": 3.2163331530557056, + "grad_norm": 1.118147850036621, + "learning_rate": 7.369398573616042e-06, + "loss": 1.9077, + "mean_token_accuracy": 0.5727188587188721, + "num_tokens": 6080251831.0, + "step": 11894 + }, + { + "epoch": 3.2166035694970256, + "grad_norm": 0.9938634634017944, + "learning_rate": 7.3679561043884896e-06, + "loss": 1.9106, + "mean_token_accuracy": 0.5823934674263, + "num_tokens": 6080744879.0, + "step": 11895 + }, + { + "epoch": 3.216873985938345, + "grad_norm": 0.9867343306541443, + "learning_rate": 7.366513746602719e-06, + "loss": 1.8179, + "mean_token_accuracy": 0.5636208057403564, + "num_tokens": 6081269071.0, + "step": 11896 + }, + { + "epoch": 3.2171444023796645, + "grad_norm": 1.1422901153564453, + "learning_rate": 7.3650715003029985e-06, + "loss": 1.9379, + "mean_token_accuracy": 0.569202184677124, + "num_tokens": 6081698156.0, + "step": 11897 + }, + { + "epoch": 3.217414818820984, + "grad_norm": 1.014237880706787, + "learning_rate": 7.363629365533569e-06, + "loss": 1.8706, + "mean_token_accuracy": 0.5869454145431519, + "num_tokens": 6082222391.0, + "step": 11898 + }, + { + "epoch": 3.2176852352623038, + "grad_norm": 0.8673007488250732, + "learning_rate": 7.362187342338687e-06, + "loss": 1.9044, + "mean_token_accuracy": 0.5589599609375, + "num_tokens": 6082746526.0, + "step": 11899 + }, + { + "epoch": 3.2179556517036234, + "grad_norm": 0.9406976699829102, + "learning_rate": 7.360745430762595e-06, + "loss": 1.8573, + "mean_token_accuracy": 0.5672023296356201, + "num_tokens": 6083270802.0, + "step": 11900 + }, + { + "epoch": 3.218226068144943, + "grad_norm": 0.44490304589271545, + "learning_rate": 7.359303630849537e-06, + "loss": 1.0926, + "mean_token_accuracy": 0.7105108499526978, + "num_tokens": 6083795081.0, + "step": 11901 + }, + { + "epoch": 3.2184964845862627, + "grad_norm": 1.2471280097961426, + "learning_rate": 7.357861942643745e-06, + "loss": 1.9457, + "mean_token_accuracy": 0.5625389814376831, + "num_tokens": 6084309466.0, + "step": 11902 + }, + { + "epoch": 3.2187669010275823, + "grad_norm": 1.1585391759872437, + "learning_rate": 7.356420366189465e-06, + "loss": 1.9118, + "mean_token_accuracy": 0.5495850443840027, + "num_tokens": 6084833512.0, + "step": 11903 + }, + { + "epoch": 3.219037317468902, + "grad_norm": 0.9423506259918213, + "learning_rate": 7.354978901530922e-06, + "loss": 1.7305, + "mean_token_accuracy": 0.5915467143058777, + "num_tokens": 6085299006.0, + "step": 11904 + }, + { + "epoch": 3.2193077339102216, + "grad_norm": 0.9328309893608093, + "learning_rate": 7.353537548712344e-06, + "loss": 1.8263, + "mean_token_accuracy": 0.5721055865287781, + "num_tokens": 6085823157.0, + "step": 11905 + }, + { + "epoch": 3.2195781503515413, + "grad_norm": 0.948631227016449, + "learning_rate": 7.352096307777958e-06, + "loss": 1.8193, + "mean_token_accuracy": 0.5896562337875366, + "num_tokens": 6086302899.0, + "step": 11906 + }, + { + "epoch": 3.219848566792861, + "grad_norm": 0.9859997630119324, + "learning_rate": 7.350655178771986e-06, + "loss": 1.813, + "mean_token_accuracy": 0.5714689493179321, + "num_tokens": 6086826950.0, + "step": 11907 + }, + { + "epoch": 3.2201189832341806, + "grad_norm": 0.9430743455886841, + "learning_rate": 7.349214161738642e-06, + "loss": 1.8125, + "mean_token_accuracy": 0.573153555393219, + "num_tokens": 6087351137.0, + "step": 11908 + }, + { + "epoch": 3.2203893996755, + "grad_norm": 0.7952927350997925, + "learning_rate": 7.347773256722148e-06, + "loss": 1.7936, + "mean_token_accuracy": 0.5906324982643127, + "num_tokens": 6087875328.0, + "step": 11909 + }, + { + "epoch": 3.22065981611682, + "grad_norm": 1.0111901760101318, + "learning_rate": 7.346332463766712e-06, + "loss": 1.8003, + "mean_token_accuracy": 0.5862911939620972, + "num_tokens": 6088342415.0, + "step": 11910 + }, + { + "epoch": 3.2209302325581395, + "grad_norm": 1.0485289096832275, + "learning_rate": 7.344891782916537e-06, + "loss": 1.837, + "mean_token_accuracy": 0.5814992189407349, + "num_tokens": 6088866667.0, + "step": 11911 + }, + { + "epoch": 3.221200648999459, + "grad_norm": 0.911089301109314, + "learning_rate": 7.343451214215833e-06, + "loss": 1.7602, + "mean_token_accuracy": 0.5913522243499756, + "num_tokens": 6089390831.0, + "step": 11912 + }, + { + "epoch": 3.2214710654407788, + "grad_norm": 0.8517546057701111, + "learning_rate": 7.342010757708797e-06, + "loss": 1.8385, + "mean_token_accuracy": 0.5678201913833618, + "num_tokens": 6089915107.0, + "step": 11913 + }, + { + "epoch": 3.2217414818820984, + "grad_norm": 1.4006519317626953, + "learning_rate": 7.340570413439627e-06, + "loss": 1.717, + "mean_token_accuracy": 0.6004093885421753, + "num_tokens": 6090439367.0, + "step": 11914 + }, + { + "epoch": 3.222011898323418, + "grad_norm": 1.3292152881622314, + "learning_rate": 7.339130181452523e-06, + "loss": 1.8603, + "mean_token_accuracy": 0.5508185625076294, + "num_tokens": 6090927642.0, + "step": 11915 + }, + { + "epoch": 3.2222823147647377, + "grad_norm": 1.1353933811187744, + "learning_rate": 7.337690061791667e-06, + "loss": 1.7153, + "mean_token_accuracy": 0.6043657660484314, + "num_tokens": 6091451916.0, + "step": 11916 + }, + { + "epoch": 3.2225527312060573, + "grad_norm": 1.0772507190704346, + "learning_rate": 7.336250054501253e-06, + "loss": 1.8073, + "mean_token_accuracy": 0.5665455460548401, + "num_tokens": 6091976037.0, + "step": 11917 + }, + { + "epoch": 3.222823147647377, + "grad_norm": 1.0559170246124268, + "learning_rate": 7.3348101596254624e-06, + "loss": 1.8903, + "mean_token_accuracy": 0.5570418834686279, + "num_tokens": 6092500304.0, + "step": 11918 + }, + { + "epoch": 3.2230935640886966, + "grad_norm": 1.1176679134368896, + "learning_rate": 7.3333703772084704e-06, + "loss": 1.7899, + "mean_token_accuracy": 0.5817910432815552, + "num_tokens": 6093024547.0, + "step": 11919 + }, + { + "epoch": 3.2233639805300163, + "grad_norm": 1.0153884887695312, + "learning_rate": 7.331930707294463e-06, + "loss": 1.8738, + "mean_token_accuracy": 0.5755305290222168, + "num_tokens": 6093485613.0, + "step": 11920 + }, + { + "epoch": 3.223634396971336, + "grad_norm": 0.3568556308746338, + "learning_rate": 7.330491149927606e-06, + "loss": 1.0772, + "mean_token_accuracy": 0.709109902381897, + "num_tokens": 6094009833.0, + "step": 11921 + }, + { + "epoch": 3.2239048134126556, + "grad_norm": 1.1078402996063232, + "learning_rate": 7.329051705152071e-06, + "loss": 1.7865, + "mean_token_accuracy": 0.5846924185752869, + "num_tokens": 6094534025.0, + "step": 11922 + }, + { + "epoch": 3.224175229853975, + "grad_norm": 1.2271226644515991, + "learning_rate": 7.327612373012025e-06, + "loss": 1.9003, + "mean_token_accuracy": 0.5728389024734497, + "num_tokens": 6095056312.0, + "step": 11923 + }, + { + "epoch": 3.224445646295295, + "grad_norm": 1.1721373796463013, + "learning_rate": 7.326173153551634e-06, + "loss": 1.8081, + "mean_token_accuracy": 0.606559157371521, + "num_tokens": 6095485282.0, + "step": 11924 + }, + { + "epoch": 3.2247160627366145, + "grad_norm": 0.9176048040390015, + "learning_rate": 7.324734046815052e-06, + "loss": 1.7674, + "mean_token_accuracy": 0.5884312391281128, + "num_tokens": 6096009449.0, + "step": 11925 + }, + { + "epoch": 3.224986479177934, + "grad_norm": 1.0228625535964966, + "learning_rate": 7.32329505284644e-06, + "loss": 1.9326, + "mean_token_accuracy": 0.5806074142456055, + "num_tokens": 6096499870.0, + "step": 11926 + }, + { + "epoch": 3.2252568956192538, + "grad_norm": 1.0813592672348022, + "learning_rate": 7.321856171689947e-06, + "loss": 1.8195, + "mean_token_accuracy": 0.5734888911247253, + "num_tokens": 6097024134.0, + "step": 11927 + }, + { + "epoch": 3.2255273120605734, + "grad_norm": 1.0367945432662964, + "learning_rate": 7.3204174033897215e-06, + "loss": 1.8781, + "mean_token_accuracy": 0.5698530673980713, + "num_tokens": 6097546358.0, + "step": 11928 + }, + { + "epoch": 3.225797728501893, + "grad_norm": 1.0064363479614258, + "learning_rate": 7.318978747989915e-06, + "loss": 1.9239, + "mean_token_accuracy": 0.5704196691513062, + "num_tokens": 6097966120.0, + "step": 11929 + }, + { + "epoch": 3.2260681449432127, + "grad_norm": 0.9799158573150635, + "learning_rate": 7.317540205534663e-06, + "loss": 1.7186, + "mean_token_accuracy": 0.6180764436721802, + "num_tokens": 6098435381.0, + "step": 11930 + }, + { + "epoch": 3.2263385613845323, + "grad_norm": 1.0049866437911987, + "learning_rate": 7.316101776068107e-06, + "loss": 1.9073, + "mean_token_accuracy": 0.5579602122306824, + "num_tokens": 6098959443.0, + "step": 11931 + }, + { + "epoch": 3.226608977825852, + "grad_norm": 1.0120959281921387, + "learning_rate": 7.3146634596343855e-06, + "loss": 1.8394, + "mean_token_accuracy": 0.5702997446060181, + "num_tokens": 6099483721.0, + "step": 11932 + }, + { + "epoch": 3.2268793942671716, + "grad_norm": 1.1169476509094238, + "learning_rate": 7.3132252562776225e-06, + "loss": 1.9002, + "mean_token_accuracy": 0.5508778691291809, + "num_tokens": 6100007802.0, + "step": 11933 + }, + { + "epoch": 3.2271498107084913, + "grad_norm": 1.1265015602111816, + "learning_rate": 7.311787166041953e-06, + "loss": 1.7939, + "mean_token_accuracy": 0.5936048030853271, + "num_tokens": 6100471198.0, + "step": 11934 + }, + { + "epoch": 3.227420227149811, + "grad_norm": 1.0277652740478516, + "learning_rate": 7.3103491889715e-06, + "loss": 1.884, + "mean_token_accuracy": 0.5584188103675842, + "num_tokens": 6100995293.0, + "step": 11935 + }, + { + "epoch": 3.2276906435911306, + "grad_norm": 1.0452954769134521, + "learning_rate": 7.3089113251103836e-06, + "loss": 1.8518, + "mean_token_accuracy": 0.5823732614517212, + "num_tokens": 6101500212.0, + "step": 11936 + }, + { + "epoch": 3.2279610600324498, + "grad_norm": 1.1459805965423584, + "learning_rate": 7.307473574502719e-06, + "loss": 1.7757, + "mean_token_accuracy": 0.5760532021522522, + "num_tokens": 6101971839.0, + "step": 11937 + }, + { + "epoch": 3.2282314764737694, + "grad_norm": 1.1397595405578613, + "learning_rate": 7.306035937192631e-06, + "loss": 2.0189, + "mean_token_accuracy": 0.5591867566108704, + "num_tokens": 6102435636.0, + "step": 11938 + }, + { + "epoch": 3.228501892915089, + "grad_norm": 0.9098040461540222, + "learning_rate": 7.304598413224218e-06, + "loss": 1.8845, + "mean_token_accuracy": 0.5446680188179016, + "num_tokens": 6102959882.0, + "step": 11939 + }, + { + "epoch": 3.2287723093564087, + "grad_norm": 0.9434788823127747, + "learning_rate": 7.3031610026415965e-06, + "loss": 1.777, + "mean_token_accuracy": 0.5744499564170837, + "num_tokens": 6103442450.0, + "step": 11940 + }, + { + "epoch": 3.2290427257977283, + "grad_norm": 0.3654935359954834, + "learning_rate": 7.301723705488866e-06, + "loss": 1.065, + "mean_token_accuracy": 0.7076691389083862, + "num_tokens": 6103966682.0, + "step": 11941 + }, + { + "epoch": 3.229313142239048, + "grad_norm": 1.3154877424240112, + "learning_rate": 7.300286521810127e-06, + "loss": 1.8123, + "mean_token_accuracy": 0.5891985893249512, + "num_tokens": 6104418814.0, + "step": 11942 + }, + { + "epoch": 3.2295835586803676, + "grad_norm": 1.0199298858642578, + "learning_rate": 7.298849451649479e-06, + "loss": 1.9202, + "mean_token_accuracy": 0.5553111433982849, + "num_tokens": 6104942973.0, + "step": 11943 + }, + { + "epoch": 3.2298539751216873, + "grad_norm": 0.9592680335044861, + "learning_rate": 7.297412495051015e-06, + "loss": 1.9553, + "mean_token_accuracy": 0.559101939201355, + "num_tokens": 6105467226.0, + "step": 11944 + }, + { + "epoch": 3.230124391563007, + "grad_norm": 1.1356338262557983, + "learning_rate": 7.29597565205882e-06, + "loss": 1.8212, + "mean_token_accuracy": 0.5814084410667419, + "num_tokens": 6105991445.0, + "step": 11945 + }, + { + "epoch": 3.2303948080043265, + "grad_norm": 1.118899941444397, + "learning_rate": 7.294538922716989e-06, + "loss": 1.7844, + "mean_token_accuracy": 0.5665574073791504, + "num_tokens": 6106515566.0, + "step": 11946 + }, + { + "epoch": 3.230665224445646, + "grad_norm": 1.0325533151626587, + "learning_rate": 7.293102307069601e-06, + "loss": 1.8505, + "mean_token_accuracy": 0.5873144865036011, + "num_tokens": 6107039741.0, + "step": 11947 + }, + { + "epoch": 3.230935640886966, + "grad_norm": 0.9521201848983765, + "learning_rate": 7.2916658051607325e-06, + "loss": 1.819, + "mean_token_accuracy": 0.5884256362915039, + "num_tokens": 6107544112.0, + "step": 11948 + }, + { + "epoch": 3.2312060573282855, + "grad_norm": 1.2174320220947266, + "learning_rate": 7.290229417034467e-06, + "loss": 1.7577, + "mean_token_accuracy": 0.5979624390602112, + "num_tokens": 6108068282.0, + "step": 11949 + }, + { + "epoch": 3.231476473769605, + "grad_norm": 1.0582078695297241, + "learning_rate": 7.288793142734871e-06, + "loss": 1.7634, + "mean_token_accuracy": 0.5991661548614502, + "num_tokens": 6108557302.0, + "step": 11950 + }, + { + "epoch": 3.2317468902109248, + "grad_norm": 0.9938032031059265, + "learning_rate": 7.2873569823060134e-06, + "loss": 1.9302, + "mean_token_accuracy": 0.5749551653862, + "num_tokens": 6109044873.0, + "step": 11951 + }, + { + "epoch": 3.2320173066522444, + "grad_norm": 1.061375379562378, + "learning_rate": 7.285920935791963e-06, + "loss": 1.8578, + "mean_token_accuracy": 0.5736895799636841, + "num_tokens": 6109569040.0, + "step": 11952 + }, + { + "epoch": 3.232287723093564, + "grad_norm": 1.0112882852554321, + "learning_rate": 7.284485003236783e-06, + "loss": 1.9154, + "mean_token_accuracy": 0.5561311841011047, + "num_tokens": 6110093267.0, + "step": 11953 + }, + { + "epoch": 3.2325581395348837, + "grad_norm": 1.0833938121795654, + "learning_rate": 7.283049184684523e-06, + "loss": 2.0615, + "mean_token_accuracy": 0.5418046712875366, + "num_tokens": 6110591426.0, + "step": 11954 + }, + { + "epoch": 3.2328285559762033, + "grad_norm": 0.9343193173408508, + "learning_rate": 7.281613480179253e-06, + "loss": 1.8708, + "mean_token_accuracy": 0.5580371618270874, + "num_tokens": 6111115676.0, + "step": 11955 + }, + { + "epoch": 3.233098972417523, + "grad_norm": 0.9400226473808289, + "learning_rate": 7.280177889765012e-06, + "loss": 1.9557, + "mean_token_accuracy": 0.5586049556732178, + "num_tokens": 6111639947.0, + "step": 11956 + }, + { + "epoch": 3.2333693888588426, + "grad_norm": 0.9773380756378174, + "learning_rate": 7.278742413485851e-06, + "loss": 1.7283, + "mean_token_accuracy": 0.5937997698783875, + "num_tokens": 6112164226.0, + "step": 11957 + }, + { + "epoch": 3.2336398053001623, + "grad_norm": 1.047837734222412, + "learning_rate": 7.27730705138582e-06, + "loss": 1.9111, + "mean_token_accuracy": 0.5569191575050354, + "num_tokens": 6112688424.0, + "step": 11958 + }, + { + "epoch": 3.233910221741482, + "grad_norm": 0.8870515823364258, + "learning_rate": 7.275871803508956e-06, + "loss": 1.8078, + "mean_token_accuracy": 0.545448899269104, + "num_tokens": 6113212701.0, + "step": 11959 + }, + { + "epoch": 3.2341806381828015, + "grad_norm": 1.1781586408615112, + "learning_rate": 7.274436669899294e-06, + "loss": 1.9679, + "mean_token_accuracy": 0.551056981086731, + "num_tokens": 6113693938.0, + "step": 11960 + }, + { + "epoch": 3.234451054624121, + "grad_norm": 8.282720565795898, + "learning_rate": 7.273001650600875e-06, + "loss": 1.0348, + "mean_token_accuracy": 0.7080808877944946, + "num_tokens": 6114218148.0, + "step": 11961 + }, + { + "epoch": 3.234721471065441, + "grad_norm": 1.4093769788742065, + "learning_rate": 7.271566745657721e-06, + "loss": 1.9212, + "mean_token_accuracy": 0.5623297095298767, + "num_tokens": 6114742417.0, + "step": 11962 + }, + { + "epoch": 3.2349918875067605, + "grad_norm": 1.4177407026290894, + "learning_rate": 7.270131955113866e-06, + "loss": 1.889, + "mean_token_accuracy": 0.5572762489318848, + "num_tokens": 6115266589.0, + "step": 11963 + }, + { + "epoch": 3.23526230394808, + "grad_norm": 1.0466737747192383, + "learning_rate": 7.268697279013334e-06, + "loss": 1.9709, + "mean_token_accuracy": 0.5375732183456421, + "num_tokens": 6115790847.0, + "step": 11964 + }, + { + "epoch": 3.2355327203893998, + "grad_norm": 1.3475816249847412, + "learning_rate": 7.267262717400138e-06, + "loss": 1.9095, + "mean_token_accuracy": 0.5677914023399353, + "num_tokens": 6116315117.0, + "step": 11965 + }, + { + "epoch": 3.2358031368307194, + "grad_norm": 0.9201609492301941, + "learning_rate": 7.265828270318305e-06, + "loss": 1.8894, + "mean_token_accuracy": 0.5617749691009521, + "num_tokens": 6116839323.0, + "step": 11966 + }, + { + "epoch": 3.236073553272039, + "grad_norm": 1.1539942026138306, + "learning_rate": 7.26439393781184e-06, + "loss": 1.9981, + "mean_token_accuracy": 0.5460155010223389, + "num_tokens": 6117363527.0, + "step": 11967 + }, + { + "epoch": 3.2363439697133587, + "grad_norm": 0.9975171089172363, + "learning_rate": 7.2629597199247515e-06, + "loss": 1.8131, + "mean_token_accuracy": 0.5636688470840454, + "num_tokens": 6117887793.0, + "step": 11968 + }, + { + "epoch": 3.2366143861546783, + "grad_norm": 0.9073588848114014, + "learning_rate": 7.261525616701053e-06, + "loss": 1.8025, + "mean_token_accuracy": 0.5673897862434387, + "num_tokens": 6118412022.0, + "step": 11969 + }, + { + "epoch": 3.236884802595998, + "grad_norm": 1.006671667098999, + "learning_rate": 7.26009162818474e-06, + "loss": 1.7707, + "mean_token_accuracy": 0.5802838802337646, + "num_tokens": 6118936300.0, + "step": 11970 + }, + { + "epoch": 3.2371552190373176, + "grad_norm": 1.0196974277496338, + "learning_rate": 7.258657754419811e-06, + "loss": 1.8691, + "mean_token_accuracy": 0.5638998746871948, + "num_tokens": 6119460447.0, + "step": 11971 + }, + { + "epoch": 3.2374256354786373, + "grad_norm": 1.1616591215133667, + "learning_rate": 7.257223995450271e-06, + "loss": 1.6546, + "mean_token_accuracy": 0.6050923466682434, + "num_tokens": 6119984719.0, + "step": 11972 + }, + { + "epoch": 3.237696051919957, + "grad_norm": 0.9626056551933289, + "learning_rate": 7.255790351320104e-06, + "loss": 1.7913, + "mean_token_accuracy": 0.5895713567733765, + "num_tokens": 6120508898.0, + "step": 11973 + }, + { + "epoch": 3.2379664683612766, + "grad_norm": 0.9284655451774597, + "learning_rate": 7.254356822073296e-06, + "loss": 1.8415, + "mean_token_accuracy": 0.5655173063278198, + "num_tokens": 6120987800.0, + "step": 11974 + }, + { + "epoch": 3.238236884802596, + "grad_norm": 0.9602915048599243, + "learning_rate": 7.252923407753843e-06, + "loss": 1.6318, + "mean_token_accuracy": 0.6296775341033936, + "num_tokens": 6121511925.0, + "step": 11975 + }, + { + "epoch": 3.238507301243916, + "grad_norm": 0.9466050267219543, + "learning_rate": 7.2514901084057144e-06, + "loss": 1.834, + "mean_token_accuracy": 0.5681969523429871, + "num_tokens": 6122036190.0, + "step": 11976 + }, + { + "epoch": 3.2387777176852355, + "grad_norm": 1.0190876722335815, + "learning_rate": 7.2500569240728925e-06, + "loss": 1.8742, + "mean_token_accuracy": 0.5820811986923218, + "num_tokens": 6122500494.0, + "step": 11977 + }, + { + "epoch": 3.2390481341265547, + "grad_norm": 0.8591974377632141, + "learning_rate": 7.248623854799355e-06, + "loss": 1.822, + "mean_token_accuracy": 0.5674638152122498, + "num_tokens": 6123024774.0, + "step": 11978 + }, + { + "epoch": 3.2393185505678743, + "grad_norm": 0.9944599270820618, + "learning_rate": 7.247190900629066e-06, + "loss": 1.832, + "mean_token_accuracy": 0.5585722923278809, + "num_tokens": 6123549041.0, + "step": 11979 + }, + { + "epoch": 3.239588967009194, + "grad_norm": 1.2176151275634766, + "learning_rate": 7.245758061606001e-06, + "loss": 1.8875, + "mean_token_accuracy": 0.5745599269866943, + "num_tokens": 6124073310.0, + "step": 11980 + }, + { + "epoch": 3.2398593834505136, + "grad_norm": 0.3837212026119232, + "learning_rate": 7.244325337774114e-06, + "loss": 1.0605, + "mean_token_accuracy": 0.7165435552597046, + "num_tokens": 6124597414.0, + "step": 11981 + }, + { + "epoch": 3.2401297998918333, + "grad_norm": 1.1590664386749268, + "learning_rate": 7.242892729177373e-06, + "loss": 1.67, + "mean_token_accuracy": 0.6116559505462646, + "num_tokens": 6125111510.0, + "step": 11982 + }, + { + "epoch": 3.240400216333153, + "grad_norm": 1.2189185619354248, + "learning_rate": 7.2414602358597316e-06, + "loss": 1.8797, + "mean_token_accuracy": 0.576643705368042, + "num_tokens": 6125635665.0, + "step": 11983 + }, + { + "epoch": 3.2406706327744725, + "grad_norm": 1.1102629899978638, + "learning_rate": 7.240027857865144e-06, + "loss": 1.744, + "mean_token_accuracy": 0.5805885195732117, + "num_tokens": 6126159816.0, + "step": 11984 + }, + { + "epoch": 3.240941049215792, + "grad_norm": 0.9745904207229614, + "learning_rate": 7.238595595237556e-06, + "loss": 1.8623, + "mean_token_accuracy": 0.5698274374008179, + "num_tokens": 6126684073.0, + "step": 11985 + }, + { + "epoch": 3.241211465657112, + "grad_norm": 1.0705984830856323, + "learning_rate": 7.237163448020923e-06, + "loss": 1.906, + "mean_token_accuracy": 0.5527727007865906, + "num_tokens": 6127208354.0, + "step": 11986 + }, + { + "epoch": 3.2414818820984315, + "grad_norm": 1.3456324338912964, + "learning_rate": 7.23573141625918e-06, + "loss": 1.9018, + "mean_token_accuracy": 0.5684933066368103, + "num_tokens": 6127732638.0, + "step": 11987 + }, + { + "epoch": 3.241752298539751, + "grad_norm": 1.144118309020996, + "learning_rate": 7.23429949999626e-06, + "loss": 1.8956, + "mean_token_accuracy": 0.5726875066757202, + "num_tokens": 6128256839.0, + "step": 11988 + }, + { + "epoch": 3.2420227149810708, + "grad_norm": 1.3602676391601562, + "learning_rate": 7.232867699276111e-06, + "loss": 1.8517, + "mean_token_accuracy": 0.5802909135818481, + "num_tokens": 6128722701.0, + "step": 11989 + }, + { + "epoch": 3.2422931314223904, + "grad_norm": 1.0684995651245117, + "learning_rate": 7.2314360141426635e-06, + "loss": 1.852, + "mean_token_accuracy": 0.5684547424316406, + "num_tokens": 6129246923.0, + "step": 11990 + }, + { + "epoch": 3.24256354786371, + "grad_norm": 1.0123707056045532, + "learning_rate": 7.230004444639833e-06, + "loss": 1.78, + "mean_token_accuracy": 0.5893869996070862, + "num_tokens": 6129771115.0, + "step": 11991 + }, + { + "epoch": 3.2428339643050297, + "grad_norm": 1.4254133701324463, + "learning_rate": 7.22857299081156e-06, + "loss": 1.9593, + "mean_token_accuracy": 0.5223774909973145, + "num_tokens": 6130283219.0, + "step": 11992 + }, + { + "epoch": 3.2431043807463493, + "grad_norm": 0.9287592172622681, + "learning_rate": 7.227141652701757e-06, + "loss": 1.953, + "mean_token_accuracy": 0.566325306892395, + "num_tokens": 6130749050.0, + "step": 11993 + }, + { + "epoch": 3.243374797187669, + "grad_norm": 1.214436411857605, + "learning_rate": 7.22571043035434e-06, + "loss": 1.8468, + "mean_token_accuracy": 0.5668131113052368, + "num_tokens": 6131273118.0, + "step": 11994 + }, + { + "epoch": 3.2436452136289886, + "grad_norm": 1.090694785118103, + "learning_rate": 7.224279323813226e-06, + "loss": 1.9238, + "mean_token_accuracy": 0.5663948059082031, + "num_tokens": 6131797153.0, + "step": 11995 + }, + { + "epoch": 3.2439156300703083, + "grad_norm": 1.033806324005127, + "learning_rate": 7.22284833312233e-06, + "loss": 1.7446, + "mean_token_accuracy": 0.5833473801612854, + "num_tokens": 6132321275.0, + "step": 11996 + }, + { + "epoch": 3.244186046511628, + "grad_norm": 1.088897943496704, + "learning_rate": 7.22141745832555e-06, + "loss": 1.9153, + "mean_token_accuracy": 0.5681861639022827, + "num_tokens": 6132845489.0, + "step": 11997 + }, + { + "epoch": 3.2444564629529475, + "grad_norm": 1.114950180053711, + "learning_rate": 7.219986699466801e-06, + "loss": 1.8372, + "mean_token_accuracy": 0.5646395683288574, + "num_tokens": 6133369707.0, + "step": 11998 + }, + { + "epoch": 3.244726879394267, + "grad_norm": 1.0099273920059204, + "learning_rate": 7.218556056589973e-06, + "loss": 1.7958, + "mean_token_accuracy": 0.5770381689071655, + "num_tokens": 6133893871.0, + "step": 11999 + }, + { + "epoch": 3.244997295835587, + "grad_norm": 1.1160082817077637, + "learning_rate": 7.2171255297389664e-06, + "loss": 1.7614, + "mean_token_accuracy": 0.5833960175514221, + "num_tokens": 6134418122.0, + "step": 12000 + }, + { + "epoch": 3.2452677122769065, + "grad_norm": 2.1417622566223145, + "learning_rate": 7.215695118957673e-06, + "loss": 1.0001, + "mean_token_accuracy": 0.7374378442764282, + "num_tokens": 6134942395.0, + "step": 12001 + }, + { + "epoch": 3.245538128718226, + "grad_norm": 1.3308593034744263, + "learning_rate": 7.2142648242899826e-06, + "loss": 1.8262, + "mean_token_accuracy": 0.5689661502838135, + "num_tokens": 6135466619.0, + "step": 12002 + }, + { + "epoch": 3.2458085451595458, + "grad_norm": 1.419214129447937, + "learning_rate": 7.21283464577978e-06, + "loss": 1.8437, + "mean_token_accuracy": 0.578230619430542, + "num_tokens": 6135989897.0, + "step": 12003 + }, + { + "epoch": 3.2460789616008654, + "grad_norm": 1.205595850944519, + "learning_rate": 7.211404583470949e-06, + "loss": 1.9873, + "mean_token_accuracy": 0.5517299771308899, + "num_tokens": 6136514181.0, + "step": 12004 + }, + { + "epoch": 3.246349378042185, + "grad_norm": 1.216759204864502, + "learning_rate": 7.209974637407367e-06, + "loss": 1.8643, + "mean_token_accuracy": 0.5725771188735962, + "num_tokens": 6136928399.0, + "step": 12005 + }, + { + "epoch": 3.2466197944835047, + "grad_norm": 1.3368853330612183, + "learning_rate": 7.208544807632905e-06, + "loss": 1.873, + "mean_token_accuracy": 0.5756229162216187, + "num_tokens": 6137415181.0, + "step": 12006 + }, + { + "epoch": 3.2468902109248243, + "grad_norm": 1.013541340827942, + "learning_rate": 7.207115094191441e-06, + "loss": 1.7914, + "mean_token_accuracy": 0.5801929235458374, + "num_tokens": 6137939368.0, + "step": 12007 + }, + { + "epoch": 3.247160627366144, + "grad_norm": 1.0100045204162598, + "learning_rate": 7.2056854971268395e-06, + "loss": 1.8445, + "mean_token_accuracy": 0.5798255205154419, + "num_tokens": 6138463639.0, + "step": 12008 + }, + { + "epoch": 3.2474310438074636, + "grad_norm": 1.0292948484420776, + "learning_rate": 7.204256016482967e-06, + "loss": 1.9565, + "mean_token_accuracy": 0.5599857568740845, + "num_tokens": 6138987834.0, + "step": 12009 + }, + { + "epoch": 3.2477014602487833, + "grad_norm": 1.1444759368896484, + "learning_rate": 7.202826652303679e-06, + "loss": 1.8656, + "mean_token_accuracy": 0.5797871351242065, + "num_tokens": 6139501305.0, + "step": 12010 + }, + { + "epoch": 3.247971876690103, + "grad_norm": 1.292002558708191, + "learning_rate": 7.2013974046328375e-06, + "loss": 1.9265, + "mean_token_accuracy": 0.5646653175354004, + "num_tokens": 6140015059.0, + "step": 12011 + }, + { + "epoch": 3.2482422931314225, + "grad_norm": 1.0182644128799438, + "learning_rate": 7.199968273514295e-06, + "loss": 1.8006, + "mean_token_accuracy": 0.5682222247123718, + "num_tokens": 6140539178.0, + "step": 12012 + }, + { + "epoch": 3.248512709572742, + "grad_norm": 1.064517617225647, + "learning_rate": 7.198539258991903e-06, + "loss": 1.7802, + "mean_token_accuracy": 0.6008133292198181, + "num_tokens": 6141061781.0, + "step": 12013 + }, + { + "epoch": 3.248783126014062, + "grad_norm": 1.1982382535934448, + "learning_rate": 7.197110361109503e-06, + "loss": 1.9612, + "mean_token_accuracy": 0.5698714256286621, + "num_tokens": 6141540905.0, + "step": 12014 + }, + { + "epoch": 3.2490535424553815, + "grad_norm": 0.957883358001709, + "learning_rate": 7.195681579910942e-06, + "loss": 1.8855, + "mean_token_accuracy": 0.556998074054718, + "num_tokens": 6142065079.0, + "step": 12015 + }, + { + "epoch": 3.249323958896701, + "grad_norm": 1.1382521390914917, + "learning_rate": 7.194252915440061e-06, + "loss": 1.7672, + "mean_token_accuracy": 0.5900167226791382, + "num_tokens": 6142589230.0, + "step": 12016 + }, + { + "epoch": 3.2495943753380208, + "grad_norm": 1.0698872804641724, + "learning_rate": 7.192824367740687e-06, + "loss": 1.8684, + "mean_token_accuracy": 0.5796303749084473, + "num_tokens": 6143113444.0, + "step": 12017 + }, + { + "epoch": 3.2498647917793404, + "grad_norm": 1.0299971103668213, + "learning_rate": 7.19139593685666e-06, + "loss": 1.8498, + "mean_token_accuracy": 0.5767136812210083, + "num_tokens": 6143637708.0, + "step": 12018 + }, + { + "epoch": 3.2501352082206596, + "grad_norm": 1.1976498365402222, + "learning_rate": 7.189967622831809e-06, + "loss": 1.7092, + "mean_token_accuracy": 0.5868422389030457, + "num_tokens": 6144161943.0, + "step": 12019 + }, + { + "epoch": 3.2504056246619797, + "grad_norm": 1.1291593313217163, + "learning_rate": 7.188539425709951e-06, + "loss": 1.9214, + "mean_token_accuracy": 0.5572338104248047, + "num_tokens": 6144686218.0, + "step": 12020 + }, + { + "epoch": 3.250676041103299, + "grad_norm": 0.4283568263053894, + "learning_rate": 7.187111345534916e-06, + "loss": 1.1402, + "mean_token_accuracy": 0.6956998109817505, + "num_tokens": 6145210408.0, + "step": 12021 + }, + { + "epoch": 3.2509464575446185, + "grad_norm": 1.1140174865722656, + "learning_rate": 7.185683382350512e-06, + "loss": 1.8736, + "mean_token_accuracy": 0.5522953271865845, + "num_tokens": 6145734604.0, + "step": 12022 + }, + { + "epoch": 3.251216873985938, + "grad_norm": 1.0789936780929565, + "learning_rate": 7.184255536200566e-06, + "loss": 1.8303, + "mean_token_accuracy": 0.5944880843162537, + "num_tokens": 6146196501.0, + "step": 12023 + }, + { + "epoch": 3.251487290427258, + "grad_norm": 0.9863090515136719, + "learning_rate": 7.182827807128876e-06, + "loss": 1.849, + "mean_token_accuracy": 0.5792111158370972, + "num_tokens": 6146706271.0, + "step": 12024 + }, + { + "epoch": 3.2517577068685775, + "grad_norm": 0.8224120736122131, + "learning_rate": 7.181400195179258e-06, + "loss": 1.8725, + "mean_token_accuracy": 0.5692193508148193, + "num_tokens": 6147230550.0, + "step": 12025 + }, + { + "epoch": 3.252028123309897, + "grad_norm": 0.8441934585571289, + "learning_rate": 7.179972700395508e-06, + "loss": 1.7202, + "mean_token_accuracy": 0.6021977663040161, + "num_tokens": 6147704131.0, + "step": 12026 + }, + { + "epoch": 3.2522985397512167, + "grad_norm": 1.1631474494934082, + "learning_rate": 7.1785453228214354e-06, + "loss": 1.9487, + "mean_token_accuracy": 0.5647502541542053, + "num_tokens": 6148215611.0, + "step": 12027 + }, + { + "epoch": 3.2525689561925364, + "grad_norm": 0.9074471592903137, + "learning_rate": 7.177118062500828e-06, + "loss": 1.8517, + "mean_token_accuracy": 0.5746161937713623, + "num_tokens": 6148739894.0, + "step": 12028 + }, + { + "epoch": 3.252839372633856, + "grad_norm": 1.115182876586914, + "learning_rate": 7.175690919477478e-06, + "loss": 1.8591, + "mean_token_accuracy": 0.5734629034996033, + "num_tokens": 6149264094.0, + "step": 12029 + }, + { + "epoch": 3.2531097890751757, + "grad_norm": 0.8900560140609741, + "learning_rate": 7.1742638937951796e-06, + "loss": 1.6714, + "mean_token_accuracy": 0.5865015387535095, + "num_tokens": 6149776117.0, + "step": 12030 + }, + { + "epoch": 3.2533802055164953, + "grad_norm": 0.9075934290885925, + "learning_rate": 7.172836985497717e-06, + "loss": 1.7628, + "mean_token_accuracy": 0.5978740453720093, + "num_tokens": 6150239596.0, + "step": 12031 + }, + { + "epoch": 3.253650621957815, + "grad_norm": 1.0125867128372192, + "learning_rate": 7.171410194628867e-06, + "loss": 1.8384, + "mean_token_accuracy": 0.5860651135444641, + "num_tokens": 6150651195.0, + "step": 12032 + }, + { + "epoch": 3.2539210383991346, + "grad_norm": 1.2857775688171387, + "learning_rate": 7.1699835212324155e-06, + "loss": 1.8963, + "mean_token_accuracy": 0.5651994347572327, + "num_tokens": 6151175440.0, + "step": 12033 + }, + { + "epoch": 3.2541914548404542, + "grad_norm": 1.1303991079330444, + "learning_rate": 7.168556965352129e-06, + "loss": 1.8719, + "mean_token_accuracy": 0.5692525506019592, + "num_tokens": 6151699554.0, + "step": 12034 + }, + { + "epoch": 3.254461871281774, + "grad_norm": 4.480858325958252, + "learning_rate": 7.167130527031785e-06, + "loss": 1.6556, + "mean_token_accuracy": 0.6101282835006714, + "num_tokens": 6152212735.0, + "step": 12035 + }, + { + "epoch": 3.2547322877230935, + "grad_norm": 1.2957651615142822, + "learning_rate": 7.16570420631515e-06, + "loss": 1.7877, + "mean_token_accuracy": 0.5847120881080627, + "num_tokens": 6152682907.0, + "step": 12036 + }, + { + "epoch": 3.255002704164413, + "grad_norm": 1.4366042613983154, + "learning_rate": 7.164278003245979e-06, + "loss": 1.9848, + "mean_token_accuracy": 0.5519973635673523, + "num_tokens": 6153183471.0, + "step": 12037 + }, + { + "epoch": 3.255273120605733, + "grad_norm": 1.1657084226608276, + "learning_rate": 7.162851917868043e-06, + "loss": 1.9578, + "mean_token_accuracy": 0.5462297797203064, + "num_tokens": 6153666535.0, + "step": 12038 + }, + { + "epoch": 3.2555435370470525, + "grad_norm": 1.0690068006515503, + "learning_rate": 7.161425950225096e-06, + "loss": 1.7861, + "mean_token_accuracy": 0.592033326625824, + "num_tokens": 6154164272.0, + "step": 12039 + }, + { + "epoch": 3.255813953488372, + "grad_norm": 1.3645915985107422, + "learning_rate": 7.160000100360887e-06, + "loss": 1.777, + "mean_token_accuracy": 0.5782168507575989, + "num_tokens": 6154642279.0, + "step": 12040 + }, + { + "epoch": 3.2560843699296917, + "grad_norm": 0.37197408080101013, + "learning_rate": 7.1585743683191685e-06, + "loss": 0.9674, + "mean_token_accuracy": 0.7382762432098389, + "num_tokens": 6155136558.0, + "step": 12041 + }, + { + "epoch": 3.2563547863710114, + "grad_norm": 1.2601150274276733, + "learning_rate": 7.157148754143688e-06, + "loss": 1.7716, + "mean_token_accuracy": 0.5864274501800537, + "num_tokens": 6155660733.0, + "step": 12042 + }, + { + "epoch": 3.256625202812331, + "grad_norm": 1.1650426387786865, + "learning_rate": 7.15572325787818e-06, + "loss": 1.7963, + "mean_token_accuracy": 0.5752727389335632, + "num_tokens": 6156163091.0, + "step": 12043 + }, + { + "epoch": 3.2568956192536507, + "grad_norm": 1.024746298789978, + "learning_rate": 7.15429787956639e-06, + "loss": 1.9248, + "mean_token_accuracy": 0.5462853908538818, + "num_tokens": 6156687360.0, + "step": 12044 + }, + { + "epoch": 3.2571660356949703, + "grad_norm": 1.143311619758606, + "learning_rate": 7.152872619252054e-06, + "loss": 1.9095, + "mean_token_accuracy": 0.5795857906341553, + "num_tokens": 6157211606.0, + "step": 12045 + }, + { + "epoch": 3.25743645213629, + "grad_norm": 1.0893723964691162, + "learning_rate": 7.151447476978892e-06, + "loss": 1.822, + "mean_token_accuracy": 0.5711898803710938, + "num_tokens": 6157735870.0, + "step": 12046 + }, + { + "epoch": 3.2577068685776096, + "grad_norm": 1.2071216106414795, + "learning_rate": 7.150022452790646e-06, + "loss": 1.8772, + "mean_token_accuracy": 0.5593279004096985, + "num_tokens": 6158260048.0, + "step": 12047 + }, + { + "epoch": 3.2579772850189292, + "grad_norm": 1.1326096057891846, + "learning_rate": 7.148597546731031e-06, + "loss": 1.7349, + "mean_token_accuracy": 0.5886373519897461, + "num_tokens": 6158784253.0, + "step": 12048 + }, + { + "epoch": 3.258247701460249, + "grad_norm": 1.0848281383514404, + "learning_rate": 7.147172758843768e-06, + "loss": 1.8831, + "mean_token_accuracy": 0.5498752593994141, + "num_tokens": 6159308390.0, + "step": 12049 + }, + { + "epoch": 3.2585181179015685, + "grad_norm": 0.949342668056488, + "learning_rate": 7.145748089172579e-06, + "loss": 1.7898, + "mean_token_accuracy": 0.5893505811691284, + "num_tokens": 6159795840.0, + "step": 12050 + }, + { + "epoch": 3.258788534342888, + "grad_norm": 1.0247658491134644, + "learning_rate": 7.144323537761166e-06, + "loss": 1.8293, + "mean_token_accuracy": 0.5805678963661194, + "num_tokens": 6160287045.0, + "step": 12051 + }, + { + "epoch": 3.259058950784208, + "grad_norm": 1.0365943908691406, + "learning_rate": 7.1428991046532496e-06, + "loss": 1.815, + "mean_token_accuracy": 0.566625714302063, + "num_tokens": 6160811252.0, + "step": 12052 + }, + { + "epoch": 3.2593293672255275, + "grad_norm": 1.0089608430862427, + "learning_rate": 7.141474789892533e-06, + "loss": 1.7616, + "mean_token_accuracy": 0.580727219581604, + "num_tokens": 6161335303.0, + "step": 12053 + }, + { + "epoch": 3.259599783666847, + "grad_norm": 0.9866039156913757, + "learning_rate": 7.140050593522718e-06, + "loss": 1.8049, + "mean_token_accuracy": 0.5637160539627075, + "num_tokens": 6161859479.0, + "step": 12054 + }, + { + "epoch": 3.2598702001081667, + "grad_norm": 1.1471226215362549, + "learning_rate": 7.138626515587499e-06, + "loss": 1.8796, + "mean_token_accuracy": 0.5726271271705627, + "num_tokens": 6162383680.0, + "step": 12055 + }, + { + "epoch": 3.2601406165494864, + "grad_norm": 1.2553093433380127, + "learning_rate": 7.137202556130576e-06, + "loss": 1.9148, + "mean_token_accuracy": 0.5628407001495361, + "num_tokens": 6162895348.0, + "step": 12056 + }, + { + "epoch": 3.260411032990806, + "grad_norm": 1.16665780544281, + "learning_rate": 7.135778715195636e-06, + "loss": 1.7917, + "mean_token_accuracy": 0.6051462888717651, + "num_tokens": 6163419542.0, + "step": 12057 + }, + { + "epoch": 3.2606814494321252, + "grad_norm": 0.9146960377693176, + "learning_rate": 7.134354992826373e-06, + "loss": 1.8353, + "mean_token_accuracy": 0.5732851028442383, + "num_tokens": 6163943816.0, + "step": 12058 + }, + { + "epoch": 3.2609518658734453, + "grad_norm": 0.9442288279533386, + "learning_rate": 7.132931389066466e-06, + "loss": 1.8169, + "mean_token_accuracy": 0.5864421725273132, + "num_tokens": 6164467935.0, + "step": 12059 + }, + { + "epoch": 3.2612222823147645, + "grad_norm": 0.9465312957763672, + "learning_rate": 7.131507903959593e-06, + "loss": 1.8656, + "mean_token_accuracy": 0.5464973449707031, + "num_tokens": 6164992187.0, + "step": 12060 + }, + { + "epoch": 3.2614926987560846, + "grad_norm": 0.3929150700569153, + "learning_rate": 7.130084537549438e-06, + "loss": 1.1257, + "mean_token_accuracy": 0.6995428800582886, + "num_tokens": 6165465018.0, + "step": 12061 + }, + { + "epoch": 3.261763115197404, + "grad_norm": 1.268684983253479, + "learning_rate": 7.1286612898796705e-06, + "loss": 1.7731, + "mean_token_accuracy": 0.5908995866775513, + "num_tokens": 6165934797.0, + "step": 12062 + }, + { + "epoch": 3.2620335316387234, + "grad_norm": 1.0210868120193481, + "learning_rate": 7.127238160993955e-06, + "loss": 1.8096, + "mean_token_accuracy": 0.5745493173599243, + "num_tokens": 6166458863.0, + "step": 12063 + }, + { + "epoch": 3.262303948080043, + "grad_norm": 0.9242476224899292, + "learning_rate": 7.125815150935964e-06, + "loss": 1.9356, + "mean_token_accuracy": 0.5740584135055542, + "num_tokens": 6166918566.0, + "step": 12064 + }, + { + "epoch": 3.2625743645213627, + "grad_norm": 0.8866825103759766, + "learning_rate": 7.1243922597493556e-06, + "loss": 1.8465, + "mean_token_accuracy": 0.5736247301101685, + "num_tokens": 6167442640.0, + "step": 12065 + }, + { + "epoch": 3.2628447809626824, + "grad_norm": 1.0885499715805054, + "learning_rate": 7.122969487477793e-06, + "loss": 1.8952, + "mean_token_accuracy": 0.5630742311477661, + "num_tokens": 6167966897.0, + "step": 12066 + }, + { + "epoch": 3.263115197404002, + "grad_norm": 0.9188299179077148, + "learning_rate": 7.1215468341649256e-06, + "loss": 1.8783, + "mean_token_accuracy": 0.5586800575256348, + "num_tokens": 6168491157.0, + "step": 12067 + }, + { + "epoch": 3.2633856138453217, + "grad_norm": 0.9860482811927795, + "learning_rate": 7.12012429985441e-06, + "loss": 1.916, + "mean_token_accuracy": 0.5554612278938293, + "num_tokens": 6169015356.0, + "step": 12068 + }, + { + "epoch": 3.2636560302866413, + "grad_norm": 1.1426829099655151, + "learning_rate": 7.118701884589887e-06, + "loss": 1.8829, + "mean_token_accuracy": 0.560998260974884, + "num_tokens": 6169539364.0, + "step": 12069 + }, + { + "epoch": 3.263926446727961, + "grad_norm": 1.14749014377594, + "learning_rate": 7.117279588415007e-06, + "loss": 1.9348, + "mean_token_accuracy": 0.5512357950210571, + "num_tokens": 6170063640.0, + "step": 12070 + }, + { + "epoch": 3.2641968631692806, + "grad_norm": 1.0045706033706665, + "learning_rate": 7.11585741137341e-06, + "loss": 1.8528, + "mean_token_accuracy": 0.5803841948509216, + "num_tokens": 6170587920.0, + "step": 12071 + }, + { + "epoch": 3.2644672796106002, + "grad_norm": 0.976382315158844, + "learning_rate": 7.1144353535087244e-06, + "loss": 1.9028, + "mean_token_accuracy": 0.5552427768707275, + "num_tokens": 6171112082.0, + "step": 12072 + }, + { + "epoch": 3.26473769605192, + "grad_norm": 1.1403734683990479, + "learning_rate": 7.113013414864594e-06, + "loss": 2.0448, + "mean_token_accuracy": 0.5463981628417969, + "num_tokens": 6171636287.0, + "step": 12073 + }, + { + "epoch": 3.2650081124932395, + "grad_norm": 1.182447075843811, + "learning_rate": 7.111591595484642e-06, + "loss": 1.8146, + "mean_token_accuracy": 0.5832813382148743, + "num_tokens": 6172158368.0, + "step": 12074 + }, + { + "epoch": 3.265278528934559, + "grad_norm": 0.8892958760261536, + "learning_rate": 7.1101698954124905e-06, + "loss": 1.8392, + "mean_token_accuracy": 0.5733476281166077, + "num_tokens": 6172682557.0, + "step": 12075 + }, + { + "epoch": 3.265548945375879, + "grad_norm": 1.038323998451233, + "learning_rate": 7.108748314691771e-06, + "loss": 1.8516, + "mean_token_accuracy": 0.5700522065162659, + "num_tokens": 6173163079.0, + "step": 12076 + }, + { + "epoch": 3.2658193618171985, + "grad_norm": 1.1247888803482056, + "learning_rate": 7.1073268533660965e-06, + "loss": 1.7275, + "mean_token_accuracy": 0.602924108505249, + "num_tokens": 6173687148.0, + "step": 12077 + }, + { + "epoch": 3.266089778258518, + "grad_norm": 1.141090750694275, + "learning_rate": 7.105905511479076e-06, + "loss": 1.9217, + "mean_token_accuracy": 0.5669734477996826, + "num_tokens": 6174211423.0, + "step": 12078 + }, + { + "epoch": 3.2663601946998377, + "grad_norm": 1.0167863368988037, + "learning_rate": 7.104484289074329e-06, + "loss": 1.8155, + "mean_token_accuracy": 0.5815508365631104, + "num_tokens": 6174735694.0, + "step": 12079 + }, + { + "epoch": 3.2666306111411574, + "grad_norm": 0.8969597220420837, + "learning_rate": 7.103063186195462e-06, + "loss": 1.7989, + "mean_token_accuracy": 0.5763875842094421, + "num_tokens": 6175257546.0, + "step": 12080 + }, + { + "epoch": 3.266901027582477, + "grad_norm": 0.3747466206550598, + "learning_rate": 7.101642202886075e-06, + "loss": 1.1412, + "mean_token_accuracy": 0.6883505582809448, + "num_tokens": 6175781789.0, + "step": 12081 + }, + { + "epoch": 3.2671714440237967, + "grad_norm": 2.139211893081665, + "learning_rate": 7.1002213391897725e-06, + "loss": 1.8377, + "mean_token_accuracy": 0.5932153463363647, + "num_tokens": 6176280249.0, + "step": 12082 + }, + { + "epoch": 3.2674418604651163, + "grad_norm": 1.1903464794158936, + "learning_rate": 7.098800595150142e-06, + "loss": 1.8578, + "mean_token_accuracy": 0.5799564123153687, + "num_tokens": 6176804504.0, + "step": 12083 + }, + { + "epoch": 3.267712276906436, + "grad_norm": 1.1554145812988281, + "learning_rate": 7.097379970810787e-06, + "loss": 1.8511, + "mean_token_accuracy": 0.5504468083381653, + "num_tokens": 6177328749.0, + "step": 12084 + }, + { + "epoch": 3.2679826933477556, + "grad_norm": 1.1053305864334106, + "learning_rate": 7.09595946621529e-06, + "loss": 1.8994, + "mean_token_accuracy": 0.5747147798538208, + "num_tokens": 6177852925.0, + "step": 12085 + }, + { + "epoch": 3.2682531097890752, + "grad_norm": 1.0358024835586548, + "learning_rate": 7.094539081407237e-06, + "loss": 1.9242, + "mean_token_accuracy": 0.5615363121032715, + "num_tokens": 6178377197.0, + "step": 12086 + }, + { + "epoch": 3.268523526230395, + "grad_norm": 0.9708324670791626, + "learning_rate": 7.093118816430211e-06, + "loss": 1.9138, + "mean_token_accuracy": 0.5463979840278625, + "num_tokens": 6178901416.0, + "step": 12087 + }, + { + "epoch": 3.2687939426717145, + "grad_norm": 1.0062153339385986, + "learning_rate": 7.091698671327789e-06, + "loss": 1.8473, + "mean_token_accuracy": 0.579166054725647, + "num_tokens": 6179425586.0, + "step": 12088 + }, + { + "epoch": 3.269064359113034, + "grad_norm": 1.045971155166626, + "learning_rate": 7.0902786461435445e-06, + "loss": 1.9322, + "mean_token_accuracy": 0.5574081540107727, + "num_tokens": 6179949749.0, + "step": 12089 + }, + { + "epoch": 3.269334775554354, + "grad_norm": 1.0132397413253784, + "learning_rate": 7.08885874092105e-06, + "loss": 1.8655, + "mean_token_accuracy": 0.5724290609359741, + "num_tokens": 6180456705.0, + "step": 12090 + }, + { + "epoch": 3.2696051919956735, + "grad_norm": 1.077670693397522, + "learning_rate": 7.0874389557038734e-06, + "loss": 1.9365, + "mean_token_accuracy": 0.5594933032989502, + "num_tokens": 6180977814.0, + "step": 12091 + }, + { + "epoch": 3.269875608436993, + "grad_norm": 0.9061880707740784, + "learning_rate": 7.08601929053557e-06, + "loss": 1.7728, + "mean_token_accuracy": 0.5896830558776855, + "num_tokens": 6181502032.0, + "step": 12092 + }, + { + "epoch": 3.2701460248783127, + "grad_norm": 0.9461543560028076, + "learning_rate": 7.084599745459708e-06, + "loss": 1.9557, + "mean_token_accuracy": 0.5592375993728638, + "num_tokens": 6182026311.0, + "step": 12093 + }, + { + "epoch": 3.2704164413196324, + "grad_norm": 0.8768736720085144, + "learning_rate": 7.08318032051984e-06, + "loss": 1.9239, + "mean_token_accuracy": 0.5552209615707397, + "num_tokens": 6182550588.0, + "step": 12094 + }, + { + "epoch": 3.270686857760952, + "grad_norm": 1.1050463914871216, + "learning_rate": 7.081761015759516e-06, + "loss": 1.9258, + "mean_token_accuracy": 0.5589344501495361, + "num_tokens": 6183074729.0, + "step": 12095 + }, + { + "epoch": 3.2709572742022717, + "grad_norm": 0.9797360897064209, + "learning_rate": 7.080341831222292e-06, + "loss": 1.9194, + "mean_token_accuracy": 0.5805752277374268, + "num_tokens": 6183536395.0, + "step": 12096 + }, + { + "epoch": 3.2712276906435913, + "grad_norm": 0.9857171177864075, + "learning_rate": 7.078922766951706e-06, + "loss": 1.9519, + "mean_token_accuracy": 0.5569843649864197, + "num_tokens": 6184060577.0, + "step": 12097 + }, + { + "epoch": 3.271498107084911, + "grad_norm": 0.8718975782394409, + "learning_rate": 7.077503822991296e-06, + "loss": 1.815, + "mean_token_accuracy": 0.577889084815979, + "num_tokens": 6184584753.0, + "step": 12098 + }, + { + "epoch": 3.27176852352623, + "grad_norm": 0.9745575785636902, + "learning_rate": 7.076084999384609e-06, + "loss": 1.8315, + "mean_token_accuracy": 0.5850436687469482, + "num_tokens": 6185088035.0, + "step": 12099 + }, + { + "epoch": 3.2720389399675502, + "grad_norm": 0.994234561920166, + "learning_rate": 7.0746662961751765e-06, + "loss": 1.6538, + "mean_token_accuracy": 0.6152664422988892, + "num_tokens": 6185612200.0, + "step": 12100 + }, + { + "epoch": 3.2723093564088694, + "grad_norm": 0.4050350785255432, + "learning_rate": 7.073247713406519e-06, + "loss": 1.0767, + "mean_token_accuracy": 0.7203385829925537, + "num_tokens": 6186073220.0, + "step": 12101 + }, + { + "epoch": 3.2725797728501895, + "grad_norm": 1.1022939682006836, + "learning_rate": 7.071829251122173e-06, + "loss": 1.8673, + "mean_token_accuracy": 0.5766730308532715, + "num_tokens": 6186561795.0, + "step": 12102 + }, + { + "epoch": 3.2728501892915087, + "grad_norm": 1.016408085823059, + "learning_rate": 7.070410909365653e-06, + "loss": 1.8449, + "mean_token_accuracy": 0.5731370449066162, + "num_tokens": 6187085826.0, + "step": 12103 + }, + { + "epoch": 3.2731206057328284, + "grad_norm": 1.1754180192947388, + "learning_rate": 7.068992688180489e-06, + "loss": 1.9006, + "mean_token_accuracy": 0.5705792903900146, + "num_tokens": 6187610100.0, + "step": 12104 + }, + { + "epoch": 3.273391022174148, + "grad_norm": 0.878404438495636, + "learning_rate": 7.067574587610188e-06, + "loss": 1.8503, + "mean_token_accuracy": 0.5649464130401611, + "num_tokens": 6188134293.0, + "step": 12105 + }, + { + "epoch": 3.2736614386154677, + "grad_norm": 1.0406111478805542, + "learning_rate": 7.06615660769826e-06, + "loss": 1.9001, + "mean_token_accuracy": 0.5653038620948792, + "num_tokens": 6188658551.0, + "step": 12106 + }, + { + "epoch": 3.2739318550567873, + "grad_norm": 1.2534165382385254, + "learning_rate": 7.06473874848822e-06, + "loss": 1.9989, + "mean_token_accuracy": 0.5451921224594116, + "num_tokens": 6189182727.0, + "step": 12107 + }, + { + "epoch": 3.274202271498107, + "grad_norm": 1.0221785306930542, + "learning_rate": 7.063321010023563e-06, + "loss": 1.8804, + "mean_token_accuracy": 0.5764901041984558, + "num_tokens": 6189661457.0, + "step": 12108 + }, + { + "epoch": 3.2744726879394266, + "grad_norm": 0.9708245992660522, + "learning_rate": 7.061903392347797e-06, + "loss": 1.8645, + "mean_token_accuracy": 0.5794440507888794, + "num_tokens": 6190185734.0, + "step": 12109 + }, + { + "epoch": 3.2747431043807462, + "grad_norm": 1.1036243438720703, + "learning_rate": 7.060485895504416e-06, + "loss": 1.7726, + "mean_token_accuracy": 0.5717277526855469, + "num_tokens": 6190709885.0, + "step": 12110 + }, + { + "epoch": 3.275013520822066, + "grad_norm": 1.0533450841903687, + "learning_rate": 7.059068519536916e-06, + "loss": 1.8262, + "mean_token_accuracy": 0.5751185417175293, + "num_tokens": 6191233961.0, + "step": 12111 + }, + { + "epoch": 3.2752839372633855, + "grad_norm": 0.9629493355751038, + "learning_rate": 7.057651264488778e-06, + "loss": 1.8874, + "mean_token_accuracy": 0.5774227380752563, + "num_tokens": 6191698035.0, + "step": 12112 + }, + { + "epoch": 3.275554353704705, + "grad_norm": 1.1034014225006104, + "learning_rate": 7.056234130403496e-06, + "loss": 1.8909, + "mean_token_accuracy": 0.5694112777709961, + "num_tokens": 6192184061.0, + "step": 12113 + }, + { + "epoch": 3.275824770146025, + "grad_norm": 0.9777024984359741, + "learning_rate": 7.054817117324551e-06, + "loss": 1.8676, + "mean_token_accuracy": 0.5781824588775635, + "num_tokens": 6192671219.0, + "step": 12114 + }, + { + "epoch": 3.2760951865873444, + "grad_norm": 0.9217555522918701, + "learning_rate": 7.053400225295415e-06, + "loss": 1.4901, + "mean_token_accuracy": 0.6369175910949707, + "num_tokens": 6193166322.0, + "step": 12115 + }, + { + "epoch": 3.276365603028664, + "grad_norm": 1.1820132732391357, + "learning_rate": 7.051983454359568e-06, + "loss": 1.8082, + "mean_token_accuracy": 0.571660041809082, + "num_tokens": 6193690598.0, + "step": 12116 + }, + { + "epoch": 3.2766360194699837, + "grad_norm": 0.8744533061981201, + "learning_rate": 7.050566804560481e-06, + "loss": 1.8057, + "mean_token_accuracy": 0.5755406618118286, + "num_tokens": 6194214826.0, + "step": 12117 + }, + { + "epoch": 3.2769064359113034, + "grad_norm": 0.8876656889915466, + "learning_rate": 7.049150275941614e-06, + "loss": 1.8167, + "mean_token_accuracy": 0.5770153999328613, + "num_tokens": 6194739000.0, + "step": 12118 + }, + { + "epoch": 3.277176852352623, + "grad_norm": 1.0034486055374146, + "learning_rate": 7.04773386854644e-06, + "loss": 1.8739, + "mean_token_accuracy": 0.5631740093231201, + "num_tokens": 6195263245.0, + "step": 12119 + }, + { + "epoch": 3.2774472687939427, + "grad_norm": 0.9729138612747192, + "learning_rate": 7.046317582418412e-06, + "loss": 1.8083, + "mean_token_accuracy": 0.5887352228164673, + "num_tokens": 6195787398.0, + "step": 12120 + }, + { + "epoch": 3.2777176852352623, + "grad_norm": 0.4398760497570038, + "learning_rate": 7.0449014176009864e-06, + "loss": 1.1363, + "mean_token_accuracy": 0.6894792318344116, + "num_tokens": 6196311515.0, + "step": 12121 + }, + { + "epoch": 3.277988101676582, + "grad_norm": 1.0177570581436157, + "learning_rate": 7.043485374137615e-06, + "loss": 1.8695, + "mean_token_accuracy": 0.5587525367736816, + "num_tokens": 6196835783.0, + "step": 12122 + }, + { + "epoch": 3.2782585181179016, + "grad_norm": 1.208888292312622, + "learning_rate": 7.042069452071752e-06, + "loss": 1.921, + "mean_token_accuracy": 0.5580769777297974, + "num_tokens": 6197360035.0, + "step": 12123 + }, + { + "epoch": 3.2785289345592212, + "grad_norm": 1.0152802467346191, + "learning_rate": 7.040653651446832e-06, + "loss": 1.904, + "mean_token_accuracy": 0.5664891004562378, + "num_tokens": 6197884239.0, + "step": 12124 + }, + { + "epoch": 3.278799351000541, + "grad_norm": 0.9097805619239807, + "learning_rate": 7.039237972306308e-06, + "loss": 1.8111, + "mean_token_accuracy": 0.5793094635009766, + "num_tokens": 6198408435.0, + "step": 12125 + }, + { + "epoch": 3.2790697674418605, + "grad_norm": 1.2824026346206665, + "learning_rate": 7.037822414693607e-06, + "loss": 1.9513, + "mean_token_accuracy": 0.5556075572967529, + "num_tokens": 6198932592.0, + "step": 12126 + }, + { + "epoch": 3.27934018388318, + "grad_norm": 1.0867544412612915, + "learning_rate": 7.0364069786521686e-06, + "loss": 1.8952, + "mean_token_accuracy": 0.5523136258125305, + "num_tokens": 6199456770.0, + "step": 12127 + }, + { + "epoch": 3.2796106003245, + "grad_norm": 1.0308953523635864, + "learning_rate": 7.034991664225419e-06, + "loss": 1.7781, + "mean_token_accuracy": 0.6012639999389648, + "num_tokens": 6199975150.0, + "step": 12128 + }, + { + "epoch": 3.2798810167658194, + "grad_norm": 1.252955436706543, + "learning_rate": 7.033576471456784e-06, + "loss": 1.9376, + "mean_token_accuracy": 0.5609592199325562, + "num_tokens": 6200470454.0, + "step": 12129 + }, + { + "epoch": 3.280151433207139, + "grad_norm": 1.2953786849975586, + "learning_rate": 7.03216140038969e-06, + "loss": 1.9225, + "mean_token_accuracy": 0.5696026086807251, + "num_tokens": 6200994627.0, + "step": 12130 + }, + { + "epoch": 3.2804218496484587, + "grad_norm": 1.053114891052246, + "learning_rate": 7.030746451067554e-06, + "loss": 1.8356, + "mean_token_accuracy": 0.5806635022163391, + "num_tokens": 6201496018.0, + "step": 12131 + }, + { + "epoch": 3.2806922660897784, + "grad_norm": 0.9655959606170654, + "learning_rate": 7.029331623533784e-06, + "loss": 1.8921, + "mean_token_accuracy": 0.5589190721511841, + "num_tokens": 6202016530.0, + "step": 12132 + }, + { + "epoch": 3.280962682531098, + "grad_norm": 1.0233540534973145, + "learning_rate": 7.027916917831803e-06, + "loss": 1.8313, + "mean_token_accuracy": 0.5738673806190491, + "num_tokens": 6202530296.0, + "step": 12133 + }, + { + "epoch": 3.2812330989724177, + "grad_norm": 1.1749745607376099, + "learning_rate": 7.026502334005012e-06, + "loss": 1.8417, + "mean_token_accuracy": 0.5669203996658325, + "num_tokens": 6203054323.0, + "step": 12134 + }, + { + "epoch": 3.2815035154137373, + "grad_norm": 1.0866023302078247, + "learning_rate": 7.02508787209681e-06, + "loss": 1.9022, + "mean_token_accuracy": 0.5630583763122559, + "num_tokens": 6203555043.0, + "step": 12135 + }, + { + "epoch": 3.281773931855057, + "grad_norm": 1.1021760702133179, + "learning_rate": 7.023673532150604e-06, + "loss": 2.0661, + "mean_token_accuracy": 0.5339200496673584, + "num_tokens": 6204079308.0, + "step": 12136 + }, + { + "epoch": 3.2820443482963766, + "grad_norm": 1.1250247955322266, + "learning_rate": 7.022259314209785e-06, + "loss": 1.9254, + "mean_token_accuracy": 0.5553334355354309, + "num_tokens": 6204603422.0, + "step": 12137 + }, + { + "epoch": 3.2823147647376962, + "grad_norm": 1.0367525815963745, + "learning_rate": 7.020845218317749e-06, + "loss": 1.8286, + "mean_token_accuracy": 0.5796541571617126, + "num_tokens": 6205116997.0, + "step": 12138 + }, + { + "epoch": 3.282585181179016, + "grad_norm": 0.9376119375228882, + "learning_rate": 7.019431244517887e-06, + "loss": 1.7517, + "mean_token_accuracy": 0.5938831567764282, + "num_tokens": 6205641142.0, + "step": 12139 + }, + { + "epoch": 3.282855597620335, + "grad_norm": 0.9392511248588562, + "learning_rate": 7.01801739285358e-06, + "loss": 1.8442, + "mean_token_accuracy": 0.5613740682601929, + "num_tokens": 6206165289.0, + "step": 12140 + }, + { + "epoch": 3.283126014061655, + "grad_norm": 0.36275437474250793, + "learning_rate": 7.0166036633682045e-06, + "loss": 1.0943, + "mean_token_accuracy": 0.706152617931366, + "num_tokens": 6206689492.0, + "step": 12141 + }, + { + "epoch": 3.2833964305029744, + "grad_norm": 1.1588205099105835, + "learning_rate": 7.01519005610515e-06, + "loss": 1.8068, + "mean_token_accuracy": 0.5820472836494446, + "num_tokens": 6207213647.0, + "step": 12142 + }, + { + "epoch": 3.2836668469442944, + "grad_norm": 1.0730607509613037, + "learning_rate": 7.01377657110778e-06, + "loss": 1.8884, + "mean_token_accuracy": 0.585236132144928, + "num_tokens": 6207737872.0, + "step": 12143 + }, + { + "epoch": 3.2839372633856136, + "grad_norm": 1.0412346124649048, + "learning_rate": 7.012363208419466e-06, + "loss": 1.8616, + "mean_token_accuracy": 0.5630701780319214, + "num_tokens": 6208261794.0, + "step": 12144 + }, + { + "epoch": 3.2842076798269333, + "grad_norm": 1.230643630027771, + "learning_rate": 7.010949968083578e-06, + "loss": 1.8866, + "mean_token_accuracy": 0.5735886096954346, + "num_tokens": 6208786068.0, + "step": 12145 + }, + { + "epoch": 3.284478096268253, + "grad_norm": 1.0830727815628052, + "learning_rate": 7.009536850143477e-06, + "loss": 1.8533, + "mean_token_accuracy": 0.5779750347137451, + "num_tokens": 6209275628.0, + "step": 12146 + }, + { + "epoch": 3.2847485127095726, + "grad_norm": 1.2286851406097412, + "learning_rate": 7.0081238546425145e-06, + "loss": 1.8132, + "mean_token_accuracy": 0.6081100702285767, + "num_tokens": 6209799867.0, + "step": 12147 + }, + { + "epoch": 3.285018929150892, + "grad_norm": 1.1038355827331543, + "learning_rate": 7.006710981624058e-06, + "loss": 1.8455, + "mean_token_accuracy": 0.570722222328186, + "num_tokens": 6210324090.0, + "step": 12148 + }, + { + "epoch": 3.285289345592212, + "grad_norm": 1.0558428764343262, + "learning_rate": 7.005298231131447e-06, + "loss": 1.7841, + "mean_token_accuracy": 0.5709623098373413, + "num_tokens": 6210830585.0, + "step": 12149 + }, + { + "epoch": 3.2855597620335315, + "grad_norm": 1.0071486234664917, + "learning_rate": 7.0038856032080384e-06, + "loss": 1.8131, + "mean_token_accuracy": 0.5795963406562805, + "num_tokens": 6211354834.0, + "step": 12150 + }, + { + "epoch": 3.285830178474851, + "grad_norm": 1.070394515991211, + "learning_rate": 7.002473097897169e-06, + "loss": 1.9754, + "mean_token_accuracy": 0.5350926518440247, + "num_tokens": 6211807364.0, + "step": 12151 + }, + { + "epoch": 3.286100594916171, + "grad_norm": 1.3974491357803345, + "learning_rate": 7.001060715242178e-06, + "loss": 2.0234, + "mean_token_accuracy": 0.5348268151283264, + "num_tokens": 6212331570.0, + "step": 12152 + }, + { + "epoch": 3.2863710113574904, + "grad_norm": 1.209935188293457, + "learning_rate": 6.99964845528641e-06, + "loss": 1.8563, + "mean_token_accuracy": 0.5746331810951233, + "num_tokens": 6212855601.0, + "step": 12153 + }, + { + "epoch": 3.28664142779881, + "grad_norm": 0.9351277351379395, + "learning_rate": 6.9982363180731904e-06, + "loss": 1.7509, + "mean_token_accuracy": 0.6013644933700562, + "num_tokens": 6213379847.0, + "step": 12154 + }, + { + "epoch": 3.2869118442401297, + "grad_norm": 1.2104582786560059, + "learning_rate": 6.996824303645846e-06, + "loss": 1.8861, + "mean_token_accuracy": 0.5787215232849121, + "num_tokens": 6213866367.0, + "step": 12155 + }, + { + "epoch": 3.2871822606814494, + "grad_norm": 1.308551549911499, + "learning_rate": 6.995412412047707e-06, + "loss": 1.8785, + "mean_token_accuracy": 0.5731826424598694, + "num_tokens": 6214390590.0, + "step": 12156 + }, + { + "epoch": 3.287452677122769, + "grad_norm": 1.378050684928894, + "learning_rate": 6.994000643322093e-06, + "loss": 1.965, + "mean_token_accuracy": 0.555398166179657, + "num_tokens": 6214914663.0, + "step": 12157 + }, + { + "epoch": 3.2877230935640886, + "grad_norm": 0.8822982311248779, + "learning_rate": 6.992588997512317e-06, + "loss": 1.9284, + "mean_token_accuracy": 0.5549640655517578, + "num_tokens": 6215438859.0, + "step": 12158 + }, + { + "epoch": 3.2879935100054083, + "grad_norm": 1.1997085809707642, + "learning_rate": 6.991177474661699e-06, + "loss": 1.8735, + "mean_token_accuracy": 0.578373908996582, + "num_tokens": 6215963120.0, + "step": 12159 + }, + { + "epoch": 3.288263926446728, + "grad_norm": 1.28675377368927, + "learning_rate": 6.989766074813545e-06, + "loss": 1.8867, + "mean_token_accuracy": 0.5834847688674927, + "num_tokens": 6216487310.0, + "step": 12160 + }, + { + "epoch": 3.2885343428880476, + "grad_norm": 0.3929855227470398, + "learning_rate": 6.988354798011157e-06, + "loss": 1.0543, + "mean_token_accuracy": 0.7191531658172607, + "num_tokens": 6217011490.0, + "step": 12161 + }, + { + "epoch": 3.2888047593293672, + "grad_norm": 1.2619260549545288, + "learning_rate": 6.986943644297846e-06, + "loss": 1.855, + "mean_token_accuracy": 0.571509599685669, + "num_tokens": 6217535745.0, + "step": 12162 + }, + { + "epoch": 3.289075175770687, + "grad_norm": 1.2727214097976685, + "learning_rate": 6.985532613716903e-06, + "loss": 1.8415, + "mean_token_accuracy": 0.5720562934875488, + "num_tokens": 6218050009.0, + "step": 12163 + }, + { + "epoch": 3.2893455922120065, + "grad_norm": 1.0232704877853394, + "learning_rate": 6.9841217063116255e-06, + "loss": 1.784, + "mean_token_accuracy": 0.5953704714775085, + "num_tokens": 6218566597.0, + "step": 12164 + }, + { + "epoch": 3.289616008653326, + "grad_norm": 1.0995194911956787, + "learning_rate": 6.982710922125302e-06, + "loss": 1.8914, + "mean_token_accuracy": 0.5767455101013184, + "num_tokens": 6219090859.0, + "step": 12165 + }, + { + "epoch": 3.289886425094646, + "grad_norm": 0.9702393412590027, + "learning_rate": 6.981300261201225e-06, + "loss": 1.8285, + "mean_token_accuracy": 0.5874559283256531, + "num_tokens": 6219585956.0, + "step": 12166 + }, + { + "epoch": 3.2901568415359654, + "grad_norm": 1.0176175832748413, + "learning_rate": 6.9798897235826705e-06, + "loss": 1.7973, + "mean_token_accuracy": 0.5898441076278687, + "num_tokens": 6220110205.0, + "step": 12167 + }, + { + "epoch": 3.290427257977285, + "grad_norm": 1.0093337297439575, + "learning_rate": 6.978479309312924e-06, + "loss": 1.7491, + "mean_token_accuracy": 0.5826999545097351, + "num_tokens": 6220634403.0, + "step": 12168 + }, + { + "epoch": 3.2906976744186047, + "grad_norm": 1.1769111156463623, + "learning_rate": 6.977069018435261e-06, + "loss": 1.8349, + "mean_token_accuracy": 0.5638651251792908, + "num_tokens": 6221158654.0, + "step": 12169 + }, + { + "epoch": 3.2909680908599244, + "grad_norm": 1.0468206405639648, + "learning_rate": 6.975658850992948e-06, + "loss": 1.7696, + "mean_token_accuracy": 0.5982090830802917, + "num_tokens": 6221682921.0, + "step": 12170 + }, + { + "epoch": 3.291238507301244, + "grad_norm": 1.0794754028320312, + "learning_rate": 6.974248807029258e-06, + "loss": 1.7795, + "mean_token_accuracy": 0.5807234048843384, + "num_tokens": 6222207062.0, + "step": 12171 + }, + { + "epoch": 3.2915089237425637, + "grad_norm": 1.1580865383148193, + "learning_rate": 6.972838886587451e-06, + "loss": 1.8666, + "mean_token_accuracy": 0.5621048212051392, + "num_tokens": 6222731318.0, + "step": 12172 + }, + { + "epoch": 3.2917793401838833, + "grad_norm": 0.9971222281455994, + "learning_rate": 6.971429089710794e-06, + "loss": 1.917, + "mean_token_accuracy": 0.5735975503921509, + "num_tokens": 6223255597.0, + "step": 12173 + }, + { + "epoch": 3.292049756625203, + "grad_norm": 1.1560121774673462, + "learning_rate": 6.970019416442539e-06, + "loss": 1.7522, + "mean_token_accuracy": 0.584242045879364, + "num_tokens": 6223720337.0, + "step": 12174 + }, + { + "epoch": 3.2923201730665226, + "grad_norm": 1.142048954963684, + "learning_rate": 6.968609866825935e-06, + "loss": 1.8314, + "mean_token_accuracy": 0.5747950673103333, + "num_tokens": 6224244529.0, + "step": 12175 + }, + { + "epoch": 3.2925905895078422, + "grad_norm": 1.0814505815505981, + "learning_rate": 6.967200440904241e-06, + "loss": 1.9217, + "mean_token_accuracy": 0.5672205686569214, + "num_tokens": 6224768664.0, + "step": 12176 + }, + { + "epoch": 3.292861005949162, + "grad_norm": 0.9549311995506287, + "learning_rate": 6.965791138720696e-06, + "loss": 1.8236, + "mean_token_accuracy": 0.5835425853729248, + "num_tokens": 6225284142.0, + "step": 12177 + }, + { + "epoch": 3.2931314223904815, + "grad_norm": 1.272152066230774, + "learning_rate": 6.964381960318541e-06, + "loss": 1.9103, + "mean_token_accuracy": 0.5677890181541443, + "num_tokens": 6225808406.0, + "step": 12178 + }, + { + "epoch": 3.293401838831801, + "grad_norm": 1.016579508781433, + "learning_rate": 6.962972905741018e-06, + "loss": 1.8985, + "mean_token_accuracy": 0.578213632106781, + "num_tokens": 6226332642.0, + "step": 12179 + }, + { + "epoch": 3.293672255273121, + "grad_norm": 0.9851447939872742, + "learning_rate": 6.961563975031355e-06, + "loss": 1.8957, + "mean_token_accuracy": 0.5654023885726929, + "num_tokens": 6226856924.0, + "step": 12180 + }, + { + "epoch": 3.29394267171444, + "grad_norm": 0.43285781145095825, + "learning_rate": 6.960155168232785e-06, + "loss": 1.0859, + "mean_token_accuracy": 0.715988039970398, + "num_tokens": 6227381199.0, + "step": 12181 + }, + { + "epoch": 3.29421308815576, + "grad_norm": 1.365850806236267, + "learning_rate": 6.958746485388539e-06, + "loss": 1.8014, + "mean_token_accuracy": 0.5767742395401001, + "num_tokens": 6227905430.0, + "step": 12182 + }, + { + "epoch": 3.2944835045970793, + "grad_norm": 1.0468521118164062, + "learning_rate": 6.957337926541834e-06, + "loss": 1.7326, + "mean_token_accuracy": 0.5972026586532593, + "num_tokens": 6228429629.0, + "step": 12183 + }, + { + "epoch": 3.2947539210383994, + "grad_norm": 0.9336379170417786, + "learning_rate": 6.955929491735889e-06, + "loss": 1.8589, + "mean_token_accuracy": 0.5807374715805054, + "num_tokens": 6228902027.0, + "step": 12184 + }, + { + "epoch": 3.2950243374797186, + "grad_norm": 0.9707666635513306, + "learning_rate": 6.9545211810139216e-06, + "loss": 1.8095, + "mean_token_accuracy": 0.5795161724090576, + "num_tokens": 6229426230.0, + "step": 12185 + }, + { + "epoch": 3.295294753921038, + "grad_norm": 1.216720461845398, + "learning_rate": 6.953112994419142e-06, + "loss": 1.6925, + "mean_token_accuracy": 0.6238918304443359, + "num_tokens": 6229886060.0, + "step": 12186 + }, + { + "epoch": 3.295565170362358, + "grad_norm": 1.0259555578231812, + "learning_rate": 6.951704931994753e-06, + "loss": 1.938, + "mean_token_accuracy": 0.5542727708816528, + "num_tokens": 6230410304.0, + "step": 12187 + }, + { + "epoch": 3.2958355868036775, + "grad_norm": 1.0536139011383057, + "learning_rate": 6.950296993783966e-06, + "loss": 1.8763, + "mean_token_accuracy": 0.5487666130065918, + "num_tokens": 6230934545.0, + "step": 12188 + }, + { + "epoch": 3.296106003244997, + "grad_norm": 1.067025065422058, + "learning_rate": 6.9488891798299766e-06, + "loss": 1.8496, + "mean_token_accuracy": 0.5684471130371094, + "num_tokens": 6231458817.0, + "step": 12189 + }, + { + "epoch": 3.296376419686317, + "grad_norm": 0.9573424458503723, + "learning_rate": 6.947481490175978e-06, + "loss": 1.8278, + "mean_token_accuracy": 0.5745009183883667, + "num_tokens": 6231983043.0, + "step": 12190 + }, + { + "epoch": 3.2966468361276364, + "grad_norm": 1.1123307943344116, + "learning_rate": 6.946073924865166e-06, + "loss": 2.0299, + "mean_token_accuracy": 0.5404995679855347, + "num_tokens": 6232507311.0, + "step": 12191 + }, + { + "epoch": 3.296917252568956, + "grad_norm": 1.235378623008728, + "learning_rate": 6.944666483940726e-06, + "loss": 1.8127, + "mean_token_accuracy": 0.5689977407455444, + "num_tokens": 6233031566.0, + "step": 12192 + }, + { + "epoch": 3.2971876690102757, + "grad_norm": 1.208552598953247, + "learning_rate": 6.9432591674458475e-06, + "loss": 1.8438, + "mean_token_accuracy": 0.570547342300415, + "num_tokens": 6233555810.0, + "step": 12193 + }, + { + "epoch": 3.2974580854515954, + "grad_norm": 0.8964627385139465, + "learning_rate": 6.941851975423705e-06, + "loss": 1.8401, + "mean_token_accuracy": 0.5656728744506836, + "num_tokens": 6234080010.0, + "step": 12194 + }, + { + "epoch": 3.297728501892915, + "grad_norm": 1.3191866874694824, + "learning_rate": 6.9404449079174765e-06, + "loss": 1.8094, + "mean_token_accuracy": 0.5958259105682373, + "num_tokens": 6234604281.0, + "step": 12195 + }, + { + "epoch": 3.2979989183342346, + "grad_norm": 1.6153953075408936, + "learning_rate": 6.939037964970341e-06, + "loss": 1.8974, + "mean_token_accuracy": 0.5718902349472046, + "num_tokens": 6235128448.0, + "step": 12196 + }, + { + "epoch": 3.2982693347755543, + "grad_norm": 1.1452051401138306, + "learning_rate": 6.937631146625463e-06, + "loss": 1.9592, + "mean_token_accuracy": 0.5593379139900208, + "num_tokens": 6235652553.0, + "step": 12197 + }, + { + "epoch": 3.298539751216874, + "grad_norm": 1.1425576210021973, + "learning_rate": 6.936224452926006e-06, + "loss": 1.8034, + "mean_token_accuracy": 0.5863001942634583, + "num_tokens": 6236169238.0, + "step": 12198 + }, + { + "epoch": 3.2988101676581936, + "grad_norm": 1.3662141561508179, + "learning_rate": 6.934817883915137e-06, + "loss": 1.9423, + "mean_token_accuracy": 0.5553563833236694, + "num_tokens": 6236669994.0, + "step": 12199 + }, + { + "epoch": 3.299080584099513, + "grad_norm": 1.1038799285888672, + "learning_rate": 6.933411439636008e-06, + "loss": 1.7521, + "mean_token_accuracy": 0.5923276543617249, + "num_tokens": 6237182746.0, + "step": 12200 + }, + { + "epoch": 3.299351000540833, + "grad_norm": 0.45011359453201294, + "learning_rate": 6.932005120131773e-06, + "loss": 1.1214, + "mean_token_accuracy": 0.6897702813148499, + "num_tokens": 6237707014.0, + "step": 12201 + }, + { + "epoch": 3.2996214169821525, + "grad_norm": 1.4190598726272583, + "learning_rate": 6.930598925445586e-06, + "loss": 1.8299, + "mean_token_accuracy": 0.5743597745895386, + "num_tokens": 6238231288.0, + "step": 12202 + }, + { + "epoch": 3.299891833423472, + "grad_norm": 1.4065184593200684, + "learning_rate": 6.9291928556205945e-06, + "loss": 1.9147, + "mean_token_accuracy": 0.5740302801132202, + "num_tokens": 6238755538.0, + "step": 12203 + }, + { + "epoch": 3.300162249864792, + "grad_norm": 1.3959091901779175, + "learning_rate": 6.927786910699931e-06, + "loss": 1.939, + "mean_token_accuracy": 0.551527738571167, + "num_tokens": 6239279723.0, + "step": 12204 + }, + { + "epoch": 3.3004326663061114, + "grad_norm": 0.894758403301239, + "learning_rate": 6.926381090726746e-06, + "loss": 1.8006, + "mean_token_accuracy": 0.5857988595962524, + "num_tokens": 6239803895.0, + "step": 12205 + }, + { + "epoch": 3.300703082747431, + "grad_norm": 1.331868052482605, + "learning_rate": 6.924975395744169e-06, + "loss": 1.9063, + "mean_token_accuracy": 0.5721334218978882, + "num_tokens": 6240328047.0, + "step": 12206 + }, + { + "epoch": 3.3009734991887507, + "grad_norm": 1.4074965715408325, + "learning_rate": 6.923569825795325e-06, + "loss": 1.8467, + "mean_token_accuracy": 0.5750759840011597, + "num_tokens": 6240838159.0, + "step": 12207 + }, + { + "epoch": 3.3012439156300704, + "grad_norm": 1.3008909225463867, + "learning_rate": 6.9221643809233484e-06, + "loss": 1.9403, + "mean_token_accuracy": 0.5540744066238403, + "num_tokens": 6241362425.0, + "step": 12208 + }, + { + "epoch": 3.30151433207139, + "grad_norm": 1.2049764394760132, + "learning_rate": 6.920759061171365e-06, + "loss": 1.8778, + "mean_token_accuracy": 0.5874949097633362, + "num_tokens": 6241825522.0, + "step": 12209 + }, + { + "epoch": 3.3017847485127096, + "grad_norm": 1.1691761016845703, + "learning_rate": 6.919353866582485e-06, + "loss": 1.8366, + "mean_token_accuracy": 0.5863192081451416, + "num_tokens": 6242349804.0, + "step": 12210 + }, + { + "epoch": 3.3020551649540293, + "grad_norm": 1.12234628200531, + "learning_rate": 6.917948797199834e-06, + "loss": 1.8118, + "mean_token_accuracy": 0.577642560005188, + "num_tokens": 6242874078.0, + "step": 12211 + }, + { + "epoch": 3.302325581395349, + "grad_norm": 1.166001558303833, + "learning_rate": 6.916543853066518e-06, + "loss": 1.8046, + "mean_token_accuracy": 0.5870513916015625, + "num_tokens": 6243370257.0, + "step": 12212 + }, + { + "epoch": 3.3025959978366686, + "grad_norm": 1.0530998706817627, + "learning_rate": 6.915139034225639e-06, + "loss": 1.6986, + "mean_token_accuracy": 0.6323271989822388, + "num_tokens": 6243894500.0, + "step": 12213 + }, + { + "epoch": 3.302866414277988, + "grad_norm": 1.1302909851074219, + "learning_rate": 6.913734340720313e-06, + "loss": 1.9846, + "mean_token_accuracy": 0.5446693897247314, + "num_tokens": 6244418653.0, + "step": 12214 + }, + { + "epoch": 3.303136830719308, + "grad_norm": 1.12050461769104, + "learning_rate": 6.9123297725936344e-06, + "loss": 1.8695, + "mean_token_accuracy": 0.5615150332450867, + "num_tokens": 6244938207.0, + "step": 12215 + }, + { + "epoch": 3.3034072471606275, + "grad_norm": 1.0128936767578125, + "learning_rate": 6.910925329888696e-06, + "loss": 1.805, + "mean_token_accuracy": 0.5678145885467529, + "num_tokens": 6245462341.0, + "step": 12216 + }, + { + "epoch": 3.303677663601947, + "grad_norm": 0.9276888966560364, + "learning_rate": 6.909521012648597e-06, + "loss": 1.8313, + "mean_token_accuracy": 0.5754657983779907, + "num_tokens": 6245986332.0, + "step": 12217 + }, + { + "epoch": 3.303948080043267, + "grad_norm": 1.2529648542404175, + "learning_rate": 6.908116820916418e-06, + "loss": 1.88, + "mean_token_accuracy": 0.5550407767295837, + "num_tokens": 6246510487.0, + "step": 12218 + }, + { + "epoch": 3.3042184964845864, + "grad_norm": 0.9067358374595642, + "learning_rate": 6.906712754735253e-06, + "loss": 1.8679, + "mean_token_accuracy": 0.5683612823486328, + "num_tokens": 6247034761.0, + "step": 12219 + }, + { + "epoch": 3.304488912925906, + "grad_norm": 0.885871946811676, + "learning_rate": 6.905308814148178e-06, + "loss": 1.9205, + "mean_token_accuracy": 0.5503156185150146, + "num_tokens": 6247558960.0, + "step": 12220 + }, + { + "epoch": 3.3047593293672257, + "grad_norm": 0.5043269395828247, + "learning_rate": 6.903904999198267e-06, + "loss": 1.1176, + "mean_token_accuracy": 0.7090998888015747, + "num_tokens": 6248019801.0, + "step": 12221 + }, + { + "epoch": 3.305029745808545, + "grad_norm": 1.2071832418441772, + "learning_rate": 6.902501309928596e-06, + "loss": 1.7315, + "mean_token_accuracy": 0.5889856815338135, + "num_tokens": 6248524286.0, + "step": 12222 + }, + { + "epoch": 3.305300162249865, + "grad_norm": 1.3727372884750366, + "learning_rate": 6.901097746382238e-06, + "loss": 1.8966, + "mean_token_accuracy": 0.569970965385437, + "num_tokens": 6248998644.0, + "step": 12223 + }, + { + "epoch": 3.305570578691184, + "grad_norm": 0.8841447234153748, + "learning_rate": 6.899694308602254e-06, + "loss": 1.8916, + "mean_token_accuracy": 0.5669777393341064, + "num_tokens": 6249522884.0, + "step": 12224 + }, + { + "epoch": 3.3058409951325043, + "grad_norm": 1.2640236616134644, + "learning_rate": 6.898290996631707e-06, + "loss": 2.0351, + "mean_token_accuracy": 0.547620952129364, + "num_tokens": 6250008120.0, + "step": 12225 + }, + { + "epoch": 3.3061114115738235, + "grad_norm": 1.030493974685669, + "learning_rate": 6.896887810513657e-06, + "loss": 1.9378, + "mean_token_accuracy": 0.5445443391799927, + "num_tokens": 6250532331.0, + "step": 12226 + }, + { + "epoch": 3.306381828015143, + "grad_norm": 1.0222264528274536, + "learning_rate": 6.89548475029115e-06, + "loss": 1.8549, + "mean_token_accuracy": 0.5666632652282715, + "num_tokens": 6251056589.0, + "step": 12227 + }, + { + "epoch": 3.3066522444564628, + "grad_norm": 1.0090134143829346, + "learning_rate": 6.894081816007248e-06, + "loss": 1.8913, + "mean_token_accuracy": 0.5738488435745239, + "num_tokens": 6251580540.0, + "step": 12228 + }, + { + "epoch": 3.3069226608977824, + "grad_norm": 1.0907330513000488, + "learning_rate": 6.892679007704988e-06, + "loss": 1.8759, + "mean_token_accuracy": 0.559269905090332, + "num_tokens": 6252104680.0, + "step": 12229 + }, + { + "epoch": 3.307193077339102, + "grad_norm": 1.0735969543457031, + "learning_rate": 6.891276325427412e-06, + "loss": 1.8666, + "mean_token_accuracy": 0.5665051937103271, + "num_tokens": 6252628911.0, + "step": 12230 + }, + { + "epoch": 3.3074634937804217, + "grad_norm": 0.9717583656311035, + "learning_rate": 6.889873769217565e-06, + "loss": 1.7599, + "mean_token_accuracy": 0.5846441984176636, + "num_tokens": 6253121290.0, + "step": 12231 + }, + { + "epoch": 3.3077339102217413, + "grad_norm": 0.9358380436897278, + "learning_rate": 6.8884713391184776e-06, + "loss": 1.6976, + "mean_token_accuracy": 0.582432746887207, + "num_tokens": 6253645515.0, + "step": 12232 + }, + { + "epoch": 3.308004326663061, + "grad_norm": 1.0692294836044312, + "learning_rate": 6.887069035173178e-06, + "loss": 1.8703, + "mean_token_accuracy": 0.5764244794845581, + "num_tokens": 6254169753.0, + "step": 12233 + }, + { + "epoch": 3.3082747431043806, + "grad_norm": 1.0835068225860596, + "learning_rate": 6.885666857424698e-06, + "loss": 1.8149, + "mean_token_accuracy": 0.5844390392303467, + "num_tokens": 6254694023.0, + "step": 12234 + }, + { + "epoch": 3.3085451595457003, + "grad_norm": 1.0693674087524414, + "learning_rate": 6.8842648059160556e-06, + "loss": 1.764, + "mean_token_accuracy": 0.5705274343490601, + "num_tokens": 6255218299.0, + "step": 12235 + }, + { + "epoch": 3.30881557598702, + "grad_norm": 1.0120298862457275, + "learning_rate": 6.882862880690275e-06, + "loss": 1.925, + "mean_token_accuracy": 0.5610132813453674, + "num_tokens": 6255742465.0, + "step": 12236 + }, + { + "epoch": 3.3090859924283396, + "grad_norm": 0.9417455196380615, + "learning_rate": 6.881461081790367e-06, + "loss": 1.6386, + "mean_token_accuracy": 0.6228259801864624, + "num_tokens": 6256266641.0, + "step": 12237 + }, + { + "epoch": 3.309356408869659, + "grad_norm": 0.9125785231590271, + "learning_rate": 6.880059409259348e-06, + "loss": 1.6847, + "mean_token_accuracy": 0.6200841665267944, + "num_tokens": 6256790918.0, + "step": 12238 + }, + { + "epoch": 3.309626825310979, + "grad_norm": 1.121397614479065, + "learning_rate": 6.878657863140219e-06, + "loss": 1.9737, + "mean_token_accuracy": 0.5588877201080322, + "num_tokens": 6257315165.0, + "step": 12239 + }, + { + "epoch": 3.3098972417522985, + "grad_norm": 0.9975007176399231, + "learning_rate": 6.877256443475989e-06, + "loss": 2.0137, + "mean_token_accuracy": 0.5326974391937256, + "num_tokens": 6257839386.0, + "step": 12240 + }, + { + "epoch": 3.310167658193618, + "grad_norm": 0.3749629855155945, + "learning_rate": 6.875855150309651e-06, + "loss": 1.0938, + "mean_token_accuracy": 0.7044168710708618, + "num_tokens": 6258363645.0, + "step": 12241 + }, + { + "epoch": 3.3104380746349378, + "grad_norm": 1.196138858795166, + "learning_rate": 6.8744539836842125e-06, + "loss": 1.8462, + "mean_token_accuracy": 0.5717124938964844, + "num_tokens": 6258887822.0, + "step": 12242 + }, + { + "epoch": 3.3107084910762574, + "grad_norm": 1.121925950050354, + "learning_rate": 6.873052943642655e-06, + "loss": 1.907, + "mean_token_accuracy": 0.5653885006904602, + "num_tokens": 6259368392.0, + "step": 12243 + }, + { + "epoch": 3.310978907517577, + "grad_norm": 0.9051870107650757, + "learning_rate": 6.871652030227967e-06, + "loss": 1.844, + "mean_token_accuracy": 0.5738030076026917, + "num_tokens": 6259892646.0, + "step": 12244 + }, + { + "epoch": 3.3112493239588967, + "grad_norm": 1.0524532794952393, + "learning_rate": 6.870251243483141e-06, + "loss": 1.9754, + "mean_token_accuracy": 0.5294277667999268, + "num_tokens": 6260416904.0, + "step": 12245 + }, + { + "epoch": 3.3115197404002163, + "grad_norm": 1.0535311698913574, + "learning_rate": 6.86885058345115e-06, + "loss": 1.8974, + "mean_token_accuracy": 0.56754070520401, + "num_tokens": 6260941175.0, + "step": 12246 + }, + { + "epoch": 3.311790156841536, + "grad_norm": 1.0981025695800781, + "learning_rate": 6.867450050174969e-06, + "loss": 1.9793, + "mean_token_accuracy": 0.5673210620880127, + "num_tokens": 6261401602.0, + "step": 12247 + }, + { + "epoch": 3.3120605732828556, + "grad_norm": 0.8877776861190796, + "learning_rate": 6.8660496436975754e-06, + "loss": 1.8913, + "mean_token_accuracy": 0.5538060665130615, + "num_tokens": 6261922429.0, + "step": 12248 + }, + { + "epoch": 3.3123309897241753, + "grad_norm": 1.1330492496490479, + "learning_rate": 6.864649364061937e-06, + "loss": 1.8448, + "mean_token_accuracy": 0.5540903806686401, + "num_tokens": 6262446629.0, + "step": 12249 + }, + { + "epoch": 3.312601406165495, + "grad_norm": 1.1551862955093384, + "learning_rate": 6.863249211311014e-06, + "loss": 1.8337, + "mean_token_accuracy": 0.5800843238830566, + "num_tokens": 6262970873.0, + "step": 12250 + }, + { + "epoch": 3.3128718226068146, + "grad_norm": 0.927612841129303, + "learning_rate": 6.86184918548777e-06, + "loss": 1.7334, + "mean_token_accuracy": 0.5963379740715027, + "num_tokens": 6263433471.0, + "step": 12251 + }, + { + "epoch": 3.313142239048134, + "grad_norm": 0.8844614028930664, + "learning_rate": 6.860449286635167e-06, + "loss": 1.8209, + "mean_token_accuracy": 0.5869441628456116, + "num_tokens": 6263957531.0, + "step": 12252 + }, + { + "epoch": 3.313412655489454, + "grad_norm": 1.0448983907699585, + "learning_rate": 6.85904951479615e-06, + "loss": 1.9032, + "mean_token_accuracy": 0.5642485618591309, + "num_tokens": 6264427293.0, + "step": 12253 + }, + { + "epoch": 3.3136830719307735, + "grad_norm": 0.9835891127586365, + "learning_rate": 6.857649870013674e-06, + "loss": 1.8461, + "mean_token_accuracy": 0.5547494292259216, + "num_tokens": 6264926444.0, + "step": 12254 + }, + { + "epoch": 3.313953488372093, + "grad_norm": 0.8663423657417297, + "learning_rate": 6.8562503523306836e-06, + "loss": 1.902, + "mean_token_accuracy": 0.5628849267959595, + "num_tokens": 6265424040.0, + "step": 12255 + }, + { + "epoch": 3.314223904813413, + "grad_norm": 1.074011206626892, + "learning_rate": 6.854850961790113e-06, + "loss": 1.845, + "mean_token_accuracy": 0.5863242149353027, + "num_tokens": 6265948307.0, + "step": 12256 + }, + { + "epoch": 3.3144943212547324, + "grad_norm": 1.0520896911621094, + "learning_rate": 6.853451698434909e-06, + "loss": 1.8073, + "mean_token_accuracy": 0.5928637981414795, + "num_tokens": 6266466425.0, + "step": 12257 + }, + { + "epoch": 3.314764737696052, + "grad_norm": 1.073407769203186, + "learning_rate": 6.852052562308003e-06, + "loss": 1.8525, + "mean_token_accuracy": 0.5563066005706787, + "num_tokens": 6266986159.0, + "step": 12258 + }, + { + "epoch": 3.3150351541373717, + "grad_norm": 1.045318603515625, + "learning_rate": 6.8506535534523155e-06, + "loss": 2.0238, + "mean_token_accuracy": 0.5547350645065308, + "num_tokens": 6267510434.0, + "step": 12259 + }, + { + "epoch": 3.3153055705786914, + "grad_norm": 1.215002417564392, + "learning_rate": 6.849254671910785e-06, + "loss": 1.7683, + "mean_token_accuracy": 0.5772123336791992, + "num_tokens": 6268034712.0, + "step": 12260 + }, + { + "epoch": 3.315575987020011, + "grad_norm": 0.44246917963027954, + "learning_rate": 6.847855917726328e-06, + "loss": 1.1019, + "mean_token_accuracy": 0.705329418182373, + "num_tokens": 6268558792.0, + "step": 12261 + }, + { + "epoch": 3.3158464034613306, + "grad_norm": 1.107903242111206, + "learning_rate": 6.8464572909418594e-06, + "loss": 1.808, + "mean_token_accuracy": 0.5743392705917358, + "num_tokens": 6269025806.0, + "step": 12262 + }, + { + "epoch": 3.31611681990265, + "grad_norm": 1.0864883661270142, + "learning_rate": 6.8450587916002965e-06, + "loss": 1.8454, + "mean_token_accuracy": 0.5607407689094543, + "num_tokens": 6269550025.0, + "step": 12263 + }, + { + "epoch": 3.31638723634397, + "grad_norm": 1.02798593044281, + "learning_rate": 6.843660419744549e-06, + "loss": 1.9946, + "mean_token_accuracy": 0.5244202613830566, + "num_tokens": 6270074235.0, + "step": 12264 + }, + { + "epoch": 3.316657652785289, + "grad_norm": 0.938677191734314, + "learning_rate": 6.84226217541752e-06, + "loss": 1.8729, + "mean_token_accuracy": 0.5650177001953125, + "num_tokens": 6270598329.0, + "step": 12265 + }, + { + "epoch": 3.316928069226609, + "grad_norm": 1.2254270315170288, + "learning_rate": 6.840864058662119e-06, + "loss": 1.7784, + "mean_token_accuracy": 0.5650084614753723, + "num_tokens": 6271122508.0, + "step": 12266 + }, + { + "epoch": 3.3171984856679284, + "grad_norm": 0.9773968458175659, + "learning_rate": 6.839466069521237e-06, + "loss": 1.6876, + "mean_token_accuracy": 0.5834401845932007, + "num_tokens": 6271646751.0, + "step": 12267 + }, + { + "epoch": 3.317468902109248, + "grad_norm": 1.0454509258270264, + "learning_rate": 6.838068208037772e-06, + "loss": 1.8787, + "mean_token_accuracy": 0.5900238156318665, + "num_tokens": 6272156801.0, + "step": 12268 + }, + { + "epoch": 3.3177393185505677, + "grad_norm": 1.1785176992416382, + "learning_rate": 6.836670474254618e-06, + "loss": 1.8057, + "mean_token_accuracy": 0.5849872827529907, + "num_tokens": 6272681060.0, + "step": 12269 + }, + { + "epoch": 3.3180097349918873, + "grad_norm": 1.1298686265945435, + "learning_rate": 6.835272868214651e-06, + "loss": 1.7858, + "mean_token_accuracy": 0.5746108293533325, + "num_tokens": 6273205257.0, + "step": 12270 + }, + { + "epoch": 3.318280151433207, + "grad_norm": 1.0566121339797974, + "learning_rate": 6.833875389960764e-06, + "loss": 1.8531, + "mean_token_accuracy": 0.5747374296188354, + "num_tokens": 6273729272.0, + "step": 12271 + }, + { + "epoch": 3.3185505678745266, + "grad_norm": 1.1399729251861572, + "learning_rate": 6.832478039535833e-06, + "loss": 1.9486, + "mean_token_accuracy": 0.5459815263748169, + "num_tokens": 6274253529.0, + "step": 12272 + }, + { + "epoch": 3.3188209843158463, + "grad_norm": 1.1775881052017212, + "learning_rate": 6.831080816982727e-06, + "loss": 1.7895, + "mean_token_accuracy": 0.5821049809455872, + "num_tokens": 6274777785.0, + "step": 12273 + }, + { + "epoch": 3.319091400757166, + "grad_norm": 1.2203737497329712, + "learning_rate": 6.829683722344325e-06, + "loss": 1.9211, + "mean_token_accuracy": 0.5528720617294312, + "num_tokens": 6275302046.0, + "step": 12274 + }, + { + "epoch": 3.3193618171984856, + "grad_norm": 1.0821993350982666, + "learning_rate": 6.828286755663491e-06, + "loss": 1.8545, + "mean_token_accuracy": 0.5630541443824768, + "num_tokens": 6275826319.0, + "step": 12275 + }, + { + "epoch": 3.319632233639805, + "grad_norm": 1.0532402992248535, + "learning_rate": 6.826889916983085e-06, + "loss": 1.6605, + "mean_token_accuracy": 0.6280225515365601, + "num_tokens": 6276350542.0, + "step": 12276 + }, + { + "epoch": 3.319902650081125, + "grad_norm": 1.2836072444915771, + "learning_rate": 6.82549320634597e-06, + "loss": 1.8478, + "mean_token_accuracy": 0.5632922649383545, + "num_tokens": 6276874689.0, + "step": 12277 + }, + { + "epoch": 3.3201730665224445, + "grad_norm": 1.1370612382888794, + "learning_rate": 6.824096623794998e-06, + "loss": 1.9037, + "mean_token_accuracy": 0.5637662410736084, + "num_tokens": 6277375625.0, + "step": 12278 + }, + { + "epoch": 3.320443482963764, + "grad_norm": 1.156639814376831, + "learning_rate": 6.822700169373024e-06, + "loss": 1.769, + "mean_token_accuracy": 0.6042946577072144, + "num_tokens": 6277899735.0, + "step": 12279 + }, + { + "epoch": 3.3207138994050838, + "grad_norm": 1.3985992670059204, + "learning_rate": 6.8213038431228925e-06, + "loss": 1.9476, + "mean_token_accuracy": 0.5562364459037781, + "num_tokens": 6278423910.0, + "step": 12280 + }, + { + "epoch": 3.3209843158464034, + "grad_norm": 0.39652034640312195, + "learning_rate": 6.819907645087449e-06, + "loss": 1.1877, + "mean_token_accuracy": 0.6810431480407715, + "num_tokens": 6278948113.0, + "step": 12281 + }, + { + "epoch": 3.321254732287723, + "grad_norm": 1.6300277709960938, + "learning_rate": 6.81851157530953e-06, + "loss": 1.966, + "mean_token_accuracy": 0.5564817190170288, + "num_tokens": 6279472399.0, + "step": 12282 + }, + { + "epoch": 3.3215251487290427, + "grad_norm": 1.2248045206069946, + "learning_rate": 6.817115633831974e-06, + "loss": 1.8811, + "mean_token_accuracy": 0.5853269696235657, + "num_tokens": 6279984956.0, + "step": 12283 + }, + { + "epoch": 3.3217955651703623, + "grad_norm": 1.0686473846435547, + "learning_rate": 6.815719820697614e-06, + "loss": 1.8174, + "mean_token_accuracy": 0.6078820824623108, + "num_tokens": 6280445222.0, + "step": 12284 + }, + { + "epoch": 3.322065981611682, + "grad_norm": 1.0152475833892822, + "learning_rate": 6.814324135949271e-06, + "loss": 1.8479, + "mean_token_accuracy": 0.5808177590370178, + "num_tokens": 6280958525.0, + "step": 12285 + }, + { + "epoch": 3.3223363980530016, + "grad_norm": 1.1383860111236572, + "learning_rate": 6.812928579629775e-06, + "loss": 1.8999, + "mean_token_accuracy": 0.5654211044311523, + "num_tokens": 6281482570.0, + "step": 12286 + }, + { + "epoch": 3.3226068144943213, + "grad_norm": 1.1089463233947754, + "learning_rate": 6.811533151781941e-06, + "loss": 1.857, + "mean_token_accuracy": 0.5781694054603577, + "num_tokens": 6282006806.0, + "step": 12287 + }, + { + "epoch": 3.322877230935641, + "grad_norm": 1.2042299509048462, + "learning_rate": 6.810137852448592e-06, + "loss": 1.7947, + "mean_token_accuracy": 0.5907742977142334, + "num_tokens": 6282531002.0, + "step": 12288 + }, + { + "epoch": 3.3231476473769606, + "grad_norm": 1.2013604640960693, + "learning_rate": 6.808742681672533e-06, + "loss": 1.9175, + "mean_token_accuracy": 0.5602229833602905, + "num_tokens": 6283055233.0, + "step": 12289 + }, + { + "epoch": 3.32341806381828, + "grad_norm": 1.0911847352981567, + "learning_rate": 6.807347639496571e-06, + "loss": 1.6833, + "mean_token_accuracy": 0.6176580190658569, + "num_tokens": 6283579507.0, + "step": 12290 + }, + { + "epoch": 3.3236884802596, + "grad_norm": 1.0930143594741821, + "learning_rate": 6.805952725963519e-06, + "loss": 1.8232, + "mean_token_accuracy": 0.5964571237564087, + "num_tokens": 6284078980.0, + "step": 12291 + }, + { + "epoch": 3.3239588967009195, + "grad_norm": 1.1013691425323486, + "learning_rate": 6.8045579411161645e-06, + "loss": 1.7402, + "mean_token_accuracy": 0.5799168348312378, + "num_tokens": 6284586514.0, + "step": 12292 + }, + { + "epoch": 3.324229313142239, + "grad_norm": 1.0961062908172607, + "learning_rate": 6.803163284997316e-06, + "loss": 1.893, + "mean_token_accuracy": 0.5618000626564026, + "num_tokens": 6285072154.0, + "step": 12293 + }, + { + "epoch": 3.3244997295835588, + "grad_norm": 0.9181034564971924, + "learning_rate": 6.801768757649756e-06, + "loss": 1.6894, + "mean_token_accuracy": 0.6144347190856934, + "num_tokens": 6285596125.0, + "step": 12294 + }, + { + "epoch": 3.3247701460248784, + "grad_norm": 1.1120173931121826, + "learning_rate": 6.80037435911628e-06, + "loss": 1.7957, + "mean_token_accuracy": 0.5637530088424683, + "num_tokens": 6286120225.0, + "step": 12295 + }, + { + "epoch": 3.325040562466198, + "grad_norm": 1.0746650695800781, + "learning_rate": 6.7989800894396666e-06, + "loss": 1.8655, + "mean_token_accuracy": 0.5584739446640015, + "num_tokens": 6286644439.0, + "step": 12296 + }, + { + "epoch": 3.3253109789075177, + "grad_norm": 0.88277268409729, + "learning_rate": 6.7975859486627e-06, + "loss": 1.7854, + "mean_token_accuracy": 0.589028000831604, + "num_tokens": 6287122902.0, + "step": 12297 + }, + { + "epoch": 3.3255813953488373, + "grad_norm": 0.8787611722946167, + "learning_rate": 6.796191936828156e-06, + "loss": 1.7357, + "mean_token_accuracy": 0.6053205132484436, + "num_tokens": 6287610987.0, + "step": 12298 + }, + { + "epoch": 3.325851811790157, + "grad_norm": 1.128786563873291, + "learning_rate": 6.794798053978802e-06, + "loss": 1.8177, + "mean_token_accuracy": 0.587705135345459, + "num_tokens": 6288135265.0, + "step": 12299 + }, + { + "epoch": 3.3261222282314766, + "grad_norm": 1.234757661819458, + "learning_rate": 6.793404300157414e-06, + "loss": 1.8396, + "mean_token_accuracy": 0.5565595626831055, + "num_tokens": 6288659503.0, + "step": 12300 + }, + { + "epoch": 3.3263926446727963, + "grad_norm": 0.40382906794548035, + "learning_rate": 6.792010675406754e-06, + "loss": 1.0507, + "mean_token_accuracy": 0.7141597270965576, + "num_tokens": 6289183758.0, + "step": 12301 + }, + { + "epoch": 3.326663061114116, + "grad_norm": 1.2778609991073608, + "learning_rate": 6.7906171797695764e-06, + "loss": 1.8465, + "mean_token_accuracy": 0.5749104619026184, + "num_tokens": 6289707930.0, + "step": 12302 + }, + { + "epoch": 3.3269334775554356, + "grad_norm": 1.3762261867523193, + "learning_rate": 6.789223813288648e-06, + "loss": 1.8964, + "mean_token_accuracy": 0.5757359862327576, + "num_tokens": 6290181680.0, + "step": 12303 + }, + { + "epoch": 3.3272038939967548, + "grad_norm": 0.9841457605361938, + "learning_rate": 6.787830576006714e-06, + "loss": 1.792, + "mean_token_accuracy": 0.5878408551216125, + "num_tokens": 6290705860.0, + "step": 12304 + }, + { + "epoch": 3.327474310438075, + "grad_norm": 1.238775610923767, + "learning_rate": 6.786437467966524e-06, + "loss": 1.8635, + "mean_token_accuracy": 0.5716793537139893, + "num_tokens": 6291229998.0, + "step": 12305 + }, + { + "epoch": 3.327744726879394, + "grad_norm": 1.223158359527588, + "learning_rate": 6.785044489210825e-06, + "loss": 1.7967, + "mean_token_accuracy": 0.6027572154998779, + "num_tokens": 6291754124.0, + "step": 12306 + }, + { + "epoch": 3.328015143320714, + "grad_norm": 1.066558599472046, + "learning_rate": 6.783651639782354e-06, + "loss": 1.7509, + "mean_token_accuracy": 0.5855549573898315, + "num_tokens": 6292278341.0, + "step": 12307 + }, + { + "epoch": 3.3282855597620333, + "grad_norm": 1.0195002555847168, + "learning_rate": 6.782258919723851e-06, + "loss": 1.7867, + "mean_token_accuracy": 0.5877364873886108, + "num_tokens": 6292802583.0, + "step": 12308 + }, + { + "epoch": 3.328555976203353, + "grad_norm": 1.092718482017517, + "learning_rate": 6.78086632907805e-06, + "loss": 1.6053, + "mean_token_accuracy": 0.6197245717048645, + "num_tokens": 6293326738.0, + "step": 12309 + }, + { + "epoch": 3.3288263926446726, + "grad_norm": 1.1430259943008423, + "learning_rate": 6.779473867887674e-06, + "loss": 1.8314, + "mean_token_accuracy": 0.5759443044662476, + "num_tokens": 6293758104.0, + "step": 12310 + }, + { + "epoch": 3.3290968090859923, + "grad_norm": 0.9729588627815247, + "learning_rate": 6.778081536195456e-06, + "loss": 1.8612, + "mean_token_accuracy": 0.5629757642745972, + "num_tokens": 6294282348.0, + "step": 12311 + }, + { + "epoch": 3.329367225527312, + "grad_norm": 1.0302164554595947, + "learning_rate": 6.776689334044113e-06, + "loss": 1.9329, + "mean_token_accuracy": 0.554132342338562, + "num_tokens": 6294806549.0, + "step": 12312 + }, + { + "epoch": 3.3296376419686315, + "grad_norm": 1.0244923830032349, + "learning_rate": 6.7752972614763555e-06, + "loss": 1.936, + "mean_token_accuracy": 0.551362931728363, + "num_tokens": 6295330806.0, + "step": 12313 + }, + { + "epoch": 3.329908058409951, + "grad_norm": 1.0823922157287598, + "learning_rate": 6.773905318534907e-06, + "loss": 1.9505, + "mean_token_accuracy": 0.5668979287147522, + "num_tokens": 6295854982.0, + "step": 12314 + }, + { + "epoch": 3.330178474851271, + "grad_norm": 1.0480440855026245, + "learning_rate": 6.77251350526247e-06, + "loss": 1.8326, + "mean_token_accuracy": 0.576215922832489, + "num_tokens": 6296379215.0, + "step": 12315 + }, + { + "epoch": 3.3304488912925905, + "grad_norm": 0.883158802986145, + "learning_rate": 6.7711218217017496e-06, + "loss": 1.806, + "mean_token_accuracy": 0.5956816077232361, + "num_tokens": 6296903374.0, + "step": 12316 + }, + { + "epoch": 3.33071930773391, + "grad_norm": 0.9789373278617859, + "learning_rate": 6.7697302678954506e-06, + "loss": 1.8611, + "mean_token_accuracy": 0.570610761642456, + "num_tokens": 6297427657.0, + "step": 12317 + }, + { + "epoch": 3.3309897241752298, + "grad_norm": 1.1253262758255005, + "learning_rate": 6.768338843886266e-06, + "loss": 1.8862, + "mean_token_accuracy": 0.5757057070732117, + "num_tokens": 6297951816.0, + "step": 12318 + }, + { + "epoch": 3.3312601406165494, + "grad_norm": 0.9352302551269531, + "learning_rate": 6.766947549716889e-06, + "loss": 1.908, + "mean_token_accuracy": 0.5632192492485046, + "num_tokens": 6298476095.0, + "step": 12319 + }, + { + "epoch": 3.331530557057869, + "grad_norm": 0.8708380460739136, + "learning_rate": 6.765556385430012e-06, + "loss": 1.9493, + "mean_token_accuracy": 0.5371806621551514, + "num_tokens": 6299000125.0, + "step": 12320 + }, + { + "epoch": 3.3318009734991887, + "grad_norm": 0.41570499539375305, + "learning_rate": 6.764165351068315e-06, + "loss": 1.1775, + "mean_token_accuracy": 0.6935732364654541, + "num_tokens": 6299465202.0, + "step": 12321 + }, + { + "epoch": 3.3320713899405083, + "grad_norm": 1.0748612880706787, + "learning_rate": 6.76277444667448e-06, + "loss": 1.8182, + "mean_token_accuracy": 0.5813757181167603, + "num_tokens": 6299989479.0, + "step": 12322 + }, + { + "epoch": 3.332341806381828, + "grad_norm": 1.4449234008789062, + "learning_rate": 6.761383672291191e-06, + "loss": 1.8004, + "mean_token_accuracy": 0.5977001190185547, + "num_tokens": 6300394061.0, + "step": 12323 + }, + { + "epoch": 3.3326122228231476, + "grad_norm": 0.8879614472389221, + "learning_rate": 6.759993027961116e-06, + "loss": 1.6585, + "mean_token_accuracy": 0.627034068107605, + "num_tokens": 6300918216.0, + "step": 12324 + }, + { + "epoch": 3.3328826392644673, + "grad_norm": 0.9586060643196106, + "learning_rate": 6.7586025137269195e-06, + "loss": 1.8634, + "mean_token_accuracy": 0.5750301480293274, + "num_tokens": 6301419010.0, + "step": 12325 + }, + { + "epoch": 3.333153055705787, + "grad_norm": 1.0060420036315918, + "learning_rate": 6.757212129631274e-06, + "loss": 1.7794, + "mean_token_accuracy": 0.5818355083465576, + "num_tokens": 6301897562.0, + "step": 12326 + }, + { + "epoch": 3.3334234721471065, + "grad_norm": 1.2144254446029663, + "learning_rate": 6.755821875716839e-06, + "loss": 1.9351, + "mean_token_accuracy": 0.5486932396888733, + "num_tokens": 6302421601.0, + "step": 12327 + }, + { + "epoch": 3.333693888588426, + "grad_norm": 0.96638023853302, + "learning_rate": 6.754431752026267e-06, + "loss": 1.8096, + "mean_token_accuracy": 0.5819219946861267, + "num_tokens": 6302945825.0, + "step": 12328 + }, + { + "epoch": 3.333964305029746, + "grad_norm": 0.9935682415962219, + "learning_rate": 6.753041758602217e-06, + "loss": 1.7932, + "mean_token_accuracy": 0.5742115378379822, + "num_tokens": 6303469889.0, + "step": 12329 + }, + { + "epoch": 3.3342347214710655, + "grad_norm": 1.0447403192520142, + "learning_rate": 6.751651895487334e-06, + "loss": 1.7828, + "mean_token_accuracy": 0.5861750841140747, + "num_tokens": 6303955357.0, + "step": 12330 + }, + { + "epoch": 3.334505137912385, + "grad_norm": 1.1245101690292358, + "learning_rate": 6.750262162724263e-06, + "loss": 1.8304, + "mean_token_accuracy": 0.5752770900726318, + "num_tokens": 6304479597.0, + "step": 12331 + }, + { + "epoch": 3.3347755543537048, + "grad_norm": 1.1747337579727173, + "learning_rate": 6.748872560355651e-06, + "loss": 1.8731, + "mean_token_accuracy": 0.5858473777770996, + "num_tokens": 6304895122.0, + "step": 12332 + }, + { + "epoch": 3.3350459707950244, + "grad_norm": 0.9390655755996704, + "learning_rate": 6.747483088424124e-06, + "loss": 1.6046, + "mean_token_accuracy": 0.6340199708938599, + "num_tokens": 6305382670.0, + "step": 12333 + }, + { + "epoch": 3.335316387236344, + "grad_norm": 0.9758468866348267, + "learning_rate": 6.7460937469723265e-06, + "loss": 1.8522, + "mean_token_accuracy": 0.5570023655891418, + "num_tokens": 6305906944.0, + "step": 12334 + }, + { + "epoch": 3.3355868036776637, + "grad_norm": 0.9820308685302734, + "learning_rate": 6.744704536042879e-06, + "loss": 1.832, + "mean_token_accuracy": 0.5963724851608276, + "num_tokens": 6306431211.0, + "step": 12335 + }, + { + "epoch": 3.3358572201189833, + "grad_norm": 1.1165597438812256, + "learning_rate": 6.743315455678414e-06, + "loss": 1.8649, + "mean_token_accuracy": 0.5644278526306152, + "num_tokens": 6306955453.0, + "step": 12336 + }, + { + "epoch": 3.336127636560303, + "grad_norm": 1.0308623313903809, + "learning_rate": 6.741926505921545e-06, + "loss": 1.7915, + "mean_token_accuracy": 0.588604211807251, + "num_tokens": 6307479548.0, + "step": 12337 + }, + { + "epoch": 3.3363980530016226, + "grad_norm": 1.038529634475708, + "learning_rate": 6.740537686814897e-06, + "loss": 1.8751, + "mean_token_accuracy": 0.577996015548706, + "num_tokens": 6308003659.0, + "step": 12338 + }, + { + "epoch": 3.3366684694429423, + "grad_norm": 1.030722975730896, + "learning_rate": 6.739148998401076e-06, + "loss": 1.858, + "mean_token_accuracy": 0.5717004537582397, + "num_tokens": 6308527804.0, + "step": 12339 + }, + { + "epoch": 3.336938885884262, + "grad_norm": 1.0648882389068604, + "learning_rate": 6.737760440722698e-06, + "loss": 1.9348, + "mean_token_accuracy": 0.5588330030441284, + "num_tokens": 6309027436.0, + "step": 12340 + }, + { + "epoch": 3.3372093023255816, + "grad_norm": 0.3645954132080078, + "learning_rate": 6.7363720138223635e-06, + "loss": 1.2273, + "mean_token_accuracy": 0.679702639579773, + "num_tokens": 6309510194.0, + "step": 12341 + }, + { + "epoch": 3.337479718766901, + "grad_norm": 1.2901326417922974, + "learning_rate": 6.734983717742669e-06, + "loss": 1.8827, + "mean_token_accuracy": 0.5632911920547485, + "num_tokens": 6310034262.0, + "step": 12342 + }, + { + "epoch": 3.337750135208221, + "grad_norm": 1.2865564823150635, + "learning_rate": 6.733595552526222e-06, + "loss": 1.8415, + "mean_token_accuracy": 0.5941181182861328, + "num_tokens": 6310558439.0, + "step": 12343 + }, + { + "epoch": 3.3380205516495405, + "grad_norm": 1.0037457942962646, + "learning_rate": 6.73220751821561e-06, + "loss": 1.8362, + "mean_token_accuracy": 0.5743006467819214, + "num_tokens": 6311082644.0, + "step": 12344 + }, + { + "epoch": 3.3382909680908597, + "grad_norm": 0.9024066925048828, + "learning_rate": 6.730819614853418e-06, + "loss": 1.8824, + "mean_token_accuracy": 0.5683361291885376, + "num_tokens": 6311551891.0, + "step": 12345 + }, + { + "epoch": 3.3385613845321798, + "grad_norm": 1.1407699584960938, + "learning_rate": 6.729431842482239e-06, + "loss": 1.8407, + "mean_token_accuracy": 0.5697394609451294, + "num_tokens": 6312076155.0, + "step": 12346 + }, + { + "epoch": 3.338831800973499, + "grad_norm": 1.0929735898971558, + "learning_rate": 6.72804420114465e-06, + "loss": 1.8218, + "mean_token_accuracy": 0.5796945691108704, + "num_tokens": 6312600368.0, + "step": 12347 + }, + { + "epoch": 3.339102217414819, + "grad_norm": 1.109498143196106, + "learning_rate": 6.726656690883225e-06, + "loss": 1.9233, + "mean_token_accuracy": 0.5863825082778931, + "num_tokens": 6313073193.0, + "step": 12348 + }, + { + "epoch": 3.3393726338561383, + "grad_norm": 0.9494680166244507, + "learning_rate": 6.725269311740541e-06, + "loss": 1.9433, + "mean_token_accuracy": 0.5704084038734436, + "num_tokens": 6313576168.0, + "step": 12349 + }, + { + "epoch": 3.339643050297458, + "grad_norm": 1.0369467735290527, + "learning_rate": 6.723882063759164e-06, + "loss": 1.8204, + "mean_token_accuracy": 0.5879607796669006, + "num_tokens": 6314035798.0, + "step": 12350 + }, + { + "epoch": 3.3399134667387775, + "grad_norm": 1.1176221370697021, + "learning_rate": 6.722494946981659e-06, + "loss": 1.8941, + "mean_token_accuracy": 0.559118390083313, + "num_tokens": 6314556420.0, + "step": 12351 + }, + { + "epoch": 3.340183883180097, + "grad_norm": 1.0280956029891968, + "learning_rate": 6.721107961450595e-06, + "loss": 1.8372, + "mean_token_accuracy": 0.5687544345855713, + "num_tokens": 6315080573.0, + "step": 12352 + }, + { + "epoch": 3.340454299621417, + "grad_norm": 0.9036443829536438, + "learning_rate": 6.719721107208519e-06, + "loss": 1.8151, + "mean_token_accuracy": 0.586577296257019, + "num_tokens": 6315604855.0, + "step": 12353 + }, + { + "epoch": 3.3407247160627365, + "grad_norm": 1.0103918313980103, + "learning_rate": 6.718334384297984e-06, + "loss": 1.8333, + "mean_token_accuracy": 0.5775923132896423, + "num_tokens": 6316126901.0, + "step": 12354 + }, + { + "epoch": 3.340995132504056, + "grad_norm": 0.930570662021637, + "learning_rate": 6.716947792761548e-06, + "loss": 1.7271, + "mean_token_accuracy": 0.6008002758026123, + "num_tokens": 6316632581.0, + "step": 12355 + }, + { + "epoch": 3.3412655489453758, + "grad_norm": 1.0455005168914795, + "learning_rate": 6.715561332641742e-06, + "loss": 1.8548, + "mean_token_accuracy": 0.5851279497146606, + "num_tokens": 6317156843.0, + "step": 12356 + }, + { + "epoch": 3.3415359653866954, + "grad_norm": 0.9509966373443604, + "learning_rate": 6.714175003981119e-06, + "loss": 1.7956, + "mean_token_accuracy": 0.586180567741394, + "num_tokens": 6317681066.0, + "step": 12357 + }, + { + "epoch": 3.341806381828015, + "grad_norm": 1.145816445350647, + "learning_rate": 6.712788806822211e-06, + "loss": 1.7688, + "mean_token_accuracy": 0.5768716335296631, + "num_tokens": 6318205238.0, + "step": 12358 + }, + { + "epoch": 3.3420767982693347, + "grad_norm": 0.8966054320335388, + "learning_rate": 6.711402741207546e-06, + "loss": 1.7791, + "mean_token_accuracy": 0.5869395732879639, + "num_tokens": 6318683198.0, + "step": 12359 + }, + { + "epoch": 3.3423472147106543, + "grad_norm": 1.0336012840270996, + "learning_rate": 6.710016807179662e-06, + "loss": 1.8656, + "mean_token_accuracy": 0.5859298706054688, + "num_tokens": 6319207456.0, + "step": 12360 + }, + { + "epoch": 3.342617631151974, + "grad_norm": 0.39263418316841125, + "learning_rate": 6.708631004781077e-06, + "loss": 1.1452, + "mean_token_accuracy": 0.6838679313659668, + "num_tokens": 6319731681.0, + "step": 12361 + }, + { + "epoch": 3.3428880475932936, + "grad_norm": 1.2131189107894897, + "learning_rate": 6.7072453340543095e-06, + "loss": 1.8119, + "mean_token_accuracy": 0.5723342895507812, + "num_tokens": 6320255941.0, + "step": 12362 + }, + { + "epoch": 3.3431584640346133, + "grad_norm": 1.0534166097640991, + "learning_rate": 6.705859795041883e-06, + "loss": 1.839, + "mean_token_accuracy": 0.5730061531066895, + "num_tokens": 6320768927.0, + "step": 12363 + }, + { + "epoch": 3.343428880475933, + "grad_norm": 1.0174325704574585, + "learning_rate": 6.704474387786303e-06, + "loss": 1.9297, + "mean_token_accuracy": 0.5551216006278992, + "num_tokens": 6321293131.0, + "step": 12364 + }, + { + "epoch": 3.3436992969172525, + "grad_norm": 0.8628210425376892, + "learning_rate": 6.703089112330081e-06, + "loss": 1.7066, + "mean_token_accuracy": 0.5800142288208008, + "num_tokens": 6321817386.0, + "step": 12365 + }, + { + "epoch": 3.343969713358572, + "grad_norm": 1.3353750705718994, + "learning_rate": 6.701703968715725e-06, + "loss": 1.8808, + "mean_token_accuracy": 0.5585060715675354, + "num_tokens": 6322322635.0, + "step": 12366 + }, + { + "epoch": 3.344240129799892, + "grad_norm": 1.1057336330413818, + "learning_rate": 6.700318956985732e-06, + "loss": 1.8202, + "mean_token_accuracy": 0.5647144317626953, + "num_tokens": 6322761231.0, + "step": 12367 + }, + { + "epoch": 3.3445105462412115, + "grad_norm": 1.0100960731506348, + "learning_rate": 6.6989340771825925e-06, + "loss": 1.8873, + "mean_token_accuracy": 0.5658076405525208, + "num_tokens": 6323217062.0, + "step": 12368 + }, + { + "epoch": 3.344780962682531, + "grad_norm": 0.8538418412208557, + "learning_rate": 6.697549329348809e-06, + "loss": 1.8405, + "mean_token_accuracy": 0.5722999572753906, + "num_tokens": 6323732757.0, + "step": 12369 + }, + { + "epoch": 3.3450513791238508, + "grad_norm": 0.9439008831977844, + "learning_rate": 6.696164713526866e-06, + "loss": 1.8723, + "mean_token_accuracy": 0.5682793855667114, + "num_tokens": 6324256887.0, + "step": 12370 + }, + { + "epoch": 3.3453217955651704, + "grad_norm": 1.1494240760803223, + "learning_rate": 6.694780229759241e-06, + "loss": 1.7901, + "mean_token_accuracy": 0.5761414170265198, + "num_tokens": 6324669631.0, + "step": 12371 + }, + { + "epoch": 3.34559221200649, + "grad_norm": 1.060065746307373, + "learning_rate": 6.6933958780884224e-06, + "loss": 1.8439, + "mean_token_accuracy": 0.5899025797843933, + "num_tokens": 6325193712.0, + "step": 12372 + }, + { + "epoch": 3.3458626284478097, + "grad_norm": 0.9508039355278015, + "learning_rate": 6.692011658556885e-06, + "loss": 1.821, + "mean_token_accuracy": 0.5735251307487488, + "num_tokens": 6325715716.0, + "step": 12373 + }, + { + "epoch": 3.3461330448891293, + "grad_norm": 1.1196136474609375, + "learning_rate": 6.6906275712070954e-06, + "loss": 1.798, + "mean_token_accuracy": 0.5858916640281677, + "num_tokens": 6326239896.0, + "step": 12374 + }, + { + "epoch": 3.346403461330449, + "grad_norm": 1.1689051389694214, + "learning_rate": 6.689243616081527e-06, + "loss": 1.9337, + "mean_token_accuracy": 0.568541944026947, + "num_tokens": 6326764124.0, + "step": 12375 + }, + { + "epoch": 3.3466738777717686, + "grad_norm": 0.8712249994277954, + "learning_rate": 6.687859793222642e-06, + "loss": 1.7854, + "mean_token_accuracy": 0.591168224811554, + "num_tokens": 6327288402.0, + "step": 12376 + }, + { + "epoch": 3.3469442942130883, + "grad_norm": 0.871776282787323, + "learning_rate": 6.686476102672897e-06, + "loss": 1.838, + "mean_token_accuracy": 0.5673201084136963, + "num_tokens": 6327812494.0, + "step": 12377 + }, + { + "epoch": 3.347214710654408, + "grad_norm": 1.1908788681030273, + "learning_rate": 6.685092544474749e-06, + "loss": 1.8397, + "mean_token_accuracy": 0.5595946311950684, + "num_tokens": 6328315167.0, + "step": 12378 + }, + { + "epoch": 3.3474851270957275, + "grad_norm": 1.2694917917251587, + "learning_rate": 6.683709118670656e-06, + "loss": 1.8581, + "mean_token_accuracy": 0.5683931708335876, + "num_tokens": 6328839383.0, + "step": 12379 + }, + { + "epoch": 3.347755543537047, + "grad_norm": 1.1131982803344727, + "learning_rate": 6.682325825303055e-06, + "loss": 1.7947, + "mean_token_accuracy": 0.5705985426902771, + "num_tokens": 6329363463.0, + "step": 12380 + }, + { + "epoch": 3.348025959978367, + "grad_norm": 0.38307297229766846, + "learning_rate": 6.680942664414403e-06, + "loss": 1.1105, + "mean_token_accuracy": 0.7091476917266846, + "num_tokens": 6329868234.0, + "step": 12381 + }, + { + "epoch": 3.3482963764196865, + "grad_norm": 1.3778862953186035, + "learning_rate": 6.679559636047124e-06, + "loss": 1.9116, + "mean_token_accuracy": 0.5593830943107605, + "num_tokens": 6330392420.0, + "step": 12382 + }, + { + "epoch": 3.348566792861006, + "grad_norm": 1.4880852699279785, + "learning_rate": 6.678176740243666e-06, + "loss": 1.8677, + "mean_token_accuracy": 0.5731096863746643, + "num_tokens": 6330916640.0, + "step": 12383 + }, + { + "epoch": 3.3488372093023258, + "grad_norm": 1.04454505443573, + "learning_rate": 6.676793977046454e-06, + "loss": 1.8642, + "mean_token_accuracy": 0.5663983821868896, + "num_tokens": 6331401930.0, + "step": 12384 + }, + { + "epoch": 3.3491076257436454, + "grad_norm": 1.4294583797454834, + "learning_rate": 6.6754113464979155e-06, + "loss": 1.9447, + "mean_token_accuracy": 0.5632182359695435, + "num_tokens": 6331926187.0, + "step": 12385 + }, + { + "epoch": 3.3493780421849646, + "grad_norm": 1.3093736171722412, + "learning_rate": 6.674028848640476e-06, + "loss": 1.7898, + "mean_token_accuracy": 0.580751895904541, + "num_tokens": 6332450427.0, + "step": 12386 + }, + { + "epoch": 3.3496484586262847, + "grad_norm": 1.100669026374817, + "learning_rate": 6.672646483516554e-06, + "loss": 1.8086, + "mean_token_accuracy": 0.5804762840270996, + "num_tokens": 6332934252.0, + "step": 12387 + }, + { + "epoch": 3.349918875067604, + "grad_norm": 1.0613782405853271, + "learning_rate": 6.671264251168561e-06, + "loss": 1.8183, + "mean_token_accuracy": 0.5941202640533447, + "num_tokens": 6333458383.0, + "step": 12388 + }, + { + "epoch": 3.350189291508924, + "grad_norm": 1.1291608810424805, + "learning_rate": 6.669882151638913e-06, + "loss": 1.8495, + "mean_token_accuracy": 0.572323739528656, + "num_tokens": 6333982666.0, + "step": 12389 + }, + { + "epoch": 3.350459707950243, + "grad_norm": 1.0885306596755981, + "learning_rate": 6.668500184970015e-06, + "loss": 1.7844, + "mean_token_accuracy": 0.5882682800292969, + "num_tokens": 6334500543.0, + "step": 12390 + }, + { + "epoch": 3.350730124391563, + "grad_norm": 0.9418185353279114, + "learning_rate": 6.667118351204267e-06, + "loss": 1.8934, + "mean_token_accuracy": 0.5632041692733765, + "num_tokens": 6335024809.0, + "step": 12391 + }, + { + "epoch": 3.3510005408328825, + "grad_norm": 0.8651297688484192, + "learning_rate": 6.6657366503840695e-06, + "loss": 1.8216, + "mean_token_accuracy": 0.5852880477905273, + "num_tokens": 6335548943.0, + "step": 12392 + }, + { + "epoch": 3.351270957274202, + "grad_norm": 1.0667389631271362, + "learning_rate": 6.66435508255182e-06, + "loss": 1.9046, + "mean_token_accuracy": 0.5438481569290161, + "num_tokens": 6336073222.0, + "step": 12393 + }, + { + "epoch": 3.3515413737155217, + "grad_norm": 1.0566455125808716, + "learning_rate": 6.662973647749904e-06, + "loss": 1.8822, + "mean_token_accuracy": 0.5567065477371216, + "num_tokens": 6336597407.0, + "step": 12394 + }, + { + "epoch": 3.3518117901568414, + "grad_norm": 0.8578475713729858, + "learning_rate": 6.661592346020716e-06, + "loss": 1.7485, + "mean_token_accuracy": 0.5930900573730469, + "num_tokens": 6337113992.0, + "step": 12395 + }, + { + "epoch": 3.352082206598161, + "grad_norm": 0.9992987513542175, + "learning_rate": 6.660211177406631e-06, + "loss": 1.8699, + "mean_token_accuracy": 0.562760591506958, + "num_tokens": 6337638164.0, + "step": 12396 + }, + { + "epoch": 3.3523526230394807, + "grad_norm": 1.2043615579605103, + "learning_rate": 6.658830141950027e-06, + "loss": 1.8031, + "mean_token_accuracy": 0.5746790170669556, + "num_tokens": 6338162394.0, + "step": 12397 + }, + { + "epoch": 3.3526230394808003, + "grad_norm": 1.1507954597473145, + "learning_rate": 6.657449239693284e-06, + "loss": 1.7814, + "mean_token_accuracy": 0.5609496235847473, + "num_tokens": 6338686516.0, + "step": 12398 + }, + { + "epoch": 3.35289345592212, + "grad_norm": 0.9656370282173157, + "learning_rate": 6.6560684706787706e-06, + "loss": 1.8003, + "mean_token_accuracy": 0.5862345099449158, + "num_tokens": 6339185580.0, + "step": 12399 + }, + { + "epoch": 3.3531638723634396, + "grad_norm": 1.1487643718719482, + "learning_rate": 6.654687834948845e-06, + "loss": 1.9549, + "mean_token_accuracy": 0.5525586009025574, + "num_tokens": 6339668132.0, + "step": 12400 + }, + { + "epoch": 3.3534342888047592, + "grad_norm": 0.3915230929851532, + "learning_rate": 6.65330733254588e-06, + "loss": 1.1003, + "mean_token_accuracy": 0.692710280418396, + "num_tokens": 6340192314.0, + "step": 12401 + }, + { + "epoch": 3.353704705246079, + "grad_norm": 1.0301178693771362, + "learning_rate": 6.651926963512225e-06, + "loss": 1.8912, + "mean_token_accuracy": 0.5585334300994873, + "num_tokens": 6340716585.0, + "step": 12402 + }, + { + "epoch": 3.3539751216873985, + "grad_norm": 0.9726492762565613, + "learning_rate": 6.6505467278902425e-06, + "loss": 1.9535, + "mean_token_accuracy": 0.5562028884887695, + "num_tokens": 6341240741.0, + "step": 12403 + }, + { + "epoch": 3.354245538128718, + "grad_norm": 1.0257630348205566, + "learning_rate": 6.649166625722278e-06, + "loss": 1.7884, + "mean_token_accuracy": 0.5564096570014954, + "num_tokens": 6341764890.0, + "step": 12404 + }, + { + "epoch": 3.354515954570038, + "grad_norm": 0.9649797677993774, + "learning_rate": 6.64778665705067e-06, + "loss": 1.7983, + "mean_token_accuracy": 0.5476360321044922, + "num_tokens": 6342289024.0, + "step": 12405 + }, + { + "epoch": 3.3547863710113575, + "grad_norm": 0.9234764575958252, + "learning_rate": 6.646406821917772e-06, + "loss": 1.8312, + "mean_token_accuracy": 0.5724199414253235, + "num_tokens": 6342813305.0, + "step": 12406 + }, + { + "epoch": 3.355056787452677, + "grad_norm": 0.986788809299469, + "learning_rate": 6.64502712036591e-06, + "loss": 1.904, + "mean_token_accuracy": 0.5544732213020325, + "num_tokens": 6343337369.0, + "step": 12407 + }, + { + "epoch": 3.3553272038939967, + "grad_norm": 0.9038516283035278, + "learning_rate": 6.643647552437426e-06, + "loss": 1.7828, + "mean_token_accuracy": 0.5903650522232056, + "num_tokens": 6343749825.0, + "step": 12408 + }, + { + "epoch": 3.3555976203353164, + "grad_norm": 1.1640039682388306, + "learning_rate": 6.6422681181746485e-06, + "loss": 1.8748, + "mean_token_accuracy": 0.5695590972900391, + "num_tokens": 6344224588.0, + "step": 12409 + }, + { + "epoch": 3.355868036776636, + "grad_norm": 1.0364586114883423, + "learning_rate": 6.640888817619901e-06, + "loss": 1.7323, + "mean_token_accuracy": 0.5876842737197876, + "num_tokens": 6344748766.0, + "step": 12410 + }, + { + "epoch": 3.3561384532179557, + "grad_norm": 0.8991570472717285, + "learning_rate": 6.6395096508154985e-06, + "loss": 1.7627, + "mean_token_accuracy": 0.5868483781814575, + "num_tokens": 6345264288.0, + "step": 12411 + }, + { + "epoch": 3.3564088696592753, + "grad_norm": 0.9653941988945007, + "learning_rate": 6.638130617803769e-06, + "loss": 1.694, + "mean_token_accuracy": 0.6050718426704407, + "num_tokens": 6345788434.0, + "step": 12412 + }, + { + "epoch": 3.356679286100595, + "grad_norm": 0.9029567837715149, + "learning_rate": 6.636751718627018e-06, + "loss": 1.7615, + "mean_token_accuracy": 0.6079043745994568, + "num_tokens": 6346248245.0, + "step": 12413 + }, + { + "epoch": 3.3569497025419146, + "grad_norm": 1.0767618417739868, + "learning_rate": 6.635372953327552e-06, + "loss": 1.9089, + "mean_token_accuracy": 0.5577850341796875, + "num_tokens": 6346772420.0, + "step": 12414 + }, + { + "epoch": 3.3572201189832342, + "grad_norm": 0.9433240294456482, + "learning_rate": 6.633994321947683e-06, + "loss": 1.8997, + "mean_token_accuracy": 0.5724167823791504, + "num_tokens": 6347296628.0, + "step": 12415 + }, + { + "epoch": 3.357490535424554, + "grad_norm": 1.1632211208343506, + "learning_rate": 6.632615824529709e-06, + "loss": 1.9083, + "mean_token_accuracy": 0.5659995675086975, + "num_tokens": 6347820858.0, + "step": 12416 + }, + { + "epoch": 3.3577609518658735, + "grad_norm": 1.0775212049484253, + "learning_rate": 6.631237461115923e-06, + "loss": 1.8824, + "mean_token_accuracy": 0.5726543664932251, + "num_tokens": 6348344947.0, + "step": 12417 + }, + { + "epoch": 3.358031368307193, + "grad_norm": 0.8823354840278625, + "learning_rate": 6.62985923174862e-06, + "loss": 1.7161, + "mean_token_accuracy": 0.6175776720046997, + "num_tokens": 6348863691.0, + "step": 12418 + }, + { + "epoch": 3.358301784748513, + "grad_norm": 1.0216134786605835, + "learning_rate": 6.628481136470088e-06, + "loss": 1.7606, + "mean_token_accuracy": 0.58255934715271, + "num_tokens": 6349387510.0, + "step": 12419 + }, + { + "epoch": 3.3585722011898325, + "grad_norm": 1.0147525072097778, + "learning_rate": 6.627103175322607e-06, + "loss": 1.8506, + "mean_token_accuracy": 0.5869446992874146, + "num_tokens": 6349911606.0, + "step": 12420 + }, + { + "epoch": 3.358842617631152, + "grad_norm": 0.4087201654911041, + "learning_rate": 6.625725348348462e-06, + "loss": 1.0938, + "mean_token_accuracy": 0.7142592668533325, + "num_tokens": 6350430971.0, + "step": 12421 + }, + { + "epoch": 3.3591130340724717, + "grad_norm": 1.1498664617538452, + "learning_rate": 6.624347655589929e-06, + "loss": 1.7364, + "mean_token_accuracy": 0.5896379947662354, + "num_tokens": 6350951546.0, + "step": 12422 + }, + { + "epoch": 3.3593834505137914, + "grad_norm": 1.3135976791381836, + "learning_rate": 6.622970097089277e-06, + "loss": 1.7837, + "mean_token_accuracy": 0.6005272269248962, + "num_tokens": 6351411316.0, + "step": 12423 + }, + { + "epoch": 3.359653866955111, + "grad_norm": 1.0423219203948975, + "learning_rate": 6.621592672888775e-06, + "loss": 1.8783, + "mean_token_accuracy": 0.5711849927902222, + "num_tokens": 6351935576.0, + "step": 12424 + }, + { + "epoch": 3.3599242833964307, + "grad_norm": 0.9894624352455139, + "learning_rate": 6.620215383030681e-06, + "loss": 1.8922, + "mean_token_accuracy": 0.5715006589889526, + "num_tokens": 6352459774.0, + "step": 12425 + }, + { + "epoch": 3.3601946998377503, + "grad_norm": 0.9469849467277527, + "learning_rate": 6.618838227557267e-06, + "loss": 1.7674, + "mean_token_accuracy": 0.5982639193534851, + "num_tokens": 6352983919.0, + "step": 12426 + }, + { + "epoch": 3.3604651162790695, + "grad_norm": 0.9490047693252563, + "learning_rate": 6.617461206510778e-06, + "loss": 1.9043, + "mean_token_accuracy": 0.5538805723190308, + "num_tokens": 6353508202.0, + "step": 12427 + }, + { + "epoch": 3.3607355327203896, + "grad_norm": 0.8475382328033447, + "learning_rate": 6.616084319933465e-06, + "loss": 1.9712, + "mean_token_accuracy": 0.5522844195365906, + "num_tokens": 6354032312.0, + "step": 12428 + }, + { + "epoch": 3.361005949161709, + "grad_norm": 1.0624048709869385, + "learning_rate": 6.61470756786758e-06, + "loss": 1.8421, + "mean_token_accuracy": 0.5872514247894287, + "num_tokens": 6354548659.0, + "step": 12429 + }, + { + "epoch": 3.361276365603029, + "grad_norm": 1.0263241529464722, + "learning_rate": 6.613330950355362e-06, + "loss": 1.7709, + "mean_token_accuracy": 0.5855333209037781, + "num_tokens": 6355072748.0, + "step": 12430 + }, + { + "epoch": 3.361546782044348, + "grad_norm": 1.0610344409942627, + "learning_rate": 6.611954467439048e-06, + "loss": 1.8672, + "mean_token_accuracy": 0.5848199725151062, + "num_tokens": 6355596905.0, + "step": 12431 + }, + { + "epoch": 3.3618171984856677, + "grad_norm": 1.0566648244857788, + "learning_rate": 6.61057811916088e-06, + "loss": 1.8752, + "mean_token_accuracy": 0.6066321134567261, + "num_tokens": 6356009980.0, + "step": 12432 + }, + { + "epoch": 3.3620876149269874, + "grad_norm": 0.9440745711326599, + "learning_rate": 6.609201905563082e-06, + "loss": 1.8855, + "mean_token_accuracy": 0.567351222038269, + "num_tokens": 6356534075.0, + "step": 12433 + }, + { + "epoch": 3.362358031368307, + "grad_norm": 1.0498186349868774, + "learning_rate": 6.607825826687879e-06, + "loss": 1.7559, + "mean_token_accuracy": 0.5858209133148193, + "num_tokens": 6357058357.0, + "step": 12434 + }, + { + "epoch": 3.3626284478096267, + "grad_norm": 0.9216338396072388, + "learning_rate": 6.606449882577498e-06, + "loss": 1.7496, + "mean_token_accuracy": 0.587382435798645, + "num_tokens": 6357582608.0, + "step": 12435 + }, + { + "epoch": 3.3628988642509463, + "grad_norm": 1.0320651531219482, + "learning_rate": 6.605074073274158e-06, + "loss": 1.7872, + "mean_token_accuracy": 0.5853935480117798, + "num_tokens": 6358106809.0, + "step": 12436 + }, + { + "epoch": 3.363169280692266, + "grad_norm": 1.0117409229278564, + "learning_rate": 6.603698398820066e-06, + "loss": 1.9184, + "mean_token_accuracy": 0.5506048798561096, + "num_tokens": 6358630829.0, + "step": 12437 + }, + { + "epoch": 3.3634396971335856, + "grad_norm": 0.9835869669914246, + "learning_rate": 6.602322859257438e-06, + "loss": 1.8652, + "mean_token_accuracy": 0.5849940776824951, + "num_tokens": 6359154922.0, + "step": 12438 + }, + { + "epoch": 3.3637101135749052, + "grad_norm": 0.9742499589920044, + "learning_rate": 6.600947454628479e-06, + "loss": 1.8303, + "mean_token_accuracy": 0.5913788080215454, + "num_tokens": 6359522169.0, + "step": 12439 + }, + { + "epoch": 3.363980530016225, + "grad_norm": 1.1574605703353882, + "learning_rate": 6.599572184975387e-06, + "loss": 1.8537, + "mean_token_accuracy": 0.5921237468719482, + "num_tokens": 6360016872.0, + "step": 12440 + }, + { + "epoch": 3.3642509464575445, + "grad_norm": 0.3785788118839264, + "learning_rate": 6.598197050340361e-06, + "loss": 1.1723, + "mean_token_accuracy": 0.6972554922103882, + "num_tokens": 6360471911.0, + "step": 12441 + }, + { + "epoch": 3.364521362898864, + "grad_norm": 1.1989022493362427, + "learning_rate": 6.596822050765597e-06, + "loss": 1.7816, + "mean_token_accuracy": 0.5866934061050415, + "num_tokens": 6360996153.0, + "step": 12442 + }, + { + "epoch": 3.364791779340184, + "grad_norm": 1.0439624786376953, + "learning_rate": 6.595447186293277e-06, + "loss": 1.7272, + "mean_token_accuracy": 0.6022526025772095, + "num_tokens": 6361516843.0, + "step": 12443 + }, + { + "epoch": 3.3650621957815035, + "grad_norm": 0.9434425234794617, + "learning_rate": 6.594072456965592e-06, + "loss": 1.8911, + "mean_token_accuracy": 0.557585597038269, + "num_tokens": 6362041022.0, + "step": 12444 + }, + { + "epoch": 3.365332612222823, + "grad_norm": 1.0932718515396118, + "learning_rate": 6.592697862824724e-06, + "loss": 1.7574, + "mean_token_accuracy": 0.5718382596969604, + "num_tokens": 6362565231.0, + "step": 12445 + }, + { + "epoch": 3.3656030286641427, + "grad_norm": 1.096709132194519, + "learning_rate": 6.591323403912842e-06, + "loss": 1.7792, + "mean_token_accuracy": 0.5876489877700806, + "num_tokens": 6363075430.0, + "step": 12446 + }, + { + "epoch": 3.3658734451054624, + "grad_norm": 1.1881706714630127, + "learning_rate": 6.589949080272127e-06, + "loss": 1.9264, + "mean_token_accuracy": 0.5694143772125244, + "num_tokens": 6363546435.0, + "step": 12447 + }, + { + "epoch": 3.366143861546782, + "grad_norm": 0.9005026817321777, + "learning_rate": 6.588574891944739e-06, + "loss": 1.8946, + "mean_token_accuracy": 0.565424919128418, + "num_tokens": 6364070698.0, + "step": 12448 + }, + { + "epoch": 3.3664142779881017, + "grad_norm": 1.1196843385696411, + "learning_rate": 6.58720083897285e-06, + "loss": 1.7696, + "mean_token_accuracy": 0.5856910943984985, + "num_tokens": 6364594927.0, + "step": 12449 + }, + { + "epoch": 3.3666846944294213, + "grad_norm": 1.0506186485290527, + "learning_rate": 6.585826921398612e-06, + "loss": 1.9511, + "mean_token_accuracy": 0.5520222783088684, + "num_tokens": 6365119147.0, + "step": 12450 + }, + { + "epoch": 3.366955110870741, + "grad_norm": 0.9015061259269714, + "learning_rate": 6.584453139264187e-06, + "loss": 1.8451, + "mean_token_accuracy": 0.5865944027900696, + "num_tokens": 6365626680.0, + "step": 12451 + }, + { + "epoch": 3.3672255273120606, + "grad_norm": 0.8930845856666565, + "learning_rate": 6.583079492611728e-06, + "loss": 1.6332, + "mean_token_accuracy": 0.6310214996337891, + "num_tokens": 6366150941.0, + "step": 12452 + }, + { + "epoch": 3.3674959437533802, + "grad_norm": 1.055314302444458, + "learning_rate": 6.581705981483379e-06, + "loss": 1.6818, + "mean_token_accuracy": 0.585170567035675, + "num_tokens": 6366675109.0, + "step": 12453 + }, + { + "epoch": 3.3677663601947, + "grad_norm": 0.9371797442436218, + "learning_rate": 6.580332605921281e-06, + "loss": 1.8304, + "mean_token_accuracy": 0.5812430381774902, + "num_tokens": 6367199379.0, + "step": 12454 + }, + { + "epoch": 3.3680367766360195, + "grad_norm": 0.9846217036247253, + "learning_rate": 6.57895936596758e-06, + "loss": 1.7916, + "mean_token_accuracy": 0.6014229655265808, + "num_tokens": 6367668616.0, + "step": 12455 + }, + { + "epoch": 3.368307193077339, + "grad_norm": 0.9727652072906494, + "learning_rate": 6.577586261664406e-06, + "loss": 1.792, + "mean_token_accuracy": 0.5895006656646729, + "num_tokens": 6368192834.0, + "step": 12456 + }, + { + "epoch": 3.368577609518659, + "grad_norm": 1.0247174501419067, + "learning_rate": 6.576213293053888e-06, + "loss": 1.8759, + "mean_token_accuracy": 0.5852113962173462, + "num_tokens": 6368672118.0, + "step": 12457 + }, + { + "epoch": 3.3688480259599785, + "grad_norm": 0.9694790244102478, + "learning_rate": 6.574840460178159e-06, + "loss": 1.8115, + "mean_token_accuracy": 0.5715717673301697, + "num_tokens": 6369161269.0, + "step": 12458 + }, + { + "epoch": 3.369118442401298, + "grad_norm": 1.0700328350067139, + "learning_rate": 6.573467763079337e-06, + "loss": 1.8676, + "mean_token_accuracy": 0.5738397836685181, + "num_tokens": 6369668696.0, + "step": 12459 + }, + { + "epoch": 3.3693888588426177, + "grad_norm": 1.0791959762573242, + "learning_rate": 6.57209520179954e-06, + "loss": 1.8608, + "mean_token_accuracy": 0.5440990924835205, + "num_tokens": 6370192957.0, + "step": 12460 + }, + { + "epoch": 3.3696592752839374, + "grad_norm": 0.3917624354362488, + "learning_rate": 6.570722776380888e-06, + "loss": 1.1735, + "mean_token_accuracy": 0.6827735900878906, + "num_tokens": 6370717240.0, + "step": 12461 + }, + { + "epoch": 3.369929691725257, + "grad_norm": 0.9812541604042053, + "learning_rate": 6.569350486865481e-06, + "loss": 1.749, + "mean_token_accuracy": 0.5840418338775635, + "num_tokens": 6371241429.0, + "step": 12462 + }, + { + "epoch": 3.3702001081665767, + "grad_norm": 1.2584598064422607, + "learning_rate": 6.567978333295436e-06, + "loss": 1.8892, + "mean_token_accuracy": 0.5734800100326538, + "num_tokens": 6371765510.0, + "step": 12463 + }, + { + "epoch": 3.3704705246078963, + "grad_norm": 0.9241899847984314, + "learning_rate": 6.566606315712844e-06, + "loss": 1.8775, + "mean_token_accuracy": 0.5787370204925537, + "num_tokens": 6372248765.0, + "step": 12464 + }, + { + "epoch": 3.370740941049216, + "grad_norm": 0.9694809913635254, + "learning_rate": 6.565234434159814e-06, + "loss": 1.7314, + "mean_token_accuracy": 0.5983387231826782, + "num_tokens": 6372728169.0, + "step": 12465 + }, + { + "epoch": 3.3710113574905356, + "grad_norm": 0.9708573222160339, + "learning_rate": 6.563862688678427e-06, + "loss": 1.8064, + "mean_token_accuracy": 0.5894110798835754, + "num_tokens": 6373204604.0, + "step": 12466 + }, + { + "epoch": 3.3712817739318552, + "grad_norm": 0.8302178382873535, + "learning_rate": 6.5624910793107855e-06, + "loss": 1.809, + "mean_token_accuracy": 0.5873783826828003, + "num_tokens": 6373728830.0, + "step": 12467 + }, + { + "epoch": 3.3715521903731744, + "grad_norm": 1.1593365669250488, + "learning_rate": 6.561119606098965e-06, + "loss": 1.7036, + "mean_token_accuracy": 0.6037321090698242, + "num_tokens": 6374202353.0, + "step": 12468 + }, + { + "epoch": 3.3718226068144945, + "grad_norm": 1.153696060180664, + "learning_rate": 6.559748269085047e-06, + "loss": 1.8109, + "mean_token_accuracy": 0.6034797430038452, + "num_tokens": 6374726620.0, + "step": 12469 + }, + { + "epoch": 3.3720930232558137, + "grad_norm": 1.1240516901016235, + "learning_rate": 6.558377068311113e-06, + "loss": 2.0782, + "mean_token_accuracy": 0.5305864214897156, + "num_tokens": 6375247206.0, + "step": 12470 + }, + { + "epoch": 3.372363439697134, + "grad_norm": 0.9414496421813965, + "learning_rate": 6.557006003819227e-06, + "loss": 1.8545, + "mean_token_accuracy": 0.5752471685409546, + "num_tokens": 6375762878.0, + "step": 12471 + }, + { + "epoch": 3.372633856138453, + "grad_norm": 0.9257279634475708, + "learning_rate": 6.555635075651469e-06, + "loss": 1.8873, + "mean_token_accuracy": 0.5628931522369385, + "num_tokens": 6376287140.0, + "step": 12472 + }, + { + "epoch": 3.372904272579773, + "grad_norm": 0.9825400114059448, + "learning_rate": 6.554264283849896e-06, + "loss": 1.7386, + "mean_token_accuracy": 0.6127176880836487, + "num_tokens": 6376723273.0, + "step": 12473 + }, + { + "epoch": 3.3731746890210923, + "grad_norm": 0.887403130531311, + "learning_rate": 6.552893628456565e-06, + "loss": 1.7828, + "mean_token_accuracy": 0.5878534317016602, + "num_tokens": 6377247474.0, + "step": 12474 + }, + { + "epoch": 3.373445105462412, + "grad_norm": 1.002838373184204, + "learning_rate": 6.551523109513539e-06, + "loss": 1.846, + "mean_token_accuracy": 0.5720478296279907, + "num_tokens": 6377771749.0, + "step": 12475 + }, + { + "epoch": 3.3737155219037316, + "grad_norm": 0.8857751488685608, + "learning_rate": 6.5501527270628665e-06, + "loss": 1.6783, + "mean_token_accuracy": 0.613351583480835, + "num_tokens": 6378286295.0, + "step": 12476 + }, + { + "epoch": 3.3739859383450512, + "grad_norm": 1.0043483972549438, + "learning_rate": 6.54878248114659e-06, + "loss": 1.8282, + "mean_token_accuracy": 0.5794956088066101, + "num_tokens": 6378810486.0, + "step": 12477 + }, + { + "epoch": 3.374256354786371, + "grad_norm": 0.9009994268417358, + "learning_rate": 6.5474123718067585e-06, + "loss": 1.8106, + "mean_token_accuracy": 0.5649664998054504, + "num_tokens": 6379334610.0, + "step": 12478 + }, + { + "epoch": 3.3745267712276905, + "grad_norm": 0.9645952582359314, + "learning_rate": 6.546042399085413e-06, + "loss": 1.8852, + "mean_token_accuracy": 0.5702959299087524, + "num_tokens": 6379858746.0, + "step": 12479 + }, + { + "epoch": 3.37479718766901, + "grad_norm": 0.9623526334762573, + "learning_rate": 6.54467256302458e-06, + "loss": 1.8674, + "mean_token_accuracy": 0.5774691104888916, + "num_tokens": 6380377288.0, + "step": 12480 + }, + { + "epoch": 3.37506760411033, + "grad_norm": 0.4196200668811798, + "learning_rate": 6.543302863666301e-06, + "loss": 1.1092, + "mean_token_accuracy": 0.7094797492027283, + "num_tokens": 6380874266.0, + "step": 12481 + }, + { + "epoch": 3.3753380205516494, + "grad_norm": 1.0369975566864014, + "learning_rate": 6.541933301052594e-06, + "loss": 1.8928, + "mean_token_accuracy": 0.5729782581329346, + "num_tokens": 6381398514.0, + "step": 12482 + }, + { + "epoch": 3.375608436992969, + "grad_norm": 1.07134211063385, + "learning_rate": 6.540563875225481e-06, + "loss": 1.9064, + "mean_token_accuracy": 0.5584042072296143, + "num_tokens": 6381899072.0, + "step": 12483 + }, + { + "epoch": 3.3758788534342887, + "grad_norm": 0.9485875368118286, + "learning_rate": 6.539194586226986e-06, + "loss": 1.8885, + "mean_token_accuracy": 0.5612630844116211, + "num_tokens": 6382423330.0, + "step": 12484 + }, + { + "epoch": 3.3761492698756084, + "grad_norm": 0.9396169781684875, + "learning_rate": 6.537825434099121e-06, + "loss": 1.8412, + "mean_token_accuracy": 0.5869671702384949, + "num_tokens": 6382947485.0, + "step": 12485 + }, + { + "epoch": 3.376419686316928, + "grad_norm": 0.9572626352310181, + "learning_rate": 6.5364564188838875e-06, + "loss": 1.8435, + "mean_token_accuracy": 0.5804283022880554, + "num_tokens": 6383422252.0, + "step": 12486 + }, + { + "epoch": 3.3766901027582477, + "grad_norm": 1.035008430480957, + "learning_rate": 6.535087540623304e-06, + "loss": 1.7995, + "mean_token_accuracy": 0.591011643409729, + "num_tokens": 6383894781.0, + "step": 12487 + }, + { + "epoch": 3.3769605191995673, + "grad_norm": 1.0808953046798706, + "learning_rate": 6.533718799359363e-06, + "loss": 1.6435, + "mean_token_accuracy": 0.6358463764190674, + "num_tokens": 6384343676.0, + "step": 12488 + }, + { + "epoch": 3.377230935640887, + "grad_norm": 1.0440819263458252, + "learning_rate": 6.532350195134062e-06, + "loss": 1.9839, + "mean_token_accuracy": 0.5302449464797974, + "num_tokens": 6384854798.0, + "step": 12489 + }, + { + "epoch": 3.3775013520822066, + "grad_norm": 1.010686993598938, + "learning_rate": 6.5309817279894e-06, + "loss": 1.8581, + "mean_token_accuracy": 0.5802773833274841, + "num_tokens": 6385341208.0, + "step": 12490 + }, + { + "epoch": 3.3777717685235262, + "grad_norm": 1.3540468215942383, + "learning_rate": 6.5296133979673535e-06, + "loss": 1.8132, + "mean_token_accuracy": 0.5731761455535889, + "num_tokens": 6385807230.0, + "step": 12491 + }, + { + "epoch": 3.378042184964846, + "grad_norm": 1.0616859197616577, + "learning_rate": 6.528245205109918e-06, + "loss": 1.7866, + "mean_token_accuracy": 0.5774704217910767, + "num_tokens": 6386331333.0, + "step": 12492 + }, + { + "epoch": 3.3783126014061655, + "grad_norm": 1.0731374025344849, + "learning_rate": 6.5268771494590725e-06, + "loss": 1.9559, + "mean_token_accuracy": 0.5452101230621338, + "num_tokens": 6386855272.0, + "step": 12493 + }, + { + "epoch": 3.378583017847485, + "grad_norm": 1.0789525508880615, + "learning_rate": 6.5255092310567865e-06, + "loss": 1.8483, + "mean_token_accuracy": 0.5891528129577637, + "num_tokens": 6387368626.0, + "step": 12494 + }, + { + "epoch": 3.378853434288805, + "grad_norm": 0.9308044910430908, + "learning_rate": 6.52414144994504e-06, + "loss": 1.8055, + "mean_token_accuracy": 0.5772090554237366, + "num_tokens": 6387892881.0, + "step": 12495 + }, + { + "epoch": 3.3791238507301244, + "grad_norm": 0.8991695046424866, + "learning_rate": 6.522773806165797e-06, + "loss": 1.8661, + "mean_token_accuracy": 0.575569748878479, + "num_tokens": 6388417062.0, + "step": 12496 + }, + { + "epoch": 3.379394267171444, + "grad_norm": 0.972489595413208, + "learning_rate": 6.521406299761018e-06, + "loss": 1.8714, + "mean_token_accuracy": 0.5780667066574097, + "num_tokens": 6388941242.0, + "step": 12497 + }, + { + "epoch": 3.3796646836127637, + "grad_norm": 1.0848078727722168, + "learning_rate": 6.520038930772665e-06, + "loss": 1.6899, + "mean_token_accuracy": 0.6037091016769409, + "num_tokens": 6389429245.0, + "step": 12498 + }, + { + "epoch": 3.3799351000540834, + "grad_norm": 0.8547276258468628, + "learning_rate": 6.5186716992426935e-06, + "loss": 1.8405, + "mean_token_accuracy": 0.5818714499473572, + "num_tokens": 6389953517.0, + "step": 12499 + }, + { + "epoch": 3.380205516495403, + "grad_norm": 0.810540497303009, + "learning_rate": 6.517304605213051e-06, + "loss": 1.884, + "mean_token_accuracy": 0.568507730960846, + "num_tokens": 6390477800.0, + "step": 12500 + }, + { + "epoch": 3.3804759329367227, + "grad_norm": 0.39694175124168396, + "learning_rate": 6.515937648725688e-06, + "loss": 1.047, + "mean_token_accuracy": 0.7124416828155518, + "num_tokens": 6390969478.0, + "step": 12501 + }, + { + "epoch": 3.3807463493780423, + "grad_norm": 1.2699917554855347, + "learning_rate": 6.514570829822544e-06, + "loss": 1.7678, + "mean_token_accuracy": 0.5980079174041748, + "num_tokens": 6391458926.0, + "step": 12502 + }, + { + "epoch": 3.381016765819362, + "grad_norm": 1.1401256322860718, + "learning_rate": 6.513204148545555e-06, + "loss": 1.7978, + "mean_token_accuracy": 0.5997589230537415, + "num_tokens": 6391983205.0, + "step": 12503 + }, + { + "epoch": 3.3812871822606816, + "grad_norm": 1.017682433128357, + "learning_rate": 6.5118376049366606e-06, + "loss": 1.8148, + "mean_token_accuracy": 0.588456392288208, + "num_tokens": 6392465260.0, + "step": 12504 + }, + { + "epoch": 3.3815575987020012, + "grad_norm": 1.0554735660552979, + "learning_rate": 6.5104711990377844e-06, + "loss": 1.8507, + "mean_token_accuracy": 0.57330322265625, + "num_tokens": 6392989518.0, + "step": 12505 + }, + { + "epoch": 3.381828015143321, + "grad_norm": 1.2201083898544312, + "learning_rate": 6.509104930890858e-06, + "loss": 1.8629, + "mean_token_accuracy": 0.5704760551452637, + "num_tokens": 6393513773.0, + "step": 12506 + }, + { + "epoch": 3.3820984315846405, + "grad_norm": 1.031516194343567, + "learning_rate": 6.507738800537796e-06, + "loss": 1.8545, + "mean_token_accuracy": 0.5712927579879761, + "num_tokens": 6394037978.0, + "step": 12507 + }, + { + "epoch": 3.38236884802596, + "grad_norm": 1.1218570470809937, + "learning_rate": 6.5063728080205225e-06, + "loss": 2.0174, + "mean_token_accuracy": 0.5364283323287964, + "num_tokens": 6394562248.0, + "step": 12508 + }, + { + "epoch": 3.3826392644672794, + "grad_norm": 1.171078085899353, + "learning_rate": 6.505006953380941e-06, + "loss": 1.925, + "mean_token_accuracy": 0.5666740536689758, + "num_tokens": 6395086487.0, + "step": 12509 + }, + { + "epoch": 3.3829096809085994, + "grad_norm": 1.1337406635284424, + "learning_rate": 6.50364123666097e-06, + "loss": 1.8144, + "mean_token_accuracy": 0.5748022198677063, + "num_tokens": 6395610750.0, + "step": 12510 + }, + { + "epoch": 3.3831800973499186, + "grad_norm": 1.0066711902618408, + "learning_rate": 6.502275657902508e-06, + "loss": 1.966, + "mean_token_accuracy": 0.5533150434494019, + "num_tokens": 6396134896.0, + "step": 12511 + }, + { + "epoch": 3.3834505137912387, + "grad_norm": 1.056418776512146, + "learning_rate": 6.500910217147452e-06, + "loss": 1.9084, + "mean_token_accuracy": 0.5852903127670288, + "num_tokens": 6396597853.0, + "step": 12512 + }, + { + "epoch": 3.383720930232558, + "grad_norm": 1.6943819522857666, + "learning_rate": 6.499544914437705e-06, + "loss": 1.6104, + "mean_token_accuracy": 0.6477982997894287, + "num_tokens": 6397122051.0, + "step": 12513 + }, + { + "epoch": 3.383991346673878, + "grad_norm": 1.3133513927459717, + "learning_rate": 6.4981797498151566e-06, + "loss": 1.8195, + "mean_token_accuracy": 0.5963711738586426, + "num_tokens": 6397628576.0, + "step": 12514 + }, + { + "epoch": 3.384261763115197, + "grad_norm": 0.9748769402503967, + "learning_rate": 6.496814723321687e-06, + "loss": 1.7858, + "mean_token_accuracy": 0.5850681066513062, + "num_tokens": 6398152856.0, + "step": 12515 + }, + { + "epoch": 3.384532179556517, + "grad_norm": 0.9083310961723328, + "learning_rate": 6.49544983499919e-06, + "loss": 1.8563, + "mean_token_accuracy": 0.5688083171844482, + "num_tokens": 6398673569.0, + "step": 12516 + }, + { + "epoch": 3.3848025959978365, + "grad_norm": 1.1357090473175049, + "learning_rate": 6.494085084889534e-06, + "loss": 1.7713, + "mean_token_accuracy": 0.595597505569458, + "num_tokens": 6399197809.0, + "step": 12517 + }, + { + "epoch": 3.385073012439156, + "grad_norm": 1.1846891641616821, + "learning_rate": 6.492720473034603e-06, + "loss": 1.7642, + "mean_token_accuracy": 0.5982332229614258, + "num_tokens": 6399722042.0, + "step": 12518 + }, + { + "epoch": 3.385343428880476, + "grad_norm": 0.8943924903869629, + "learning_rate": 6.491355999476263e-06, + "loss": 1.8198, + "mean_token_accuracy": 0.5832219123840332, + "num_tokens": 6400246322.0, + "step": 12519 + }, + { + "epoch": 3.3856138453217954, + "grad_norm": 0.9842019081115723, + "learning_rate": 6.4899916642563765e-06, + "loss": 1.8832, + "mean_token_accuracy": 0.5570907592773438, + "num_tokens": 6400770483.0, + "step": 12520 + }, + { + "epoch": 3.385884261763115, + "grad_norm": 0.42860147356987, + "learning_rate": 6.488627467416808e-06, + "loss": 1.1067, + "mean_token_accuracy": 0.7080820798873901, + "num_tokens": 6401235512.0, + "step": 12521 + }, + { + "epoch": 3.3861546782044347, + "grad_norm": 1.340401291847229, + "learning_rate": 6.48726340899942e-06, + "loss": 1.9371, + "mean_token_accuracy": 0.5723963975906372, + "num_tokens": 6401759677.0, + "step": 12522 + }, + { + "epoch": 3.3864250946457544, + "grad_norm": 1.3979114294052124, + "learning_rate": 6.485899489046057e-06, + "loss": 1.8035, + "mean_token_accuracy": 0.5761296153068542, + "num_tokens": 6402226960.0, + "step": 12523 + }, + { + "epoch": 3.386695511087074, + "grad_norm": 0.8136826753616333, + "learning_rate": 6.484535707598578e-06, + "loss": 1.8888, + "mean_token_accuracy": 0.5570740103721619, + "num_tokens": 6402751140.0, + "step": 12524 + }, + { + "epoch": 3.3869659275283936, + "grad_norm": 0.8690895438194275, + "learning_rate": 6.483172064698822e-06, + "loss": 1.7524, + "mean_token_accuracy": 0.5938630104064941, + "num_tokens": 6403275189.0, + "step": 12525 + }, + { + "epoch": 3.3872363439697133, + "grad_norm": 1.314413070678711, + "learning_rate": 6.481808560388627e-06, + "loss": 1.8689, + "mean_token_accuracy": 0.5837680101394653, + "num_tokens": 6403799354.0, + "step": 12526 + }, + { + "epoch": 3.387506760411033, + "grad_norm": 1.338008165359497, + "learning_rate": 6.480445194709835e-06, + "loss": 1.9235, + "mean_token_accuracy": 0.5707152485847473, + "num_tokens": 6404267987.0, + "step": 12527 + }, + { + "epoch": 3.3877771768523526, + "grad_norm": 1.0183030366897583, + "learning_rate": 6.479081967704276e-06, + "loss": 1.8187, + "mean_token_accuracy": 0.5504436492919922, + "num_tokens": 6404792250.0, + "step": 12528 + }, + { + "epoch": 3.388047593293672, + "grad_norm": 1.0183696746826172, + "learning_rate": 6.477718879413775e-06, + "loss": 1.7931, + "mean_token_accuracy": 0.5877019166946411, + "num_tokens": 6405316486.0, + "step": 12529 + }, + { + "epoch": 3.388318009734992, + "grad_norm": 1.3878992795944214, + "learning_rate": 6.4763559298801605e-06, + "loss": 1.7807, + "mean_token_accuracy": 0.582054615020752, + "num_tokens": 6405840718.0, + "step": 12530 + }, + { + "epoch": 3.3885884261763115, + "grad_norm": 1.140569806098938, + "learning_rate": 6.474993119145248e-06, + "loss": 1.8455, + "mean_token_accuracy": 0.5670281648635864, + "num_tokens": 6406333265.0, + "step": 12531 + }, + { + "epoch": 3.388858842617631, + "grad_norm": 1.019810438156128, + "learning_rate": 6.4736304472508515e-06, + "loss": 1.8861, + "mean_token_accuracy": 0.5598207116127014, + "num_tokens": 6406857524.0, + "step": 12532 + }, + { + "epoch": 3.389129259058951, + "grad_norm": 1.1079521179199219, + "learning_rate": 6.472267914238788e-06, + "loss": 1.7105, + "mean_token_accuracy": 0.5832539796829224, + "num_tokens": 6407381700.0, + "step": 12533 + }, + { + "epoch": 3.3893996755002704, + "grad_norm": 1.1936652660369873, + "learning_rate": 6.4709055201508564e-06, + "loss": 1.8453, + "mean_token_accuracy": 0.591701865196228, + "num_tokens": 6407905945.0, + "step": 12534 + }, + { + "epoch": 3.38967009194159, + "grad_norm": 0.9316319823265076, + "learning_rate": 6.469543265028863e-06, + "loss": 1.7912, + "mean_token_accuracy": 0.5878374576568604, + "num_tokens": 6408430124.0, + "step": 12535 + }, + { + "epoch": 3.3899405083829097, + "grad_norm": 0.9187266230583191, + "learning_rate": 6.4681811489146075e-06, + "loss": 1.7574, + "mean_token_accuracy": 0.5921944379806519, + "num_tokens": 6408954410.0, + "step": 12536 + }, + { + "epoch": 3.3902109248242294, + "grad_norm": 1.0419282913208008, + "learning_rate": 6.466819171849883e-06, + "loss": 1.8673, + "mean_token_accuracy": 0.5537092685699463, + "num_tokens": 6409478631.0, + "step": 12537 + }, + { + "epoch": 3.390481341265549, + "grad_norm": 0.9513346552848816, + "learning_rate": 6.465457333876473e-06, + "loss": 1.9596, + "mean_token_accuracy": 0.5431346893310547, + "num_tokens": 6410002804.0, + "step": 12538 + }, + { + "epoch": 3.3907517577068687, + "grad_norm": 0.8753924369812012, + "learning_rate": 6.464095635036172e-06, + "loss": 1.7473, + "mean_token_accuracy": 0.5817320346832275, + "num_tokens": 6410527011.0, + "step": 12539 + }, + { + "epoch": 3.3910221741481883, + "grad_norm": 1.0590190887451172, + "learning_rate": 6.462734075370751e-06, + "loss": 1.7898, + "mean_token_accuracy": 0.5911383628845215, + "num_tokens": 6410991441.0, + "step": 12540 + }, + { + "epoch": 3.391292590589508, + "grad_norm": 0.35137873888015747, + "learning_rate": 6.461372654921997e-06, + "loss": 1.1351, + "mean_token_accuracy": 0.6963497400283813, + "num_tokens": 6411515657.0, + "step": 12541 + }, + { + "epoch": 3.3915630070308276, + "grad_norm": 1.1384236812591553, + "learning_rate": 6.460011373731676e-06, + "loss": 1.8611, + "mean_token_accuracy": 0.5786103010177612, + "num_tokens": 6412039767.0, + "step": 12542 + }, + { + "epoch": 3.3918334234721472, + "grad_norm": 1.2884739637374878, + "learning_rate": 6.4586502318415536e-06, + "loss": 1.8205, + "mean_token_accuracy": 0.578274130821228, + "num_tokens": 6412534606.0, + "step": 12543 + }, + { + "epoch": 3.392103839913467, + "grad_norm": 1.0306015014648438, + "learning_rate": 6.457289229293402e-06, + "loss": 1.7406, + "mean_token_accuracy": 0.5744290351867676, + "num_tokens": 6413058810.0, + "step": 12544 + }, + { + "epoch": 3.3923742563547865, + "grad_norm": 1.261422038078308, + "learning_rate": 6.455928366128975e-06, + "loss": 1.8673, + "mean_token_accuracy": 0.5834473371505737, + "num_tokens": 6413520223.0, + "step": 12545 + }, + { + "epoch": 3.392644672796106, + "grad_norm": 1.2176352739334106, + "learning_rate": 6.454567642390026e-06, + "loss": 1.7935, + "mean_token_accuracy": 0.5898817777633667, + "num_tokens": 6414044482.0, + "step": 12546 + }, + { + "epoch": 3.392915089237426, + "grad_norm": 1.1140081882476807, + "learning_rate": 6.453207058118311e-06, + "loss": 1.9754, + "mean_token_accuracy": 0.5529196262359619, + "num_tokens": 6414548963.0, + "step": 12547 + }, + { + "epoch": 3.3931855056787454, + "grad_norm": 1.0101736783981323, + "learning_rate": 6.4518466133555745e-06, + "loss": 1.8376, + "mean_token_accuracy": 0.5726162195205688, + "num_tokens": 6415073225.0, + "step": 12548 + }, + { + "epoch": 3.393455922120065, + "grad_norm": 0.9640277028083801, + "learning_rate": 6.45048630814356e-06, + "loss": 1.9036, + "mean_token_accuracy": 0.5653443336486816, + "num_tokens": 6415597447.0, + "step": 12549 + }, + { + "epoch": 3.3937263385613847, + "grad_norm": 1.1261154413223267, + "learning_rate": 6.449126142524002e-06, + "loss": 1.7812, + "mean_token_accuracy": 0.5816556215286255, + "num_tokens": 6416121678.0, + "step": 12550 + }, + { + "epoch": 3.3939967550027044, + "grad_norm": 0.8496822714805603, + "learning_rate": 6.447766116538641e-06, + "loss": 1.8253, + "mean_token_accuracy": 0.57439786195755, + "num_tokens": 6416585727.0, + "step": 12551 + }, + { + "epoch": 3.3942671714440236, + "grad_norm": 1.3270195722579956, + "learning_rate": 6.446406230229198e-06, + "loss": 1.8913, + "mean_token_accuracy": 0.5764094591140747, + "num_tokens": 6417109869.0, + "step": 12552 + }, + { + "epoch": 3.3945375878853437, + "grad_norm": 0.9592174291610718, + "learning_rate": 6.445046483637409e-06, + "loss": 1.8693, + "mean_token_accuracy": 0.584743320941925, + "num_tokens": 6417634150.0, + "step": 12553 + }, + { + "epoch": 3.394808004326663, + "grad_norm": 0.9014103412628174, + "learning_rate": 6.443686876804986e-06, + "loss": 1.6734, + "mean_token_accuracy": 0.5953797101974487, + "num_tokens": 6418158267.0, + "step": 12554 + }, + { + "epoch": 3.395078420767983, + "grad_norm": 1.1022658348083496, + "learning_rate": 6.442327409773648e-06, + "loss": 1.8274, + "mean_token_accuracy": 0.5749372243881226, + "num_tokens": 6418682530.0, + "step": 12555 + }, + { + "epoch": 3.395348837209302, + "grad_norm": 1.2980037927627563, + "learning_rate": 6.440968082585111e-06, + "loss": 1.828, + "mean_token_accuracy": 0.5841351747512817, + "num_tokens": 6419206651.0, + "step": 12556 + }, + { + "epoch": 3.395619253650622, + "grad_norm": 0.8633845448493958, + "learning_rate": 6.43960889528108e-06, + "loss": 1.8964, + "mean_token_accuracy": 0.5695233345031738, + "num_tokens": 6419730888.0, + "step": 12557 + }, + { + "epoch": 3.3958896700919414, + "grad_norm": 0.9157905578613281, + "learning_rate": 6.438249847903257e-06, + "loss": 1.7388, + "mean_token_accuracy": 0.5899493098258972, + "num_tokens": 6420255150.0, + "step": 12558 + }, + { + "epoch": 3.396160086533261, + "grad_norm": 1.212864875793457, + "learning_rate": 6.436890940493346e-06, + "loss": 1.8606, + "mean_token_accuracy": 0.5782605409622192, + "num_tokens": 6420779264.0, + "step": 12559 + }, + { + "epoch": 3.3964305029745807, + "grad_norm": 1.0870232582092285, + "learning_rate": 6.43553217309304e-06, + "loss": 1.8097, + "mean_token_accuracy": 0.5711367130279541, + "num_tokens": 6421303498.0, + "step": 12560 + }, + { + "epoch": 3.3967009194159004, + "grad_norm": 0.38460201025009155, + "learning_rate": 6.434173545744029e-06, + "loss": 1.0946, + "mean_token_accuracy": 0.707998514175415, + "num_tokens": 6421827578.0, + "step": 12561 + }, + { + "epoch": 3.39697133585722, + "grad_norm": 1.4801652431488037, + "learning_rate": 6.432815058487999e-06, + "loss": 1.8753, + "mean_token_accuracy": 0.5538938641548157, + "num_tokens": 6422351861.0, + "step": 12562 + }, + { + "epoch": 3.3972417522985396, + "grad_norm": 1.4042127132415771, + "learning_rate": 6.431456711366637e-06, + "loss": 1.8637, + "mean_token_accuracy": 0.5673108696937561, + "num_tokens": 6422876130.0, + "step": 12563 + }, + { + "epoch": 3.3975121687398593, + "grad_norm": 0.9070652723312378, + "learning_rate": 6.430098504421616e-06, + "loss": 1.8537, + "mean_token_accuracy": 0.5716394782066345, + "num_tokens": 6423400297.0, + "step": 12564 + }, + { + "epoch": 3.397782585181179, + "grad_norm": 1.018464207649231, + "learning_rate": 6.428740437694616e-06, + "loss": 1.6888, + "mean_token_accuracy": 0.5965285301208496, + "num_tokens": 6423881272.0, + "step": 12565 + }, + { + "epoch": 3.3980530016224986, + "grad_norm": 1.2759002447128296, + "learning_rate": 6.427382511227298e-06, + "loss": 1.7764, + "mean_token_accuracy": 0.5821468830108643, + "num_tokens": 6424405479.0, + "step": 12566 + }, + { + "epoch": 3.398323418063818, + "grad_norm": 1.251775860786438, + "learning_rate": 6.426024725061336e-06, + "loss": 1.9966, + "mean_token_accuracy": 0.5512001514434814, + "num_tokens": 6424929739.0, + "step": 12567 + }, + { + "epoch": 3.398593834505138, + "grad_norm": 1.240989089012146, + "learning_rate": 6.424667079238387e-06, + "loss": 1.8237, + "mean_token_accuracy": 0.5668403506278992, + "num_tokens": 6425453840.0, + "step": 12568 + }, + { + "epoch": 3.3988642509464575, + "grad_norm": 1.34033203125, + "learning_rate": 6.423309573800103e-06, + "loss": 1.8925, + "mean_token_accuracy": 0.5720775127410889, + "num_tokens": 6425978099.0, + "step": 12569 + }, + { + "epoch": 3.399134667387777, + "grad_norm": 1.123120903968811, + "learning_rate": 6.421952208788144e-06, + "loss": 1.882, + "mean_token_accuracy": 0.580249547958374, + "num_tokens": 6426453976.0, + "step": 12570 + }, + { + "epoch": 3.399405083829097, + "grad_norm": 0.9441984295845032, + "learning_rate": 6.420594984244153e-06, + "loss": 1.8684, + "mean_token_accuracy": 0.569648265838623, + "num_tokens": 6426978132.0, + "step": 12571 + }, + { + "epoch": 3.3996755002704164, + "grad_norm": 1.2138245105743408, + "learning_rate": 6.419237900209774e-06, + "loss": 1.8209, + "mean_token_accuracy": 0.5801592469215393, + "num_tokens": 6427502103.0, + "step": 12572 + }, + { + "epoch": 3.399945916711736, + "grad_norm": 1.1774603128433228, + "learning_rate": 6.41788095672665e-06, + "loss": 1.7006, + "mean_token_accuracy": 0.5944875478744507, + "num_tokens": 6428026340.0, + "step": 12573 + }, + { + "epoch": 3.4002163331530557, + "grad_norm": 1.0346853733062744, + "learning_rate": 6.416524153836412e-06, + "loss": 1.8362, + "mean_token_accuracy": 0.5495536923408508, + "num_tokens": 6428550574.0, + "step": 12574 + }, + { + "epoch": 3.4004867495943754, + "grad_norm": 1.1990946531295776, + "learning_rate": 6.41516749158069e-06, + "loss": 1.9073, + "mean_token_accuracy": 0.5559887886047363, + "num_tokens": 6429074832.0, + "step": 12575 + }, + { + "epoch": 3.400757166035695, + "grad_norm": 1.1589107513427734, + "learning_rate": 6.413810970001116e-06, + "loss": 1.7565, + "mean_token_accuracy": 0.5874416828155518, + "num_tokens": 6429537031.0, + "step": 12576 + }, + { + "epoch": 3.4010275824770146, + "grad_norm": 1.0003938674926758, + "learning_rate": 6.412454589139304e-06, + "loss": 1.6155, + "mean_token_accuracy": 0.6272682547569275, + "num_tokens": 6430061256.0, + "step": 12577 + }, + { + "epoch": 3.4012979989183343, + "grad_norm": 0.8622764945030212, + "learning_rate": 6.4110983490368774e-06, + "loss": 1.8093, + "mean_token_accuracy": 0.5874640941619873, + "num_tokens": 6430585480.0, + "step": 12578 + }, + { + "epoch": 3.401568415359654, + "grad_norm": 1.0921789407730103, + "learning_rate": 6.409742249735451e-06, + "loss": 1.7633, + "mean_token_accuracy": 0.5720916986465454, + "num_tokens": 6431109724.0, + "step": 12579 + }, + { + "epoch": 3.4018388318009736, + "grad_norm": 1.0252529382705688, + "learning_rate": 6.408386291276632e-06, + "loss": 1.8227, + "mean_token_accuracy": 0.5887261033058167, + "num_tokens": 6431607209.0, + "step": 12580 + }, + { + "epoch": 3.402109248242293, + "grad_norm": 0.4503048062324524, + "learning_rate": 6.4070304737020215e-06, + "loss": 0.9868, + "mean_token_accuracy": 0.7406026124954224, + "num_tokens": 6432131475.0, + "step": 12581 + }, + { + "epoch": 3.402379664683613, + "grad_norm": 1.0242664813995361, + "learning_rate": 6.405674797053227e-06, + "loss": 1.9139, + "mean_token_accuracy": 0.5770986676216125, + "num_tokens": 6432655695.0, + "step": 12582 + }, + { + "epoch": 3.4026500811249325, + "grad_norm": 0.9398022890090942, + "learning_rate": 6.40431926137184e-06, + "loss": 1.8222, + "mean_token_accuracy": 0.5703660845756531, + "num_tokens": 6433179904.0, + "step": 12583 + }, + { + "epoch": 3.402920497566252, + "grad_norm": 1.0543254613876343, + "learning_rate": 6.40296386669945e-06, + "loss": 1.8923, + "mean_token_accuracy": 0.5790339708328247, + "num_tokens": 6433704049.0, + "step": 12584 + }, + { + "epoch": 3.403190914007572, + "grad_norm": 0.9639050364494324, + "learning_rate": 6.40160861307765e-06, + "loss": 1.9478, + "mean_token_accuracy": 0.5529144406318665, + "num_tokens": 6434228331.0, + "step": 12585 + }, + { + "epoch": 3.4034613304488914, + "grad_norm": 0.9191656708717346, + "learning_rate": 6.400253500548018e-06, + "loss": 1.8203, + "mean_token_accuracy": 0.5734750032424927, + "num_tokens": 6434752513.0, + "step": 12586 + }, + { + "epoch": 3.403731746890211, + "grad_norm": 1.2085031270980835, + "learning_rate": 6.39889852915214e-06, + "loss": 1.8346, + "mean_token_accuracy": 0.592449426651001, + "num_tokens": 6435276719.0, + "step": 12587 + }, + { + "epoch": 3.4040021633315307, + "grad_norm": 1.331153392791748, + "learning_rate": 6.3975436989315855e-06, + "loss": 1.817, + "mean_token_accuracy": 0.5898559093475342, + "num_tokens": 6435798909.0, + "step": 12588 + }, + { + "epoch": 3.4042725797728504, + "grad_norm": 1.1727360486984253, + "learning_rate": 6.39618900992792e-06, + "loss": 1.8555, + "mean_token_accuracy": 0.5824544429779053, + "num_tokens": 6436323114.0, + "step": 12589 + }, + { + "epoch": 3.40454299621417, + "grad_norm": 1.266458511352539, + "learning_rate": 6.394834462182721e-06, + "loss": 1.9199, + "mean_token_accuracy": 0.5480670928955078, + "num_tokens": 6436841900.0, + "step": 12590 + }, + { + "epoch": 3.4048134126554896, + "grad_norm": 1.3620214462280273, + "learning_rate": 6.393480055737538e-06, + "loss": 1.9159, + "mean_token_accuracy": 0.5739458799362183, + "num_tokens": 6437335549.0, + "step": 12591 + }, + { + "epoch": 3.4050838290968093, + "grad_norm": 1.4575978517532349, + "learning_rate": 6.392125790633939e-06, + "loss": 1.845, + "mean_token_accuracy": 0.5789391994476318, + "num_tokens": 6437859814.0, + "step": 12592 + }, + { + "epoch": 3.4053542455381285, + "grad_norm": 1.0033963918685913, + "learning_rate": 6.390771666913466e-06, + "loss": 1.8994, + "mean_token_accuracy": 0.5704209804534912, + "num_tokens": 6438359355.0, + "step": 12593 + }, + { + "epoch": 3.4056246619794486, + "grad_norm": 1.324704885482788, + "learning_rate": 6.389417684617678e-06, + "loss": 1.8485, + "mean_token_accuracy": 0.5801680088043213, + "num_tokens": 6438824148.0, + "step": 12594 + }, + { + "epoch": 3.4058950784207678, + "grad_norm": 1.6324400901794434, + "learning_rate": 6.38806384378811e-06, + "loss": 1.885, + "mean_token_accuracy": 0.5785346031188965, + "num_tokens": 6439315191.0, + "step": 12595 + }, + { + "epoch": 3.406165494862088, + "grad_norm": 1.243538737297058, + "learning_rate": 6.386710144466311e-06, + "loss": 1.8004, + "mean_token_accuracy": 0.5708601474761963, + "num_tokens": 6439839455.0, + "step": 12596 + }, + { + "epoch": 3.406435911303407, + "grad_norm": 1.0179414749145508, + "learning_rate": 6.3853565866938095e-06, + "loss": 1.8083, + "mean_token_accuracy": 0.5872608423233032, + "num_tokens": 6440354297.0, + "step": 12597 + }, + { + "epoch": 3.4067063277447267, + "grad_norm": 1.2385786771774292, + "learning_rate": 6.384003170512136e-06, + "loss": 1.9112, + "mean_token_accuracy": 0.5505588054656982, + "num_tokens": 6440850224.0, + "step": 12598 + }, + { + "epoch": 3.4069767441860463, + "grad_norm": 1.2031821012496948, + "learning_rate": 6.382649895962825e-06, + "loss": 1.7572, + "mean_token_accuracy": 0.583244800567627, + "num_tokens": 6441374485.0, + "step": 12599 + }, + { + "epoch": 3.407247160627366, + "grad_norm": 1.1565008163452148, + "learning_rate": 6.381296763087392e-06, + "loss": 1.8237, + "mean_token_accuracy": 0.5669523477554321, + "num_tokens": 6441898657.0, + "step": 12600 + }, + { + "epoch": 3.4075175770686856, + "grad_norm": 0.35352063179016113, + "learning_rate": 6.379943771927354e-06, + "loss": 1.0803, + "mean_token_accuracy": 0.7098283767700195, + "num_tokens": 6442422684.0, + "step": 12601 + }, + { + "epoch": 3.4077879935100053, + "grad_norm": 1.4025704860687256, + "learning_rate": 6.378590922524231e-06, + "loss": 1.801, + "mean_token_accuracy": 0.601068377494812, + "num_tokens": 6442946890.0, + "step": 12602 + }, + { + "epoch": 3.408058409951325, + "grad_norm": 1.3630625009536743, + "learning_rate": 6.37723821491953e-06, + "loss": 1.9376, + "mean_token_accuracy": 0.5759186744689941, + "num_tokens": 6443442573.0, + "step": 12603 + }, + { + "epoch": 3.4083288263926446, + "grad_norm": 1.2159595489501953, + "learning_rate": 6.375885649154751e-06, + "loss": 1.8694, + "mean_token_accuracy": 0.5701078176498413, + "num_tokens": 6443966767.0, + "step": 12604 + }, + { + "epoch": 3.408599242833964, + "grad_norm": 1.1577577590942383, + "learning_rate": 6.3745332252714e-06, + "loss": 1.8059, + "mean_token_accuracy": 0.5673246383666992, + "num_tokens": 6444465192.0, + "step": 12605 + }, + { + "epoch": 3.408869659275284, + "grad_norm": 1.2320510149002075, + "learning_rate": 6.373180943310975e-06, + "loss": 1.7241, + "mean_token_accuracy": 0.5970817804336548, + "num_tokens": 6444981862.0, + "step": 12606 + }, + { + "epoch": 3.4091400757166035, + "grad_norm": 1.1750224828720093, + "learning_rate": 6.3718288033149625e-06, + "loss": 1.7859, + "mean_token_accuracy": 0.5831577777862549, + "num_tokens": 6445506014.0, + "step": 12607 + }, + { + "epoch": 3.409410492157923, + "grad_norm": 1.0857808589935303, + "learning_rate": 6.370476805324855e-06, + "loss": 1.843, + "mean_token_accuracy": 0.5833685994148254, + "num_tokens": 6445939896.0, + "step": 12608 + }, + { + "epoch": 3.4096809085992428, + "grad_norm": 1.256089210510254, + "learning_rate": 6.369124949382131e-06, + "loss": 1.8963, + "mean_token_accuracy": 0.5713164806365967, + "num_tokens": 6446464037.0, + "step": 12609 + }, + { + "epoch": 3.4099513250405624, + "grad_norm": 1.2559949159622192, + "learning_rate": 6.3677732355282765e-06, + "loss": 1.997, + "mean_token_accuracy": 0.5770169496536255, + "num_tokens": 6446924050.0, + "step": 12610 + }, + { + "epoch": 3.410221741481882, + "grad_norm": 0.906690776348114, + "learning_rate": 6.36642166380476e-06, + "loss": 1.9356, + "mean_token_accuracy": 0.5634670853614807, + "num_tokens": 6447448330.0, + "step": 12611 + }, + { + "epoch": 3.4104921579232017, + "grad_norm": 1.0365281105041504, + "learning_rate": 6.36507023425305e-06, + "loss": 1.7982, + "mean_token_accuracy": 0.5713387727737427, + "num_tokens": 6447972503.0, + "step": 12612 + }, + { + "epoch": 3.4107625743645213, + "grad_norm": 1.011281967163086, + "learning_rate": 6.363718946914619e-06, + "loss": 1.8725, + "mean_token_accuracy": 0.557838499546051, + "num_tokens": 6448496680.0, + "step": 12613 + }, + { + "epoch": 3.411032990805841, + "grad_norm": 1.0874284505844116, + "learning_rate": 6.3623678018309264e-06, + "loss": 1.8911, + "mean_token_accuracy": 0.5466659069061279, + "num_tokens": 6449020872.0, + "step": 12614 + }, + { + "epoch": 3.4113034072471606, + "grad_norm": 0.9409394860267639, + "learning_rate": 6.361016799043423e-06, + "loss": 1.8611, + "mean_token_accuracy": 0.570396363735199, + "num_tokens": 6449488598.0, + "step": 12615 + }, + { + "epoch": 3.4115738236884803, + "grad_norm": 1.0716577768325806, + "learning_rate": 6.359665938593571e-06, + "loss": 1.7927, + "mean_token_accuracy": 0.5896217823028564, + "num_tokens": 6449948603.0, + "step": 12616 + }, + { + "epoch": 3.4118442401298, + "grad_norm": 1.2622566223144531, + "learning_rate": 6.358315220522815e-06, + "loss": 1.83, + "mean_token_accuracy": 0.5764709711074829, + "num_tokens": 6450472883.0, + "step": 12617 + }, + { + "epoch": 3.4121146565711196, + "grad_norm": 1.107603669166565, + "learning_rate": 6.356964644872594e-06, + "loss": 1.8886, + "mean_token_accuracy": 0.5761828422546387, + "num_tokens": 6450997107.0, + "step": 12618 + }, + { + "epoch": 3.412385073012439, + "grad_norm": 0.9922882914543152, + "learning_rate": 6.355614211684356e-06, + "loss": 1.866, + "mean_token_accuracy": 0.5654329061508179, + "num_tokens": 6451521347.0, + "step": 12619 + }, + { + "epoch": 3.412655489453759, + "grad_norm": 9.993624687194824, + "learning_rate": 6.3542639209995275e-06, + "loss": 1.7025, + "mean_token_accuracy": 0.6059809923171997, + "num_tokens": 6452045596.0, + "step": 12620 + }, + { + "epoch": 3.4129259058950785, + "grad_norm": 0.4003320634365082, + "learning_rate": 6.3529137728595455e-06, + "loss": 1.0835, + "mean_token_accuracy": 0.7197566628456116, + "num_tokens": 6452463396.0, + "step": 12621 + }, + { + "epoch": 3.413196322336398, + "grad_norm": 2.2175543308258057, + "learning_rate": 6.351563767305837e-06, + "loss": 1.8757, + "mean_token_accuracy": 0.5782192945480347, + "num_tokens": 6452971588.0, + "step": 12622 + }, + { + "epoch": 3.413466738777718, + "grad_norm": 1.7837333679199219, + "learning_rate": 6.350213904379824e-06, + "loss": 1.705, + "mean_token_accuracy": 0.6005357503890991, + "num_tokens": 6453495833.0, + "step": 12623 + }, + { + "epoch": 3.4137371552190374, + "grad_norm": 1.4753872156143188, + "learning_rate": 6.348864184122918e-06, + "loss": 1.8768, + "mean_token_accuracy": 0.5634968280792236, + "num_tokens": 6454020008.0, + "step": 12624 + }, + { + "epoch": 3.414007571660357, + "grad_norm": 1.182786226272583, + "learning_rate": 6.347514606576541e-06, + "loss": 1.913, + "mean_token_accuracy": 0.562585711479187, + "num_tokens": 6454544273.0, + "step": 12625 + }, + { + "epoch": 3.4142779881016767, + "grad_norm": 1.0597246885299683, + "learning_rate": 6.346165171782097e-06, + "loss": 1.6998, + "mean_token_accuracy": 0.5973186492919922, + "num_tokens": 6455040025.0, + "step": 12626 + }, + { + "epoch": 3.4145484045429964, + "grad_norm": 1.4033582210540771, + "learning_rate": 6.34481587978099e-06, + "loss": 1.8287, + "mean_token_accuracy": 0.5912758708000183, + "num_tokens": 6455502492.0, + "step": 12627 + }, + { + "epoch": 3.414818820984316, + "grad_norm": 1.4843733310699463, + "learning_rate": 6.343466730614622e-06, + "loss": 1.6671, + "mean_token_accuracy": 0.6172187328338623, + "num_tokens": 6456026695.0, + "step": 12628 + }, + { + "epoch": 3.4150892374256356, + "grad_norm": 1.2534362077713013, + "learning_rate": 6.342117724324392e-06, + "loss": 1.8246, + "mean_token_accuracy": 0.5875241756439209, + "num_tokens": 6456541944.0, + "step": 12629 + }, + { + "epoch": 3.4153596538669553, + "grad_norm": 1.3508429527282715, + "learning_rate": 6.340768860951685e-06, + "loss": 1.9242, + "mean_token_accuracy": 0.5635325908660889, + "num_tokens": 6457066170.0, + "step": 12630 + }, + { + "epoch": 3.415630070308275, + "grad_norm": 1.2562382221221924, + "learning_rate": 6.339420140537893e-06, + "loss": 1.9326, + "mean_token_accuracy": 0.558151364326477, + "num_tokens": 6457590409.0, + "step": 12631 + }, + { + "epoch": 3.4159004867495946, + "grad_norm": 1.0408412218093872, + "learning_rate": 6.3380715631243925e-06, + "loss": 1.7829, + "mean_token_accuracy": 0.590538501739502, + "num_tokens": 6458056783.0, + "step": 12632 + }, + { + "epoch": 3.416170903190914, + "grad_norm": 0.99189692735672, + "learning_rate": 6.336723128752571e-06, + "loss": 1.7696, + "mean_token_accuracy": 0.5577312707901001, + "num_tokens": 6458580888.0, + "step": 12633 + }, + { + "epoch": 3.4164413196322334, + "grad_norm": 1.158033847808838, + "learning_rate": 6.335374837463795e-06, + "loss": 1.8085, + "mean_token_accuracy": 0.5762078762054443, + "num_tokens": 6459060025.0, + "step": 12634 + }, + { + "epoch": 3.4167117360735535, + "grad_norm": 1.0348578691482544, + "learning_rate": 6.334026689299435e-06, + "loss": 1.8536, + "mean_token_accuracy": 0.5637711882591248, + "num_tokens": 6459526917.0, + "step": 12635 + }, + { + "epoch": 3.4169821525148727, + "grad_norm": 1.279605507850647, + "learning_rate": 6.3326786843008616e-06, + "loss": 1.9756, + "mean_token_accuracy": 0.554036557674408, + "num_tokens": 6460051136.0, + "step": 12636 + }, + { + "epoch": 3.417252568956193, + "grad_norm": 1.0507551431655884, + "learning_rate": 6.331330822509434e-06, + "loss": 1.8775, + "mean_token_accuracy": 0.5594767332077026, + "num_tokens": 6460575256.0, + "step": 12637 + }, + { + "epoch": 3.417522985397512, + "grad_norm": 1.0152326822280884, + "learning_rate": 6.329983103966502e-06, + "loss": 1.8393, + "mean_token_accuracy": 0.5668784379959106, + "num_tokens": 6461099529.0, + "step": 12638 + }, + { + "epoch": 3.4177934018388316, + "grad_norm": 1.049107313156128, + "learning_rate": 6.328635528713423e-06, + "loss": 1.8413, + "mean_token_accuracy": 0.5768851637840271, + "num_tokens": 6461574696.0, + "step": 12639 + }, + { + "epoch": 3.4180638182801513, + "grad_norm": 1.0509587526321411, + "learning_rate": 6.327288096791546e-06, + "loss": 1.7911, + "mean_token_accuracy": 0.582438588142395, + "num_tokens": 6462075184.0, + "step": 12640 + }, + { + "epoch": 3.418334234721471, + "grad_norm": 0.3822070062160492, + "learning_rate": 6.325940808242208e-06, + "loss": 1.0968, + "mean_token_accuracy": 0.7002335786819458, + "num_tokens": 6462599461.0, + "step": 12641 + }, + { + "epoch": 3.4186046511627906, + "grad_norm": 1.081944227218628, + "learning_rate": 6.324593663106755e-06, + "loss": 1.9026, + "mean_token_accuracy": 0.6022703647613525, + "num_tokens": 6463058185.0, + "step": 12642 + }, + { + "epoch": 3.41887506760411, + "grad_norm": 1.0879229307174683, + "learning_rate": 6.323246661426516e-06, + "loss": 1.6708, + "mean_token_accuracy": 0.5965583920478821, + "num_tokens": 6463523366.0, + "step": 12643 + }, + { + "epoch": 3.41914548404543, + "grad_norm": 1.1333695650100708, + "learning_rate": 6.321899803242822e-06, + "loss": 1.9214, + "mean_token_accuracy": 0.5379263162612915, + "num_tokens": 6464047545.0, + "step": 12644 + }, + { + "epoch": 3.4194159004867495, + "grad_norm": 1.0419479608535767, + "learning_rate": 6.3205530885970026e-06, + "loss": 1.7805, + "mean_token_accuracy": 0.6081206798553467, + "num_tokens": 6464534266.0, + "step": 12645 + }, + { + "epoch": 3.419686316928069, + "grad_norm": 0.9695984721183777, + "learning_rate": 6.319206517530374e-06, + "loss": 1.9186, + "mean_token_accuracy": 0.5689451694488525, + "num_tokens": 6465058502.0, + "step": 12646 + }, + { + "epoch": 3.4199567333693888, + "grad_norm": 0.8732296824455261, + "learning_rate": 6.3178600900842535e-06, + "loss": 1.7642, + "mean_token_accuracy": 0.5838441252708435, + "num_tokens": 6465582612.0, + "step": 12647 + }, + { + "epoch": 3.4202271498107084, + "grad_norm": 0.9253832101821899, + "learning_rate": 6.316513806299953e-06, + "loss": 1.9506, + "mean_token_accuracy": 0.5493627786636353, + "num_tokens": 6466106850.0, + "step": 12648 + }, + { + "epoch": 3.420497566252028, + "grad_norm": 1.0189287662506104, + "learning_rate": 6.315167666218786e-06, + "loss": 1.8562, + "mean_token_accuracy": 0.5845428705215454, + "num_tokens": 6466591402.0, + "step": 12649 + }, + { + "epoch": 3.4207679826933477, + "grad_norm": 0.9060746431350708, + "learning_rate": 6.3138216698820494e-06, + "loss": 1.8018, + "mean_token_accuracy": 0.580358624458313, + "num_tokens": 6467115605.0, + "step": 12650 + }, + { + "epoch": 3.4210383991346673, + "grad_norm": 0.8913055658340454, + "learning_rate": 6.312475817331049e-06, + "loss": 1.6995, + "mean_token_accuracy": 0.5999401807785034, + "num_tokens": 6467639649.0, + "step": 12651 + }, + { + "epoch": 3.421308815575987, + "grad_norm": 1.0204732418060303, + "learning_rate": 6.311130108607075e-06, + "loss": 1.7374, + "mean_token_accuracy": 0.5826207399368286, + "num_tokens": 6468163906.0, + "step": 12652 + }, + { + "epoch": 3.4215792320173066, + "grad_norm": 1.0629621744155884, + "learning_rate": 6.309784543751415e-06, + "loss": 1.8715, + "mean_token_accuracy": 0.584945797920227, + "num_tokens": 6468681854.0, + "step": 12653 + }, + { + "epoch": 3.4218496484586263, + "grad_norm": 0.982410192489624, + "learning_rate": 6.308439122805361e-06, + "loss": 1.8307, + "mean_token_accuracy": 0.5816628336906433, + "num_tokens": 6469145865.0, + "step": 12654 + }, + { + "epoch": 3.422120064899946, + "grad_norm": 0.8973724246025085, + "learning_rate": 6.30709384581019e-06, + "loss": 1.7409, + "mean_token_accuracy": 0.5994420051574707, + "num_tokens": 6469670064.0, + "step": 12655 + }, + { + "epoch": 3.4223904813412656, + "grad_norm": 1.0554580688476562, + "learning_rate": 6.30574871280718e-06, + "loss": 1.8453, + "mean_token_accuracy": 0.5741096138954163, + "num_tokens": 6470194245.0, + "step": 12656 + }, + { + "epoch": 3.422660897782585, + "grad_norm": 0.9994350671768188, + "learning_rate": 6.304403723837609e-06, + "loss": 1.8048, + "mean_token_accuracy": 0.5736645460128784, + "num_tokens": 6470718429.0, + "step": 12657 + }, + { + "epoch": 3.422931314223905, + "grad_norm": 1.0184746980667114, + "learning_rate": 6.303058878942734e-06, + "loss": 1.8393, + "mean_token_accuracy": 0.5637683272361755, + "num_tokens": 6471242642.0, + "step": 12658 + }, + { + "epoch": 3.4232017306652245, + "grad_norm": 0.9459787607192993, + "learning_rate": 6.301714178163831e-06, + "loss": 1.8078, + "mean_token_accuracy": 0.5829190611839294, + "num_tokens": 6471766655.0, + "step": 12659 + }, + { + "epoch": 3.423472147106544, + "grad_norm": 0.8312410712242126, + "learning_rate": 6.300369621542152e-06, + "loss": 1.8832, + "mean_token_accuracy": 0.5661815404891968, + "num_tokens": 6472289337.0, + "step": 12660 + }, + { + "epoch": 3.4237425635478638, + "grad_norm": 0.37053707242012024, + "learning_rate": 6.299025209118952e-06, + "loss": 1.1537, + "mean_token_accuracy": 0.6897704601287842, + "num_tokens": 6472813479.0, + "step": 12661 + }, + { + "epoch": 3.4240129799891834, + "grad_norm": 0.9899501800537109, + "learning_rate": 6.297680940935482e-06, + "loss": 1.8333, + "mean_token_accuracy": 0.5913300514221191, + "num_tokens": 6473287755.0, + "step": 12662 + }, + { + "epoch": 3.424283396430503, + "grad_norm": 1.115157127380371, + "learning_rate": 6.2963368170329955e-06, + "loss": 1.8738, + "mean_token_accuracy": 0.5838571190834045, + "num_tokens": 6473810797.0, + "step": 12663 + }, + { + "epoch": 3.4245538128718227, + "grad_norm": 0.9586417078971863, + "learning_rate": 6.294992837452721e-06, + "loss": 1.926, + "mean_token_accuracy": 0.5524704456329346, + "num_tokens": 6474334847.0, + "step": 12664 + }, + { + "epoch": 3.4248242293131423, + "grad_norm": 0.8278956413269043, + "learning_rate": 6.293649002235908e-06, + "loss": 1.8008, + "mean_token_accuracy": 0.5736955404281616, + "num_tokens": 6474859077.0, + "step": 12665 + }, + { + "epoch": 3.425094645754462, + "grad_norm": 0.9247307777404785, + "learning_rate": 6.2923053114237835e-06, + "loss": 1.8371, + "mean_token_accuracy": 0.5779088735580444, + "num_tokens": 6475361403.0, + "step": 12666 + }, + { + "epoch": 3.4253650621957816, + "grad_norm": 1.0625933408737183, + "learning_rate": 6.2909617650575725e-06, + "loss": 1.9558, + "mean_token_accuracy": 0.584180474281311, + "num_tokens": 6475820474.0, + "step": 12667 + }, + { + "epoch": 3.4256354786371013, + "grad_norm": 1.100611686706543, + "learning_rate": 6.289618363178508e-06, + "loss": 1.9637, + "mean_token_accuracy": 0.5603085160255432, + "num_tokens": 6476344628.0, + "step": 12668 + }, + { + "epoch": 3.425905895078421, + "grad_norm": 0.9047320485115051, + "learning_rate": 6.288275105827804e-06, + "loss": 1.849, + "mean_token_accuracy": 0.5746946334838867, + "num_tokens": 6476868731.0, + "step": 12669 + }, + { + "epoch": 3.4261763115197406, + "grad_norm": 1.1708227396011353, + "learning_rate": 6.286931993046672e-06, + "loss": 1.8285, + "mean_token_accuracy": 0.5878936052322388, + "num_tokens": 6477382529.0, + "step": 12670 + }, + { + "epoch": 3.42644672796106, + "grad_norm": 1.1502599716186523, + "learning_rate": 6.285589024876329e-06, + "loss": 1.8355, + "mean_token_accuracy": 0.5689594745635986, + "num_tokens": 6477906640.0, + "step": 12671 + }, + { + "epoch": 3.42671714440238, + "grad_norm": 1.07468843460083, + "learning_rate": 6.28424620135798e-06, + "loss": 1.904, + "mean_token_accuracy": 0.5530227422714233, + "num_tokens": 6478430917.0, + "step": 12672 + }, + { + "epoch": 3.4269875608436995, + "grad_norm": 0.9789605736732483, + "learning_rate": 6.282903522532823e-06, + "loss": 1.7606, + "mean_token_accuracy": 0.6054764986038208, + "num_tokens": 6478955107.0, + "step": 12673 + }, + { + "epoch": 3.427257977285019, + "grad_norm": 1.0094472169876099, + "learning_rate": 6.281560988442061e-06, + "loss": 1.8241, + "mean_token_accuracy": 0.5843281149864197, + "num_tokens": 6479479366.0, + "step": 12674 + }, + { + "epoch": 3.4275283937263383, + "grad_norm": 1.0390160083770752, + "learning_rate": 6.28021859912688e-06, + "loss": 1.8715, + "mean_token_accuracy": 0.5634051561355591, + "num_tokens": 6480003516.0, + "step": 12675 + }, + { + "epoch": 3.4277988101676584, + "grad_norm": 1.088498592376709, + "learning_rate": 6.278876354628476e-06, + "loss": 1.8967, + "mean_token_accuracy": 0.5650238990783691, + "num_tokens": 6480527665.0, + "step": 12676 + }, + { + "epoch": 3.4280692266089776, + "grad_norm": 1.10226309299469, + "learning_rate": 6.277534254988026e-06, + "loss": 1.8457, + "mean_token_accuracy": 0.5836054086685181, + "num_tokens": 6481017528.0, + "step": 12677 + }, + { + "epoch": 3.4283396430502977, + "grad_norm": 0.9429371356964111, + "learning_rate": 6.276192300246716e-06, + "loss": 1.73, + "mean_token_accuracy": 0.5913166999816895, + "num_tokens": 6481541787.0, + "step": 12678 + }, + { + "epoch": 3.428610059491617, + "grad_norm": 1.0348883867263794, + "learning_rate": 6.274850490445716e-06, + "loss": 1.7967, + "mean_token_accuracy": 0.5884716510772705, + "num_tokens": 6482065878.0, + "step": 12679 + }, + { + "epoch": 3.4288804759329365, + "grad_norm": 1.1188452243804932, + "learning_rate": 6.2735088256262e-06, + "loss": 1.7405, + "mean_token_accuracy": 0.5942580699920654, + "num_tokens": 6482533812.0, + "step": 12680 + }, + { + "epoch": 3.429150892374256, + "grad_norm": 0.3690231740474701, + "learning_rate": 6.272167305829331e-06, + "loss": 1.0921, + "mean_token_accuracy": 0.7071349620819092, + "num_tokens": 6483057509.0, + "step": 12681 + }, + { + "epoch": 3.429421308815576, + "grad_norm": 1.2039963006973267, + "learning_rate": 6.270825931096276e-06, + "loss": 1.8171, + "mean_token_accuracy": 0.5769526362419128, + "num_tokens": 6483581606.0, + "step": 12682 + }, + { + "epoch": 3.4296917252568955, + "grad_norm": 1.5166655778884888, + "learning_rate": 6.269484701468189e-06, + "loss": 1.981, + "mean_token_accuracy": 0.5583779811859131, + "num_tokens": 6484105858.0, + "step": 12683 + }, + { + "epoch": 3.429962141698215, + "grad_norm": 1.0308781862258911, + "learning_rate": 6.26814361698622e-06, + "loss": 1.8907, + "mean_token_accuracy": 0.5760483741760254, + "num_tokens": 6484629993.0, + "step": 12684 + }, + { + "epoch": 3.4302325581395348, + "grad_norm": 0.9456843137741089, + "learning_rate": 6.266802677691522e-06, + "loss": 1.7977, + "mean_token_accuracy": 0.5925583839416504, + "num_tokens": 6485154234.0, + "step": 12685 + }, + { + "epoch": 3.4305029745808544, + "grad_norm": 1.0769023895263672, + "learning_rate": 6.265461883625239e-06, + "loss": 1.8459, + "mean_token_accuracy": 0.5819829702377319, + "num_tokens": 6485637073.0, + "step": 12686 + }, + { + "epoch": 3.430773391022174, + "grad_norm": 1.0352168083190918, + "learning_rate": 6.264121234828504e-06, + "loss": 1.8794, + "mean_token_accuracy": 0.5935028791427612, + "num_tokens": 6486088018.0, + "step": 12687 + }, + { + "epoch": 3.4310438074634937, + "grad_norm": 0.9553185701370239, + "learning_rate": 6.2627807313424614e-06, + "loss": 1.8716, + "mean_token_accuracy": 0.5791293382644653, + "num_tokens": 6486587351.0, + "step": 12688 + }, + { + "epoch": 3.4313142239048133, + "grad_norm": 1.1825944185256958, + "learning_rate": 6.261440373208236e-06, + "loss": 1.6441, + "mean_token_accuracy": 0.6095558404922485, + "num_tokens": 6487111614.0, + "step": 12689 + }, + { + "epoch": 3.431584640346133, + "grad_norm": 1.2099436521530151, + "learning_rate": 6.260100160466953e-06, + "loss": 1.8691, + "mean_token_accuracy": 0.5964252948760986, + "num_tokens": 6487545491.0, + "step": 12690 + }, + { + "epoch": 3.4318550567874526, + "grad_norm": 0.9137313961982727, + "learning_rate": 6.258760093159736e-06, + "loss": 1.9329, + "mean_token_accuracy": 0.5516756772994995, + "num_tokens": 6488069587.0, + "step": 12691 + }, + { + "epoch": 3.4321254732287723, + "grad_norm": 0.9585081338882446, + "learning_rate": 6.257420171327706e-06, + "loss": 1.8164, + "mean_token_accuracy": 0.5753315091133118, + "num_tokens": 6488593764.0, + "step": 12692 + }, + { + "epoch": 3.432395889670092, + "grad_norm": 0.9632935523986816, + "learning_rate": 6.256080395011967e-06, + "loss": 1.7168, + "mean_token_accuracy": 0.5943968296051025, + "num_tokens": 6489118022.0, + "step": 12693 + }, + { + "epoch": 3.4326663061114115, + "grad_norm": 1.0149203538894653, + "learning_rate": 6.254740764253637e-06, + "loss": 1.9388, + "mean_token_accuracy": 0.5498978495597839, + "num_tokens": 6489642124.0, + "step": 12694 + }, + { + "epoch": 3.432936722552731, + "grad_norm": 0.8757550716400146, + "learning_rate": 6.253401279093816e-06, + "loss": 1.8438, + "mean_token_accuracy": 0.5781573057174683, + "num_tokens": 6490166404.0, + "step": 12695 + }, + { + "epoch": 3.433207138994051, + "grad_norm": 0.9977027773857117, + "learning_rate": 6.252061939573597e-06, + "loss": 1.8679, + "mean_token_accuracy": 0.5889279246330261, + "num_tokens": 6490643029.0, + "step": 12696 + }, + { + "epoch": 3.4334775554353705, + "grad_norm": 1.2695908546447754, + "learning_rate": 6.250722745734086e-06, + "loss": 1.7942, + "mean_token_accuracy": 0.5861244201660156, + "num_tokens": 6491123643.0, + "step": 12697 + }, + { + "epoch": 3.43374797187669, + "grad_norm": 1.004684329032898, + "learning_rate": 6.249383697616367e-06, + "loss": 1.8865, + "mean_token_accuracy": 0.5677350759506226, + "num_tokens": 6491647914.0, + "step": 12698 + }, + { + "epoch": 3.4340183883180098, + "grad_norm": 0.9159122109413147, + "learning_rate": 6.248044795261525e-06, + "loss": 1.91, + "mean_token_accuracy": 0.5483312606811523, + "num_tokens": 6492172187.0, + "step": 12699 + }, + { + "epoch": 3.4342888047593294, + "grad_norm": 0.9170072078704834, + "learning_rate": 6.246706038710644e-06, + "loss": 1.9598, + "mean_token_accuracy": 0.5457615256309509, + "num_tokens": 6492696471.0, + "step": 12700 + }, + { + "epoch": 3.434559221200649, + "grad_norm": 0.38070324063301086, + "learning_rate": 6.245367428004803e-06, + "loss": 1.142, + "mean_token_accuracy": 0.7025915384292603, + "num_tokens": 6493156347.0, + "step": 12701 + }, + { + "epoch": 3.4348296376419687, + "grad_norm": 1.0937641859054565, + "learning_rate": 6.244028963185067e-06, + "loss": 1.8901, + "mean_token_accuracy": 0.560428261756897, + "num_tokens": 6493680610.0, + "step": 12702 + }, + { + "epoch": 3.4351000540832883, + "grad_norm": 1.0800427198410034, + "learning_rate": 6.2426906442925126e-06, + "loss": 1.847, + "mean_token_accuracy": 0.5758914947509766, + "num_tokens": 6494187843.0, + "step": 12703 + }, + { + "epoch": 3.435370470524608, + "grad_norm": 0.8953331112861633, + "learning_rate": 6.241352471368195e-06, + "loss": 1.8511, + "mean_token_accuracy": 0.572819709777832, + "num_tokens": 6494712069.0, + "step": 12704 + }, + { + "epoch": 3.4356408869659276, + "grad_norm": 0.8814824819564819, + "learning_rate": 6.2400144444531786e-06, + "loss": 1.7749, + "mean_token_accuracy": 0.5939240455627441, + "num_tokens": 6495236177.0, + "step": 12705 + }, + { + "epoch": 3.4359113034072473, + "grad_norm": 1.2172174453735352, + "learning_rate": 6.238676563588519e-06, + "loss": 1.8005, + "mean_token_accuracy": 0.5770190954208374, + "num_tokens": 6495760211.0, + "step": 12706 + }, + { + "epoch": 3.436181719848567, + "grad_norm": 0.9042717218399048, + "learning_rate": 6.237338828815264e-06, + "loss": 1.7731, + "mean_token_accuracy": 0.5794434547424316, + "num_tokens": 6496284456.0, + "step": 12707 + }, + { + "epoch": 3.4364521362898865, + "grad_norm": 0.9620829224586487, + "learning_rate": 6.236001240174462e-06, + "loss": 1.829, + "mean_token_accuracy": 0.5598047971725464, + "num_tokens": 6496808714.0, + "step": 12708 + }, + { + "epoch": 3.436722552731206, + "grad_norm": 1.0374549627304077, + "learning_rate": 6.234663797707152e-06, + "loss": 1.8572, + "mean_token_accuracy": 0.5824460983276367, + "num_tokens": 6497332984.0, + "step": 12709 + }, + { + "epoch": 3.436992969172526, + "grad_norm": 1.0195279121398926, + "learning_rate": 6.233326501454367e-06, + "loss": 1.9186, + "mean_token_accuracy": 0.540416955947876, + "num_tokens": 6497857234.0, + "step": 12710 + }, + { + "epoch": 3.4372633856138455, + "grad_norm": 1.0008025169372559, + "learning_rate": 6.231989351457147e-06, + "loss": 1.8188, + "mean_token_accuracy": 0.5616590976715088, + "num_tokens": 6498381490.0, + "step": 12711 + }, + { + "epoch": 3.437533802055165, + "grad_norm": 1.0311429500579834, + "learning_rate": 6.230652347756515e-06, + "loss": 1.7904, + "mean_token_accuracy": 0.5870825052261353, + "num_tokens": 6498905631.0, + "step": 12712 + }, + { + "epoch": 3.4378042184964848, + "grad_norm": 0.922221839427948, + "learning_rate": 6.229315490393491e-06, + "loss": 1.6688, + "mean_token_accuracy": 0.6053746938705444, + "num_tokens": 6499330516.0, + "step": 12713 + }, + { + "epoch": 3.4380746349378044, + "grad_norm": 1.0319799184799194, + "learning_rate": 6.2279787794091005e-06, + "loss": 1.9441, + "mean_token_accuracy": 0.5548298358917236, + "num_tokens": 6499854719.0, + "step": 12714 + }, + { + "epoch": 3.438345051379124, + "grad_norm": 1.0585638284683228, + "learning_rate": 6.226642214844356e-06, + "loss": 1.9565, + "mean_token_accuracy": 0.5501680374145508, + "num_tokens": 6500378897.0, + "step": 12715 + }, + { + "epoch": 3.4386154678204432, + "grad_norm": 0.8731220960617065, + "learning_rate": 6.225305796740262e-06, + "loss": 1.9251, + "mean_token_accuracy": 0.5545678734779358, + "num_tokens": 6500903055.0, + "step": 12716 + }, + { + "epoch": 3.4388858842617633, + "grad_norm": 0.8755658268928528, + "learning_rate": 6.2239695251378295e-06, + "loss": 1.7087, + "mean_token_accuracy": 0.5848438739776611, + "num_tokens": 6501427321.0, + "step": 12717 + }, + { + "epoch": 3.4391563007030825, + "grad_norm": 1.0194995403289795, + "learning_rate": 6.222633400078056e-06, + "loss": 1.8134, + "mean_token_accuracy": 0.5801411867141724, + "num_tokens": 6501933008.0, + "step": 12718 + }, + { + "epoch": 3.4394267171444026, + "grad_norm": 1.0553001165390015, + "learning_rate": 6.221297421601941e-06, + "loss": 2.0166, + "mean_token_accuracy": 0.5528554916381836, + "num_tokens": 6502375292.0, + "step": 12719 + }, + { + "epoch": 3.439697133585722, + "grad_norm": 0.9776422381401062, + "learning_rate": 6.219961589750472e-06, + "loss": 1.7116, + "mean_token_accuracy": 0.5859785676002502, + "num_tokens": 6502878536.0, + "step": 12720 + }, + { + "epoch": 3.4399675500270415, + "grad_norm": 0.39694175124168396, + "learning_rate": 6.2186259045646415e-06, + "loss": 1.069, + "mean_token_accuracy": 0.7021729946136475, + "num_tokens": 6503402752.0, + "step": 12721 + }, + { + "epoch": 3.440237966468361, + "grad_norm": 1.2729063034057617, + "learning_rate": 6.217290366085426e-06, + "loss": 1.8901, + "mean_token_accuracy": 0.5841712355613708, + "num_tokens": 6503926972.0, + "step": 12722 + }, + { + "epoch": 3.4405083829096808, + "grad_norm": 1.2965402603149414, + "learning_rate": 6.2159549743538105e-06, + "loss": 1.9099, + "mean_token_accuracy": 0.5712454319000244, + "num_tokens": 6504451114.0, + "step": 12723 + }, + { + "epoch": 3.4407787993510004, + "grad_norm": 1.1694461107254028, + "learning_rate": 6.214619729410766e-06, + "loss": 1.8448, + "mean_token_accuracy": 0.5786219835281372, + "num_tokens": 6504975313.0, + "step": 12724 + }, + { + "epoch": 3.44104921579232, + "grad_norm": 1.066819667816162, + "learning_rate": 6.213284631297256e-06, + "loss": 1.7876, + "mean_token_accuracy": 0.5754281878471375, + "num_tokens": 6505499496.0, + "step": 12725 + }, + { + "epoch": 3.4413196322336397, + "grad_norm": 1.030275583267212, + "learning_rate": 6.211949680054257e-06, + "loss": 1.8498, + "mean_token_accuracy": 0.5902363061904907, + "num_tokens": 6506023670.0, + "step": 12726 + }, + { + "epoch": 3.4415900486749593, + "grad_norm": 1.1484113931655884, + "learning_rate": 6.210614875722716e-06, + "loss": 1.9198, + "mean_token_accuracy": 0.556513786315918, + "num_tokens": 6506547834.0, + "step": 12727 + }, + { + "epoch": 3.441860465116279, + "grad_norm": 1.053414225578308, + "learning_rate": 6.2092802183436e-06, + "loss": 1.8351, + "mean_token_accuracy": 0.5842797756195068, + "num_tokens": 6507071947.0, + "step": 12728 + }, + { + "epoch": 3.4421308815575986, + "grad_norm": 1.0191506147384644, + "learning_rate": 6.207945707957854e-06, + "loss": 1.9709, + "mean_token_accuracy": 0.545665979385376, + "num_tokens": 6507596186.0, + "step": 12729 + }, + { + "epoch": 3.4424012979989183, + "grad_norm": 1.0314295291900635, + "learning_rate": 6.2066113446064236e-06, + "loss": 1.9264, + "mean_token_accuracy": 0.5666851997375488, + "num_tokens": 6508118130.0, + "step": 12730 + }, + { + "epoch": 3.442671714440238, + "grad_norm": 0.9777849912643433, + "learning_rate": 6.205277128330258e-06, + "loss": 1.8956, + "mean_token_accuracy": 0.5642305612564087, + "num_tokens": 6508589326.0, + "step": 12731 + }, + { + "epoch": 3.4429421308815575, + "grad_norm": 1.0121777057647705, + "learning_rate": 6.203943059170286e-06, + "loss": 1.8401, + "mean_token_accuracy": 0.5787343382835388, + "num_tokens": 6509113580.0, + "step": 12732 + }, + { + "epoch": 3.443212547322877, + "grad_norm": 1.2055988311767578, + "learning_rate": 6.20260913716745e-06, + "loss": 1.9097, + "mean_token_accuracy": 0.5499426126480103, + "num_tokens": 6509637796.0, + "step": 12733 + }, + { + "epoch": 3.443482963764197, + "grad_norm": 0.9693697690963745, + "learning_rate": 6.201275362362669e-06, + "loss": 1.806, + "mean_token_accuracy": 0.5833014249801636, + "num_tokens": 6510162060.0, + "step": 12734 + }, + { + "epoch": 3.4437533802055165, + "grad_norm": 0.9027139544487, + "learning_rate": 6.199941734796877e-06, + "loss": 1.8377, + "mean_token_accuracy": 0.5649985074996948, + "num_tokens": 6510678825.0, + "step": 12735 + }, + { + "epoch": 3.444023796646836, + "grad_norm": 1.1457328796386719, + "learning_rate": 6.198608254510983e-06, + "loss": 1.8774, + "mean_token_accuracy": 0.5761681795120239, + "num_tokens": 6511158534.0, + "step": 12736 + }, + { + "epoch": 3.4442942130881558, + "grad_norm": 0.9334420561790466, + "learning_rate": 6.197274921545913e-06, + "loss": 1.8391, + "mean_token_accuracy": 0.5775363445281982, + "num_tokens": 6511682671.0, + "step": 12737 + }, + { + "epoch": 3.4445646295294754, + "grad_norm": 1.0180145502090454, + "learning_rate": 6.195941735942571e-06, + "loss": 1.822, + "mean_token_accuracy": 0.5995039939880371, + "num_tokens": 6512131078.0, + "step": 12738 + }, + { + "epoch": 3.444835045970795, + "grad_norm": 1.1355804204940796, + "learning_rate": 6.1946086977418616e-06, + "loss": 1.8761, + "mean_token_accuracy": 0.5596075654029846, + "num_tokens": 6512655280.0, + "step": 12739 + }, + { + "epoch": 3.4451054624121147, + "grad_norm": 0.9086018800735474, + "learning_rate": 6.193275806984693e-06, + "loss": 1.7497, + "mean_token_accuracy": 0.5804264545440674, + "num_tokens": 6513179504.0, + "step": 12740 + }, + { + "epoch": 3.4453758788534343, + "grad_norm": 0.4623399078845978, + "learning_rate": 6.1919430637119574e-06, + "loss": 1.0932, + "mean_token_accuracy": 0.7078135013580322, + "num_tokens": 6513660376.0, + "step": 12741 + }, + { + "epoch": 3.445646295294754, + "grad_norm": 0.9485539793968201, + "learning_rate": 6.190610467964545e-06, + "loss": 1.8519, + "mean_token_accuracy": 0.5797038078308105, + "num_tokens": 6514158270.0, + "step": 12742 + }, + { + "epoch": 3.4459167117360736, + "grad_norm": 1.0044857263565063, + "learning_rate": 6.1892780197833515e-06, + "loss": 1.8874, + "mean_token_accuracy": 0.5663114786148071, + "num_tokens": 6514682305.0, + "step": 12743 + }, + { + "epoch": 3.4461871281773933, + "grad_norm": 0.83604496717453, + "learning_rate": 6.187945719209254e-06, + "loss": 1.6945, + "mean_token_accuracy": 0.6013225317001343, + "num_tokens": 6515206577.0, + "step": 12744 + }, + { + "epoch": 3.446457544618713, + "grad_norm": 0.9035013914108276, + "learning_rate": 6.1866135662831306e-06, + "loss": 1.8651, + "mean_token_accuracy": 0.5534889698028564, + "num_tokens": 6515687205.0, + "step": 12745 + }, + { + "epoch": 3.4467279610600325, + "grad_norm": 6.800789833068848, + "learning_rate": 6.185281561045862e-06, + "loss": 1.6906, + "mean_token_accuracy": 0.6030198931694031, + "num_tokens": 6516211304.0, + "step": 12746 + }, + { + "epoch": 3.446998377501352, + "grad_norm": 1.3798061609268188, + "learning_rate": 6.18394970353831e-06, + "loss": 1.9046, + "mean_token_accuracy": 0.531519889831543, + "num_tokens": 6516735464.0, + "step": 12747 + }, + { + "epoch": 3.447268793942672, + "grad_norm": 1.0997378826141357, + "learning_rate": 6.182617993801343e-06, + "loss": 1.7745, + "mean_token_accuracy": 0.5595523118972778, + "num_tokens": 6517259651.0, + "step": 12748 + }, + { + "epoch": 3.4475392103839915, + "grad_norm": 1.132097840309143, + "learning_rate": 6.1812864318758295e-06, + "loss": 1.8535, + "mean_token_accuracy": 0.5726969838142395, + "num_tokens": 6517783794.0, + "step": 12749 + }, + { + "epoch": 3.447809626825311, + "grad_norm": 1.1188963651657104, + "learning_rate": 6.179955017802614e-06, + "loss": 1.826, + "mean_token_accuracy": 0.5770300626754761, + "num_tokens": 6518307864.0, + "step": 12750 + }, + { + "epoch": 3.4480800432666308, + "grad_norm": 1.4960392713546753, + "learning_rate": 6.178623751622556e-06, + "loss": 1.8787, + "mean_token_accuracy": 0.5690039396286011, + "num_tokens": 6518778602.0, + "step": 12751 + }, + { + "epoch": 3.4483504597079504, + "grad_norm": 1.0956510305404663, + "learning_rate": 6.177292633376502e-06, + "loss": 1.7804, + "mean_token_accuracy": 0.5957929491996765, + "num_tokens": 6519302877.0, + "step": 12752 + }, + { + "epoch": 3.44862087614927, + "grad_norm": 1.187267541885376, + "learning_rate": 6.175961663105287e-06, + "loss": 1.7703, + "mean_token_accuracy": 0.5648431777954102, + "num_tokens": 6519809964.0, + "step": 12753 + }, + { + "epoch": 3.4488912925905897, + "grad_norm": 1.512487769126892, + "learning_rate": 6.17463084084976e-06, + "loss": 1.8823, + "mean_token_accuracy": 0.5634902715682983, + "num_tokens": 6520334234.0, + "step": 12754 + }, + { + "epoch": 3.4491617090319093, + "grad_norm": 1.2527810335159302, + "learning_rate": 6.173300166650749e-06, + "loss": 1.921, + "mean_token_accuracy": 0.5549719333648682, + "num_tokens": 6520858405.0, + "step": 12755 + }, + { + "epoch": 3.449432125473229, + "grad_norm": 1.0437066555023193, + "learning_rate": 6.171969640549079e-06, + "loss": 1.8801, + "mean_token_accuracy": 0.5603305697441101, + "num_tokens": 6521382669.0, + "step": 12756 + }, + { + "epoch": 3.449702541914548, + "grad_norm": 1.1146020889282227, + "learning_rate": 6.170639262585583e-06, + "loss": 1.8766, + "mean_token_accuracy": 0.5824411511421204, + "num_tokens": 6521906864.0, + "step": 12757 + }, + { + "epoch": 3.4499729583558683, + "grad_norm": 1.350916862487793, + "learning_rate": 6.1693090328010764e-06, + "loss": 1.75, + "mean_token_accuracy": 0.5740854740142822, + "num_tokens": 6522408143.0, + "step": 12758 + }, + { + "epoch": 3.4502433747971875, + "grad_norm": 1.0086822509765625, + "learning_rate": 6.167978951236372e-06, + "loss": 1.7873, + "mean_token_accuracy": 0.5840786695480347, + "num_tokens": 6522932341.0, + "step": 12759 + }, + { + "epoch": 3.4505137912385075, + "grad_norm": 0.9598605036735535, + "learning_rate": 6.166649017932286e-06, + "loss": 1.8929, + "mean_token_accuracy": 0.5729643702507019, + "num_tokens": 6523452837.0, + "step": 12760 + }, + { + "epoch": 3.4507842076798267, + "grad_norm": 0.3510589599609375, + "learning_rate": 6.165319232929618e-06, + "loss": 1.2101, + "mean_token_accuracy": 0.6820831298828125, + "num_tokens": 6523977020.0, + "step": 12761 + }, + { + "epoch": 3.4510546241211464, + "grad_norm": 1.3772143125534058, + "learning_rate": 6.163989596269178e-06, + "loss": 1.8455, + "mean_token_accuracy": 0.5699954032897949, + "num_tokens": 6524501183.0, + "step": 12762 + }, + { + "epoch": 3.451325040562466, + "grad_norm": 1.1310445070266724, + "learning_rate": 6.162660107991756e-06, + "loss": 1.9557, + "mean_token_accuracy": 0.5397186875343323, + "num_tokens": 6525025384.0, + "step": 12763 + }, + { + "epoch": 3.4515954570037857, + "grad_norm": 0.8895553946495056, + "learning_rate": 6.1613307681381515e-06, + "loss": 1.8112, + "mean_token_accuracy": 0.5638166069984436, + "num_tokens": 6525549503.0, + "step": 12764 + }, + { + "epoch": 3.4518658734451053, + "grad_norm": 1.2069648504257202, + "learning_rate": 6.160001576749144e-06, + "loss": 1.8069, + "mean_token_accuracy": 0.5685279369354248, + "num_tokens": 6526073779.0, + "step": 12765 + }, + { + "epoch": 3.452136289886425, + "grad_norm": 1.1335796117782593, + "learning_rate": 6.158672533865525e-06, + "loss": 1.836, + "mean_token_accuracy": 0.5577520728111267, + "num_tokens": 6526597919.0, + "step": 12766 + }, + { + "epoch": 3.4524067063277446, + "grad_norm": 0.8906903266906738, + "learning_rate": 6.157343639528071e-06, + "loss": 1.6855, + "mean_token_accuracy": 0.5889385938644409, + "num_tokens": 6527122059.0, + "step": 12767 + }, + { + "epoch": 3.4526771227690642, + "grad_norm": 1.0988861322402954, + "learning_rate": 6.156014893777551e-06, + "loss": 1.8437, + "mean_token_accuracy": 0.5874058604240417, + "num_tokens": 6527646252.0, + "step": 12768 + }, + { + "epoch": 3.452947539210384, + "grad_norm": 1.0998493432998657, + "learning_rate": 6.154686296654744e-06, + "loss": 1.7775, + "mean_token_accuracy": 0.5882217884063721, + "num_tokens": 6528170485.0, + "step": 12769 + }, + { + "epoch": 3.4532179556517035, + "grad_norm": 1.060205101966858, + "learning_rate": 6.1533578482004095e-06, + "loss": 1.8929, + "mean_token_accuracy": 0.5786867141723633, + "num_tokens": 6528694680.0, + "step": 12770 + }, + { + "epoch": 3.453488372093023, + "grad_norm": 1.0728055238723755, + "learning_rate": 6.152029548455307e-06, + "loss": 1.8611, + "mean_token_accuracy": 0.5718480944633484, + "num_tokens": 6529218818.0, + "step": 12771 + }, + { + "epoch": 3.453758788534343, + "grad_norm": 1.0525760650634766, + "learning_rate": 6.150701397460197e-06, + "loss": 1.9279, + "mean_token_accuracy": 0.5611100196838379, + "num_tokens": 6529743043.0, + "step": 12772 + }, + { + "epoch": 3.4540292049756625, + "grad_norm": 1.0662617683410645, + "learning_rate": 6.149373395255825e-06, + "loss": 1.75, + "mean_token_accuracy": 0.6023069620132446, + "num_tokens": 6530208007.0, + "step": 12773 + }, + { + "epoch": 3.454299621416982, + "grad_norm": 0.9238001108169556, + "learning_rate": 6.148045541882947e-06, + "loss": 1.8572, + "mean_token_accuracy": 0.5634307861328125, + "num_tokens": 6530732252.0, + "step": 12774 + }, + { + "epoch": 3.4545700378583017, + "grad_norm": 1.1298809051513672, + "learning_rate": 6.146717837382297e-06, + "loss": 1.821, + "mean_token_accuracy": 0.5563575029373169, + "num_tokens": 6531256489.0, + "step": 12775 + }, + { + "epoch": 3.4548404542996214, + "grad_norm": 1.1922489404678345, + "learning_rate": 6.145390281794619e-06, + "loss": 1.8832, + "mean_token_accuracy": 0.5754201412200928, + "num_tokens": 6531780630.0, + "step": 12776 + }, + { + "epoch": 3.455110870740941, + "grad_norm": 0.9762721061706543, + "learning_rate": 6.144062875160641e-06, + "loss": 1.8533, + "mean_token_accuracy": 0.5713332891464233, + "num_tokens": 6532265018.0, + "step": 12777 + }, + { + "epoch": 3.4553812871822607, + "grad_norm": 1.013217568397522, + "learning_rate": 6.142735617521096e-06, + "loss": 1.8886, + "mean_token_accuracy": 0.5728598833084106, + "num_tokens": 6532757875.0, + "step": 12778 + }, + { + "epoch": 3.4556517036235803, + "grad_norm": 0.9730496406555176, + "learning_rate": 6.141408508916704e-06, + "loss": 1.7799, + "mean_token_accuracy": 0.5829629898071289, + "num_tokens": 6533282146.0, + "step": 12779 + }, + { + "epoch": 3.4559221200649, + "grad_norm": 1.014791488647461, + "learning_rate": 6.1400815493881916e-06, + "loss": 1.8887, + "mean_token_accuracy": 0.5666114091873169, + "num_tokens": 6533806309.0, + "step": 12780 + }, + { + "epoch": 3.4561925365062196, + "grad_norm": 0.4121394157409668, + "learning_rate": 6.138754738976269e-06, + "loss": 1.2042, + "mean_token_accuracy": 0.6850368976593018, + "num_tokens": 6534329669.0, + "step": 12781 + }, + { + "epoch": 3.4564629529475392, + "grad_norm": 1.1072542667388916, + "learning_rate": 6.137428077721643e-06, + "loss": 1.8338, + "mean_token_accuracy": 0.5818465948104858, + "num_tokens": 6534853811.0, + "step": 12782 + }, + { + "epoch": 3.456733369388859, + "grad_norm": 1.224374532699585, + "learning_rate": 6.136101565665027e-06, + "loss": 1.8131, + "mean_token_accuracy": 0.5732229948043823, + "num_tokens": 6535377951.0, + "step": 12783 + }, + { + "epoch": 3.4570037858301785, + "grad_norm": 0.9693546295166016, + "learning_rate": 6.134775202847118e-06, + "loss": 1.6768, + "mean_token_accuracy": 0.5868302583694458, + "num_tokens": 6535902143.0, + "step": 12784 + }, + { + "epoch": 3.457274202271498, + "grad_norm": 0.8295707106590271, + "learning_rate": 6.133448989308612e-06, + "loss": 1.77, + "mean_token_accuracy": 0.5930349230766296, + "num_tokens": 6536426361.0, + "step": 12785 + }, + { + "epoch": 3.457544618712818, + "grad_norm": 0.9648343324661255, + "learning_rate": 6.132122925090205e-06, + "loss": 1.7964, + "mean_token_accuracy": 0.5885957479476929, + "num_tokens": 6536950626.0, + "step": 12786 + }, + { + "epoch": 3.4578150351541375, + "grad_norm": 1.0092788934707642, + "learning_rate": 6.130797010232583e-06, + "loss": 1.8599, + "mean_token_accuracy": 0.583942174911499, + "num_tokens": 6537414143.0, + "step": 12787 + }, + { + "epoch": 3.458085451595457, + "grad_norm": 0.8901327252388, + "learning_rate": 6.129471244776424e-06, + "loss": 1.8272, + "mean_token_accuracy": 0.5788590908050537, + "num_tokens": 6537938418.0, + "step": 12788 + }, + { + "epoch": 3.4583558680367767, + "grad_norm": 1.035618782043457, + "learning_rate": 6.128145628762413e-06, + "loss": 1.8589, + "mean_token_accuracy": 0.5824756622314453, + "num_tokens": 6538462691.0, + "step": 12789 + }, + { + "epoch": 3.4586262844780964, + "grad_norm": 0.9825907349586487, + "learning_rate": 6.126820162231218e-06, + "loss": 1.8417, + "mean_token_accuracy": 0.5834424495697021, + "num_tokens": 6538986904.0, + "step": 12790 + }, + { + "epoch": 3.458896700919416, + "grad_norm": 1.0609886646270752, + "learning_rate": 6.1254948452235115e-06, + "loss": 1.9123, + "mean_token_accuracy": 0.5712372660636902, + "num_tokens": 6539511079.0, + "step": 12791 + }, + { + "epoch": 3.4591671173607357, + "grad_norm": 1.0161370038986206, + "learning_rate": 6.124169677779963e-06, + "loss": 1.866, + "mean_token_accuracy": 0.5590263605117798, + "num_tokens": 6540035340.0, + "step": 12792 + }, + { + "epoch": 3.4594375338020553, + "grad_norm": 0.8835612535476685, + "learning_rate": 6.122844659941228e-06, + "loss": 1.7873, + "mean_token_accuracy": 0.5772360563278198, + "num_tokens": 6540508231.0, + "step": 12793 + }, + { + "epoch": 3.459707950243375, + "grad_norm": 0.9382266402244568, + "learning_rate": 6.1215197917479564e-06, + "loss": 1.8912, + "mean_token_accuracy": 0.5741924047470093, + "num_tokens": 6541032512.0, + "step": 12794 + }, + { + "epoch": 3.4599783666846946, + "grad_norm": 0.8722637295722961, + "learning_rate": 6.120195073240808e-06, + "loss": 1.7462, + "mean_token_accuracy": 0.5816331505775452, + "num_tokens": 6541556727.0, + "step": 12795 + }, + { + "epoch": 3.4602487831260142, + "grad_norm": 0.918467104434967, + "learning_rate": 6.118870504460422e-06, + "loss": 1.748, + "mean_token_accuracy": 0.5779666900634766, + "num_tokens": 6542047655.0, + "step": 12796 + }, + { + "epoch": 3.460519199567334, + "grad_norm": 0.9344140291213989, + "learning_rate": 6.117546085447447e-06, + "loss": 1.7795, + "mean_token_accuracy": 0.5829263925552368, + "num_tokens": 6542571893.0, + "step": 12797 + }, + { + "epoch": 3.460789616008653, + "grad_norm": 1.1010547876358032, + "learning_rate": 6.116221816242517e-06, + "loss": 1.8457, + "mean_token_accuracy": 0.5785892009735107, + "num_tokens": 6543086996.0, + "step": 12798 + }, + { + "epoch": 3.461060032449973, + "grad_norm": 0.9909186363220215, + "learning_rate": 6.11489769688626e-06, + "loss": 1.8668, + "mean_token_accuracy": 0.568350613117218, + "num_tokens": 6543611201.0, + "step": 12799 + }, + { + "epoch": 3.4613304488912924, + "grad_norm": 0.9474058747291565, + "learning_rate": 6.11357372741931e-06, + "loss": 1.7681, + "mean_token_accuracy": 0.5958582162857056, + "num_tokens": 6544072064.0, + "step": 12800 + }, + { + "epoch": 3.4616008653326125, + "grad_norm": 0.4237861931324005, + "learning_rate": 6.11224990788229e-06, + "loss": 1.1913, + "mean_token_accuracy": 0.6772685050964355, + "num_tokens": 6544596331.0, + "step": 12801 + }, + { + "epoch": 3.4618712817739317, + "grad_norm": 1.0112206935882568, + "learning_rate": 6.110926238315813e-06, + "loss": 1.7004, + "mean_token_accuracy": 0.593795120716095, + "num_tokens": 6545120605.0, + "step": 12802 + }, + { + "epoch": 3.4621416982152513, + "grad_norm": 1.271702527999878, + "learning_rate": 6.1096027187605e-06, + "loss": 1.8909, + "mean_token_accuracy": 0.575960636138916, + "num_tokens": 6545644819.0, + "step": 12803 + }, + { + "epoch": 3.462412114656571, + "grad_norm": 0.9684860110282898, + "learning_rate": 6.108279349256953e-06, + "loss": 1.84, + "mean_token_accuracy": 0.5891539454460144, + "num_tokens": 6546146220.0, + "step": 12804 + }, + { + "epoch": 3.4626825310978906, + "grad_norm": 0.9578292965888977, + "learning_rate": 6.1069561298457824e-06, + "loss": 1.9077, + "mean_token_accuracy": 0.5641213059425354, + "num_tokens": 6546670412.0, + "step": 12805 + }, + { + "epoch": 3.4629529475392102, + "grad_norm": 0.8855208158493042, + "learning_rate": 6.1056330605675905e-06, + "loss": 1.7584, + "mean_token_accuracy": 0.5974009037017822, + "num_tokens": 6547194657.0, + "step": 12806 + }, + { + "epoch": 3.46322336398053, + "grad_norm": 1.0906925201416016, + "learning_rate": 6.10431014146297e-06, + "loss": 1.8114, + "mean_token_accuracy": 0.5782167911529541, + "num_tokens": 6547661723.0, + "step": 12807 + }, + { + "epoch": 3.4634937804218495, + "grad_norm": 1.0780291557312012, + "learning_rate": 6.102987372572506e-06, + "loss": 1.7533, + "mean_token_accuracy": 0.5807588696479797, + "num_tokens": 6548180308.0, + "step": 12808 + }, + { + "epoch": 3.463764196863169, + "grad_norm": 1.0716102123260498, + "learning_rate": 6.101664753936798e-06, + "loss": 1.7571, + "mean_token_accuracy": 0.5841643214225769, + "num_tokens": 6548704587.0, + "step": 12809 + }, + { + "epoch": 3.464034613304489, + "grad_norm": 1.1162303686141968, + "learning_rate": 6.100342285596417e-06, + "loss": 1.89, + "mean_token_accuracy": 0.574363112449646, + "num_tokens": 6549228860.0, + "step": 12810 + }, + { + "epoch": 3.4643050297458085, + "grad_norm": 0.9111051559448242, + "learning_rate": 6.099019967591942e-06, + "loss": 1.9296, + "mean_token_accuracy": 0.5537029504776001, + "num_tokens": 6549753063.0, + "step": 12811 + }, + { + "epoch": 3.464575446187128, + "grad_norm": 0.9317581057548523, + "learning_rate": 6.097697799963948e-06, + "loss": 1.8783, + "mean_token_accuracy": 0.5639793872833252, + "num_tokens": 6550276213.0, + "step": 12812 + }, + { + "epoch": 3.4648458626284477, + "grad_norm": 0.9663051962852478, + "learning_rate": 6.096375782753006e-06, + "loss": 1.8363, + "mean_token_accuracy": 0.5783127546310425, + "num_tokens": 6550739506.0, + "step": 12813 + }, + { + "epoch": 3.4651162790697674, + "grad_norm": 0.9780154824256897, + "learning_rate": 6.09505391599967e-06, + "loss": 1.7836, + "mean_token_accuracy": 0.6052404046058655, + "num_tokens": 6551164641.0, + "step": 12814 + }, + { + "epoch": 3.465386695511087, + "grad_norm": 1.0757722854614258, + "learning_rate": 6.0937321997445075e-06, + "loss": 1.7269, + "mean_token_accuracy": 0.633045494556427, + "num_tokens": 6551625477.0, + "step": 12815 + }, + { + "epoch": 3.4656571119524067, + "grad_norm": 1.2424372434616089, + "learning_rate": 6.0924106340280695e-06, + "loss": 1.8925, + "mean_token_accuracy": 0.5822805166244507, + "num_tokens": 6552101171.0, + "step": 12816 + }, + { + "epoch": 3.4659275283937263, + "grad_norm": 1.0527666807174683, + "learning_rate": 6.091089218890905e-06, + "loss": 1.8435, + "mean_token_accuracy": 0.5546584725379944, + "num_tokens": 6552625401.0, + "step": 12817 + }, + { + "epoch": 3.466197944835046, + "grad_norm": 0.945396900177002, + "learning_rate": 6.089767954373555e-06, + "loss": 1.834, + "mean_token_accuracy": 0.5826267600059509, + "num_tokens": 6553149493.0, + "step": 12818 + }, + { + "epoch": 3.4664683612763656, + "grad_norm": 0.993289589881897, + "learning_rate": 6.088446840516571e-06, + "loss": 1.9251, + "mean_token_accuracy": 0.5738203525543213, + "num_tokens": 6553673717.0, + "step": 12819 + }, + { + "epoch": 3.4667387777176852, + "grad_norm": 1.1449611186981201, + "learning_rate": 6.087125877360477e-06, + "loss": 1.8304, + "mean_token_accuracy": 0.5801602602005005, + "num_tokens": 6554197924.0, + "step": 12820 + }, + { + "epoch": 3.467009194159005, + "grad_norm": 0.35657572746276855, + "learning_rate": 6.085805064945813e-06, + "loss": 1.1614, + "mean_token_accuracy": 0.685858428478241, + "num_tokens": 6554722160.0, + "step": 12821 + }, + { + "epoch": 3.4672796106003245, + "grad_norm": 1.0550035238265991, + "learning_rate": 6.0844844033130975e-06, + "loss": 1.8179, + "mean_token_accuracy": 0.5723931789398193, + "num_tokens": 6555246429.0, + "step": 12822 + }, + { + "epoch": 3.467550027041644, + "grad_norm": 0.9865236282348633, + "learning_rate": 6.083163892502859e-06, + "loss": 1.9206, + "mean_token_accuracy": 0.5610343217849731, + "num_tokens": 6555770697.0, + "step": 12823 + }, + { + "epoch": 3.467820443482964, + "grad_norm": 0.8227479457855225, + "learning_rate": 6.081843532555611e-06, + "loss": 1.7279, + "mean_token_accuracy": 0.5914052128791809, + "num_tokens": 6556294912.0, + "step": 12824 + }, + { + "epoch": 3.4680908599242835, + "grad_norm": 1.211372971534729, + "learning_rate": 6.080523323511867e-06, + "loss": 1.9063, + "mean_token_accuracy": 0.5497369766235352, + "num_tokens": 6556819130.0, + "step": 12825 + }, + { + "epoch": 3.468361276365603, + "grad_norm": 1.051230788230896, + "learning_rate": 6.0792032654121345e-06, + "loss": 1.8245, + "mean_token_accuracy": 0.5744009017944336, + "num_tokens": 6557285077.0, + "step": 12826 + }, + { + "epoch": 3.4686316928069227, + "grad_norm": 1.3056379556655884, + "learning_rate": 6.077883358296919e-06, + "loss": 1.8681, + "mean_token_accuracy": 0.5824495553970337, + "num_tokens": 6557728135.0, + "step": 12827 + }, + { + "epoch": 3.4689021092482424, + "grad_norm": 1.0949763059616089, + "learning_rate": 6.076563602206713e-06, + "loss": 1.8507, + "mean_token_accuracy": 0.5563584566116333, + "num_tokens": 6558252315.0, + "step": 12828 + }, + { + "epoch": 3.469172525689562, + "grad_norm": 1.142451286315918, + "learning_rate": 6.0752439971820165e-06, + "loss": 1.8524, + "mean_token_accuracy": 0.5720337629318237, + "num_tokens": 6558744391.0, + "step": 12829 + }, + { + "epoch": 3.4694429421308817, + "grad_norm": 1.184988021850586, + "learning_rate": 6.073924543263319e-06, + "loss": 1.7662, + "mean_token_accuracy": 0.5934938192367554, + "num_tokens": 6559268613.0, + "step": 12830 + }, + { + "epoch": 3.4697133585722013, + "grad_norm": 0.8334160447120667, + "learning_rate": 6.072605240491098e-06, + "loss": 1.7678, + "mean_token_accuracy": 0.5950038433074951, + "num_tokens": 6559792880.0, + "step": 12831 + }, + { + "epoch": 3.469983775013521, + "grad_norm": 0.9762915968894958, + "learning_rate": 6.071286088905844e-06, + "loss": 1.7017, + "mean_token_accuracy": 0.5896925926208496, + "num_tokens": 6560306413.0, + "step": 12832 + }, + { + "epoch": 3.4702541914548406, + "grad_norm": 1.0780467987060547, + "learning_rate": 6.069967088548023e-06, + "loss": 1.8403, + "mean_token_accuracy": 0.6056191921234131, + "num_tokens": 6560770387.0, + "step": 12833 + }, + { + "epoch": 3.4705246078961602, + "grad_norm": 1.1926772594451904, + "learning_rate": 6.06864823945811e-06, + "loss": 1.9415, + "mean_token_accuracy": 0.5532159209251404, + "num_tokens": 6561294648.0, + "step": 12834 + }, + { + "epoch": 3.47079502433748, + "grad_norm": 1.1309727430343628, + "learning_rate": 6.067329541676574e-06, + "loss": 1.8603, + "mean_token_accuracy": 0.563715934753418, + "num_tokens": 6561818928.0, + "step": 12835 + }, + { + "epoch": 3.4710654407787995, + "grad_norm": 1.385704517364502, + "learning_rate": 6.066010995243874e-06, + "loss": 1.6687, + "mean_token_accuracy": 0.6066290140151978, + "num_tokens": 6562343113.0, + "step": 12836 + }, + { + "epoch": 3.471335857220119, + "grad_norm": 0.9859545230865479, + "learning_rate": 6.064692600200464e-06, + "loss": 1.9051, + "mean_token_accuracy": 0.5751748085021973, + "num_tokens": 6562867292.0, + "step": 12837 + }, + { + "epoch": 3.471606273661439, + "grad_norm": 1.0565294027328491, + "learning_rate": 6.0633743565868015e-06, + "loss": 1.82, + "mean_token_accuracy": 0.5767291784286499, + "num_tokens": 6563391248.0, + "step": 12838 + }, + { + "epoch": 3.471876690102758, + "grad_norm": 0.9335885643959045, + "learning_rate": 6.062056264443331e-06, + "loss": 1.7516, + "mean_token_accuracy": 0.5951969623565674, + "num_tokens": 6563915527.0, + "step": 12839 + }, + { + "epoch": 3.472147106544078, + "grad_norm": 0.9550049304962158, + "learning_rate": 6.060738323810493e-06, + "loss": 1.812, + "mean_token_accuracy": 0.5905306339263916, + "num_tokens": 6564439741.0, + "step": 12840 + }, + { + "epoch": 3.4724175229853973, + "grad_norm": 0.32741791009902954, + "learning_rate": 6.059420534728733e-06, + "loss": 1.0878, + "mean_token_accuracy": 0.7034739851951599, + "num_tokens": 6564964000.0, + "step": 12841 + }, + { + "epoch": 3.4726879394267174, + "grad_norm": 1.2729977369308472, + "learning_rate": 6.058102897238476e-06, + "loss": 1.9348, + "mean_token_accuracy": 0.5653972625732422, + "num_tokens": 6565474808.0, + "step": 12842 + }, + { + "epoch": 3.4729583558680366, + "grad_norm": 1.107390284538269, + "learning_rate": 6.056785411380158e-06, + "loss": 1.8446, + "mean_token_accuracy": 0.5701985359191895, + "num_tokens": 6565999023.0, + "step": 12843 + }, + { + "epoch": 3.4732287723093562, + "grad_norm": 0.9322366714477539, + "learning_rate": 6.055468077194202e-06, + "loss": 1.7879, + "mean_token_accuracy": 0.5900370478630066, + "num_tokens": 6566494280.0, + "step": 12844 + }, + { + "epoch": 3.473499188750676, + "grad_norm": 0.9026957154273987, + "learning_rate": 6.054150894721023e-06, + "loss": 1.8094, + "mean_token_accuracy": 0.5895897150039673, + "num_tokens": 6567018537.0, + "step": 12845 + }, + { + "epoch": 3.4737696051919955, + "grad_norm": 1.2331820726394653, + "learning_rate": 6.05283386400104e-06, + "loss": 1.9491, + "mean_token_accuracy": 0.5509930849075317, + "num_tokens": 6567542815.0, + "step": 12846 + }, + { + "epoch": 3.474040021633315, + "grad_norm": 1.2401692867279053, + "learning_rate": 6.051516985074661e-06, + "loss": 1.849, + "mean_token_accuracy": 0.5817239284515381, + "num_tokens": 6568040305.0, + "step": 12847 + }, + { + "epoch": 3.474310438074635, + "grad_norm": 1.0198439359664917, + "learning_rate": 6.050200257982295e-06, + "loss": 1.8752, + "mean_token_accuracy": 0.5845605731010437, + "num_tokens": 6568529304.0, + "step": 12848 + }, + { + "epoch": 3.4745808545159544, + "grad_norm": 1.120216965675354, + "learning_rate": 6.048883682764343e-06, + "loss": 1.877, + "mean_token_accuracy": 0.5508430004119873, + "num_tokens": 6569053259.0, + "step": 12849 + }, + { + "epoch": 3.474851270957274, + "grad_norm": 1.0829583406448364, + "learning_rate": 6.0475672594612e-06, + "loss": 1.8909, + "mean_token_accuracy": 0.5612510442733765, + "num_tokens": 6569577395.0, + "step": 12850 + }, + { + "epoch": 3.4751216873985937, + "grad_norm": 0.9132903218269348, + "learning_rate": 6.046250988113255e-06, + "loss": 1.8094, + "mean_token_accuracy": 0.5780526399612427, + "num_tokens": 6570101670.0, + "step": 12851 + }, + { + "epoch": 3.4753921038399134, + "grad_norm": 1.0867691040039062, + "learning_rate": 6.044934868760901e-06, + "loss": 1.8647, + "mean_token_accuracy": 0.5738680362701416, + "num_tokens": 6570625942.0, + "step": 12852 + }, + { + "epoch": 3.475662520281233, + "grad_norm": 0.9812228083610535, + "learning_rate": 6.043618901444516e-06, + "loss": 1.991, + "mean_token_accuracy": 0.543525218963623, + "num_tokens": 6571150214.0, + "step": 12853 + }, + { + "epoch": 3.4759329367225527, + "grad_norm": 0.9158093929290771, + "learning_rate": 6.042303086204477e-06, + "loss": 1.7903, + "mean_token_accuracy": 0.5892208814620972, + "num_tokens": 6571674366.0, + "step": 12854 + }, + { + "epoch": 3.4762033531638723, + "grad_norm": 1.1128695011138916, + "learning_rate": 6.040987423081162e-06, + "loss": 1.8384, + "mean_token_accuracy": 0.5898885726928711, + "num_tokens": 6572096859.0, + "step": 12855 + }, + { + "epoch": 3.476473769605192, + "grad_norm": 1.1515352725982666, + "learning_rate": 6.039671912114935e-06, + "loss": 1.8488, + "mean_token_accuracy": 0.5941143035888672, + "num_tokens": 6572555713.0, + "step": 12856 + }, + { + "epoch": 3.4767441860465116, + "grad_norm": 0.9549411535263062, + "learning_rate": 6.038356553346159e-06, + "loss": 1.7347, + "mean_token_accuracy": 0.588154673576355, + "num_tokens": 6573079996.0, + "step": 12857 + }, + { + "epoch": 3.4770146024878312, + "grad_norm": 1.0920330286026, + "learning_rate": 6.037041346815199e-06, + "loss": 1.9527, + "mean_token_accuracy": 0.5455768704414368, + "num_tokens": 6573580037.0, + "step": 12858 + }, + { + "epoch": 3.477285018929151, + "grad_norm": 1.1333799362182617, + "learning_rate": 6.035726292562405e-06, + "loss": 1.8414, + "mean_token_accuracy": 0.5838966369628906, + "num_tokens": 6574060792.0, + "step": 12859 + }, + { + "epoch": 3.4775554353704705, + "grad_norm": 0.958869993686676, + "learning_rate": 6.0344113906281234e-06, + "loss": 1.919, + "mean_token_accuracy": 0.5717706084251404, + "num_tokens": 6574584940.0, + "step": 12860 + }, + { + "epoch": 3.47782585181179, + "grad_norm": 0.4261629581451416, + "learning_rate": 6.033096641052705e-06, + "loss": 1.2017, + "mean_token_accuracy": 0.6994520425796509, + "num_tokens": 6574963503.0, + "step": 12861 + }, + { + "epoch": 3.47809626825311, + "grad_norm": 1.179215908050537, + "learning_rate": 6.031782043876488e-06, + "loss": 1.7793, + "mean_token_accuracy": 0.5960714817047119, + "num_tokens": 6575487707.0, + "step": 12862 + }, + { + "epoch": 3.4783666846944294, + "grad_norm": 1.057028889656067, + "learning_rate": 6.030467599139809e-06, + "loss": 1.8104, + "mean_token_accuracy": 0.573244571685791, + "num_tokens": 6576011914.0, + "step": 12863 + }, + { + "epoch": 3.478637101135749, + "grad_norm": 0.9577669501304626, + "learning_rate": 6.029153306883e-06, + "loss": 1.858, + "mean_token_accuracy": 0.5594221353530884, + "num_tokens": 6576536140.0, + "step": 12864 + }, + { + "epoch": 3.4789075175770687, + "grad_norm": 0.8982965350151062, + "learning_rate": 6.027839167146383e-06, + "loss": 1.8278, + "mean_token_accuracy": 0.5899361968040466, + "num_tokens": 6577060222.0, + "step": 12865 + }, + { + "epoch": 3.4791779340183884, + "grad_norm": 1.2604386806488037, + "learning_rate": 6.026525179970286e-06, + "loss": 1.793, + "mean_token_accuracy": 0.6102885007858276, + "num_tokens": 6577584491.0, + "step": 12866 + }, + { + "epoch": 3.479448350459708, + "grad_norm": 1.1640254259109497, + "learning_rate": 6.025211345395023e-06, + "loss": 1.7876, + "mean_token_accuracy": 0.5827817916870117, + "num_tokens": 6578108582.0, + "step": 12867 + }, + { + "epoch": 3.4797187669010277, + "grad_norm": 1.0537853240966797, + "learning_rate": 6.0238976634609e-06, + "loss": 1.7449, + "mean_token_accuracy": 0.582667350769043, + "num_tokens": 6578620749.0, + "step": 12868 + }, + { + "epoch": 3.4799891833423473, + "grad_norm": 1.1142768859863281, + "learning_rate": 6.022584134208236e-06, + "loss": 1.9374, + "mean_token_accuracy": 0.5734114646911621, + "num_tokens": 6579045961.0, + "step": 12869 + }, + { + "epoch": 3.480259599783667, + "grad_norm": 1.0233874320983887, + "learning_rate": 6.021270757677327e-06, + "loss": 1.7385, + "mean_token_accuracy": 0.59201979637146, + "num_tokens": 6579570206.0, + "step": 12870 + }, + { + "epoch": 3.4805300162249866, + "grad_norm": 0.9298044443130493, + "learning_rate": 6.019957533908472e-06, + "loss": 1.8614, + "mean_token_accuracy": 0.5832695364952087, + "num_tokens": 6580094323.0, + "step": 12871 + }, + { + "epoch": 3.4808004326663062, + "grad_norm": 1.1003901958465576, + "learning_rate": 6.0186444629419675e-06, + "loss": 1.6357, + "mean_token_accuracy": 0.6208457946777344, + "num_tokens": 6580556775.0, + "step": 12872 + }, + { + "epoch": 3.481070849107626, + "grad_norm": 0.9950740337371826, + "learning_rate": 6.017331544818099e-06, + "loss": 1.8449, + "mean_token_accuracy": 0.576171875, + "num_tokens": 6581081001.0, + "step": 12873 + }, + { + "epoch": 3.4813412655489455, + "grad_norm": 0.7812010645866394, + "learning_rate": 6.016018779577149e-06, + "loss": 1.8788, + "mean_token_accuracy": 0.5680124759674072, + "num_tokens": 6581605149.0, + "step": 12874 + }, + { + "epoch": 3.481611681990265, + "grad_norm": 0.9994192123413086, + "learning_rate": 6.014706167259399e-06, + "loss": 1.8902, + "mean_token_accuracy": 0.572847306728363, + "num_tokens": 6582129261.0, + "step": 12875 + }, + { + "epoch": 3.481882098431585, + "grad_norm": 1.0068485736846924, + "learning_rate": 6.0133937079051284e-06, + "loss": 1.9559, + "mean_token_accuracy": 0.5463663935661316, + "num_tokens": 6582653438.0, + "step": 12876 + }, + { + "epoch": 3.4821525148729044, + "grad_norm": 0.8893139958381653, + "learning_rate": 6.0120814015546e-06, + "loss": 1.7748, + "mean_token_accuracy": 0.5977717638015747, + "num_tokens": 6583177707.0, + "step": 12877 + }, + { + "epoch": 3.482422931314224, + "grad_norm": 1.0621706247329712, + "learning_rate": 6.010769248248086e-06, + "loss": 1.9168, + "mean_token_accuracy": 0.5732781887054443, + "num_tokens": 6583701977.0, + "step": 12878 + }, + { + "epoch": 3.4826933477555437, + "grad_norm": 1.0157032012939453, + "learning_rate": 6.0094572480258436e-06, + "loss": 1.7237, + "mean_token_accuracy": 0.5840338468551636, + "num_tokens": 6584167592.0, + "step": 12879 + }, + { + "epoch": 3.482963764196863, + "grad_norm": 0.9492900371551514, + "learning_rate": 6.008145400928127e-06, + "loss": 1.8377, + "mean_token_accuracy": 0.572020411491394, + "num_tokens": 6584691695.0, + "step": 12880 + }, + { + "epoch": 3.483234180638183, + "grad_norm": 0.3970779776573181, + "learning_rate": 6.0068337069951925e-06, + "loss": 1.1058, + "mean_token_accuracy": 0.7022037506103516, + "num_tokens": 6585208597.0, + "step": 12881 + }, + { + "epoch": 3.483504597079502, + "grad_norm": 1.315440058708191, + "learning_rate": 6.0055221662672834e-06, + "loss": 1.9443, + "mean_token_accuracy": 0.5633764266967773, + "num_tokens": 6585732745.0, + "step": 12882 + }, + { + "epoch": 3.4837750135208223, + "grad_norm": 1.139398217201233, + "learning_rate": 6.004210778784638e-06, + "loss": 1.9183, + "mean_token_accuracy": 0.5648171901702881, + "num_tokens": 6586256986.0, + "step": 12883 + }, + { + "epoch": 3.4840454299621415, + "grad_norm": 1.0299263000488281, + "learning_rate": 6.002899544587503e-06, + "loss": 1.8284, + "mean_token_accuracy": 0.5777742862701416, + "num_tokens": 6586781168.0, + "step": 12884 + }, + { + "epoch": 3.484315846403461, + "grad_norm": 0.9582818746566772, + "learning_rate": 6.001588463716103e-06, + "loss": 1.9184, + "mean_token_accuracy": 0.5551406741142273, + "num_tokens": 6587295512.0, + "step": 12885 + }, + { + "epoch": 3.484586262844781, + "grad_norm": 1.0458338260650635, + "learning_rate": 6.000277536210667e-06, + "loss": 1.8927, + "mean_token_accuracy": 0.575973629951477, + "num_tokens": 6587807068.0, + "step": 12886 + }, + { + "epoch": 3.4848566792861004, + "grad_norm": 1.0710715055465698, + "learning_rate": 5.998966762111422e-06, + "loss": 2.0146, + "mean_token_accuracy": 0.541100263595581, + "num_tokens": 6588331259.0, + "step": 12887 + }, + { + "epoch": 3.48512709572742, + "grad_norm": 1.0050686597824097, + "learning_rate": 5.99765614145858e-06, + "loss": 1.84, + "mean_token_accuracy": 0.5828769207000732, + "num_tokens": 6588827974.0, + "step": 12888 + }, + { + "epoch": 3.4853975121687397, + "grad_norm": 1.1278996467590332, + "learning_rate": 5.9963456742923635e-06, + "loss": 1.8765, + "mean_token_accuracy": 0.5745477676391602, + "num_tokens": 6589352196.0, + "step": 12889 + }, + { + "epoch": 3.4856679286100594, + "grad_norm": 0.9421975612640381, + "learning_rate": 5.99503536065297e-06, + "loss": 1.8489, + "mean_token_accuracy": 0.5506492257118225, + "num_tokens": 6589876480.0, + "step": 12890 + }, + { + "epoch": 3.485938345051379, + "grad_norm": 0.9240046143531799, + "learning_rate": 5.993725200580614e-06, + "loss": 1.826, + "mean_token_accuracy": 0.5686657428741455, + "num_tokens": 6590400754.0, + "step": 12891 + }, + { + "epoch": 3.4862087614926986, + "grad_norm": 1.0185264348983765, + "learning_rate": 5.992415194115493e-06, + "loss": 1.8796, + "mean_token_accuracy": 0.5708224773406982, + "num_tokens": 6590924915.0, + "step": 12892 + }, + { + "epoch": 3.4864791779340183, + "grad_norm": 0.8696054816246033, + "learning_rate": 5.991105341297799e-06, + "loss": 1.7068, + "mean_token_accuracy": 0.5997439026832581, + "num_tokens": 6591448942.0, + "step": 12893 + }, + { + "epoch": 3.486749594375338, + "grad_norm": 0.9288626909255981, + "learning_rate": 5.989795642167722e-06, + "loss": 1.7963, + "mean_token_accuracy": 0.6007832288742065, + "num_tokens": 6591973209.0, + "step": 12894 + }, + { + "epoch": 3.4870200108166576, + "grad_norm": 0.9531740546226501, + "learning_rate": 5.988486096765451e-06, + "loss": 1.9307, + "mean_token_accuracy": 0.5496970415115356, + "num_tokens": 6592497345.0, + "step": 12895 + }, + { + "epoch": 3.487290427257977, + "grad_norm": 0.9379400610923767, + "learning_rate": 5.987176705131164e-06, + "loss": 1.8788, + "mean_token_accuracy": 0.5672174096107483, + "num_tokens": 6593021530.0, + "step": 12896 + }, + { + "epoch": 3.487560843699297, + "grad_norm": 0.8893337845802307, + "learning_rate": 5.985867467305036e-06, + "loss": 1.8406, + "mean_token_accuracy": 0.5853450298309326, + "num_tokens": 6593545689.0, + "step": 12897 + }, + { + "epoch": 3.4878312601406165, + "grad_norm": 0.8967942595481873, + "learning_rate": 5.984558383327242e-06, + "loss": 1.8463, + "mean_token_accuracy": 0.5765278339385986, + "num_tokens": 6594069841.0, + "step": 12898 + }, + { + "epoch": 3.488101676581936, + "grad_norm": 0.9848049879074097, + "learning_rate": 5.9832494532379446e-06, + "loss": 1.8166, + "mean_token_accuracy": 0.5955807566642761, + "num_tokens": 6594594122.0, + "step": 12899 + }, + { + "epoch": 3.488372093023256, + "grad_norm": 1.0101052522659302, + "learning_rate": 5.981940677077305e-06, + "loss": 1.8318, + "mean_token_accuracy": 0.5723744630813599, + "num_tokens": 6595118278.0, + "step": 12900 + }, + { + "epoch": 3.4886425094645754, + "grad_norm": 0.4180125892162323, + "learning_rate": 5.980632054885487e-06, + "loss": 1.1388, + "mean_token_accuracy": 0.6913228034973145, + "num_tokens": 6595642545.0, + "step": 12901 + }, + { + "epoch": 3.488912925905895, + "grad_norm": 1.1016738414764404, + "learning_rate": 5.9793235867026345e-06, + "loss": 1.8521, + "mean_token_accuracy": 0.5759584903717041, + "num_tokens": 6596166732.0, + "step": 12902 + }, + { + "epoch": 3.4891833423472147, + "grad_norm": 1.0199159383773804, + "learning_rate": 5.978015272568903e-06, + "loss": 1.9111, + "mean_token_accuracy": 0.5679324269294739, + "num_tokens": 6596690906.0, + "step": 12903 + }, + { + "epoch": 3.4894537587885344, + "grad_norm": 0.9381869435310364, + "learning_rate": 5.976707112524427e-06, + "loss": 1.7233, + "mean_token_accuracy": 0.6114164590835571, + "num_tokens": 6597152249.0, + "step": 12904 + }, + { + "epoch": 3.489724175229854, + "grad_norm": 0.9516660571098328, + "learning_rate": 5.975399106609352e-06, + "loss": 1.8676, + "mean_token_accuracy": 0.5599426031112671, + "num_tokens": 6597676465.0, + "step": 12905 + }, + { + "epoch": 3.4899945916711737, + "grad_norm": 1.0636330842971802, + "learning_rate": 5.974091254863808e-06, + "loss": 1.8121, + "mean_token_accuracy": 0.5639063715934753, + "num_tokens": 6598200737.0, + "step": 12906 + }, + { + "epoch": 3.4902650081124933, + "grad_norm": 1.1836661100387573, + "learning_rate": 5.972783557327928e-06, + "loss": 1.8414, + "mean_token_accuracy": 0.580877959728241, + "num_tokens": 6598724959.0, + "step": 12907 + }, + { + "epoch": 3.490535424553813, + "grad_norm": 0.8650010824203491, + "learning_rate": 5.971476014041831e-06, + "loss": 1.8916, + "mean_token_accuracy": 0.565565824508667, + "num_tokens": 6599249167.0, + "step": 12908 + }, + { + "epoch": 3.4908058409951326, + "grad_norm": 0.8833203911781311, + "learning_rate": 5.970168625045635e-06, + "loss": 1.7495, + "mean_token_accuracy": 0.5958046913146973, + "num_tokens": 6599743369.0, + "step": 12909 + }, + { + "epoch": 3.4910762574364522, + "grad_norm": 1.0103036165237427, + "learning_rate": 5.96886139037946e-06, + "loss": 1.9908, + "mean_token_accuracy": 0.5319608449935913, + "num_tokens": 6600267645.0, + "step": 12910 + }, + { + "epoch": 3.491346673877772, + "grad_norm": 0.9177525639533997, + "learning_rate": 5.96755431008341e-06, + "loss": 1.7711, + "mean_token_accuracy": 0.5874330997467041, + "num_tokens": 6600743218.0, + "step": 12911 + }, + { + "epoch": 3.4916170903190915, + "grad_norm": 1.0067757368087769, + "learning_rate": 5.966247384197596e-06, + "loss": 1.8184, + "mean_token_accuracy": 0.579633355140686, + "num_tokens": 6601209465.0, + "step": 12912 + }, + { + "epoch": 3.491887506760411, + "grad_norm": 0.8568516373634338, + "learning_rate": 5.964940612762118e-06, + "loss": 1.7757, + "mean_token_accuracy": 0.5704641342163086, + "num_tokens": 6601733684.0, + "step": 12913 + }, + { + "epoch": 3.492157923201731, + "grad_norm": 0.8605145215988159, + "learning_rate": 5.963633995817063e-06, + "loss": 1.6249, + "mean_token_accuracy": 0.6182429790496826, + "num_tokens": 6602257809.0, + "step": 12914 + }, + { + "epoch": 3.4924283396430504, + "grad_norm": 0.9940999150276184, + "learning_rate": 5.962327533402533e-06, + "loss": 1.8657, + "mean_token_accuracy": 0.5775282979011536, + "num_tokens": 6602781932.0, + "step": 12915 + }, + { + "epoch": 3.49269875608437, + "grad_norm": 1.0409431457519531, + "learning_rate": 5.9610212255586085e-06, + "loss": 1.9659, + "mean_token_accuracy": 0.5515623688697815, + "num_tokens": 6603306208.0, + "step": 12916 + }, + { + "epoch": 3.4929691725256897, + "grad_norm": 0.8967425227165222, + "learning_rate": 5.959715072325369e-06, + "loss": 1.8613, + "mean_token_accuracy": 0.5626143217086792, + "num_tokens": 6603794145.0, + "step": 12917 + }, + { + "epoch": 3.4932395889670094, + "grad_norm": 0.9314039349555969, + "learning_rate": 5.9584090737428926e-06, + "loss": 1.8061, + "mean_token_accuracy": 0.5829017758369446, + "num_tokens": 6604267136.0, + "step": 12918 + }, + { + "epoch": 3.493510005408329, + "grad_norm": 1.0533839464187622, + "learning_rate": 5.957103229851255e-06, + "loss": 1.9374, + "mean_token_accuracy": 0.5446521043777466, + "num_tokens": 6604791344.0, + "step": 12919 + }, + { + "epoch": 3.4937804218496487, + "grad_norm": 0.9128528237342834, + "learning_rate": 5.955797540690519e-06, + "loss": 1.8206, + "mean_token_accuracy": 0.587699294090271, + "num_tokens": 6605315607.0, + "step": 12920 + }, + { + "epoch": 3.494050838290968, + "grad_norm": 0.3610713481903076, + "learning_rate": 5.95449200630075e-06, + "loss": 1.1432, + "mean_token_accuracy": 0.7100294828414917, + "num_tokens": 6605839822.0, + "step": 12921 + }, + { + "epoch": 3.494321254732288, + "grad_norm": 1.250977873802185, + "learning_rate": 5.953186626722005e-06, + "loss": 1.9356, + "mean_token_accuracy": 0.5659542083740234, + "num_tokens": 6606307362.0, + "step": 12922 + }, + { + "epoch": 3.494591671173607, + "grad_norm": 1.219620704650879, + "learning_rate": 5.951881401994334e-06, + "loss": 1.9664, + "mean_token_accuracy": 0.5436779260635376, + "num_tokens": 6606831445.0, + "step": 12923 + }, + { + "epoch": 3.4948620876149272, + "grad_norm": 0.9017209410667419, + "learning_rate": 5.950576332157789e-06, + "loss": 1.7775, + "mean_token_accuracy": 0.5740190744400024, + "num_tokens": 6607355729.0, + "step": 12924 + }, + { + "epoch": 3.4951325040562464, + "grad_norm": 0.9006220698356628, + "learning_rate": 5.9492714172524115e-06, + "loss": 1.8159, + "mean_token_accuracy": 0.5657656192779541, + "num_tokens": 6607879922.0, + "step": 12925 + }, + { + "epoch": 3.495402920497566, + "grad_norm": 0.9404345750808716, + "learning_rate": 5.947966657318237e-06, + "loss": 1.8103, + "mean_token_accuracy": 0.5616577863693237, + "num_tokens": 6608404139.0, + "step": 12926 + }, + { + "epoch": 3.4956733369388857, + "grad_norm": 1.2619518041610718, + "learning_rate": 5.946662052395305e-06, + "loss": 1.4756, + "mean_token_accuracy": 0.6304227709770203, + "num_tokens": 6608906903.0, + "step": 12927 + }, + { + "epoch": 3.4959437533802054, + "grad_norm": 1.147719383239746, + "learning_rate": 5.9453576025236426e-06, + "loss": 1.8256, + "mean_token_accuracy": 0.5917780995368958, + "num_tokens": 6609381898.0, + "step": 12928 + }, + { + "epoch": 3.496214169821525, + "grad_norm": 1.0889979600906372, + "learning_rate": 5.944053307743272e-06, + "loss": 1.8162, + "mean_token_accuracy": 0.5725512504577637, + "num_tokens": 6609906089.0, + "step": 12929 + }, + { + "epoch": 3.4964845862628446, + "grad_norm": 1.0508719682693481, + "learning_rate": 5.942749168094215e-06, + "loss": 1.8824, + "mean_token_accuracy": 0.5658979415893555, + "num_tokens": 6610430349.0, + "step": 12930 + }, + { + "epoch": 3.4967550027041643, + "grad_norm": 1.0934730768203735, + "learning_rate": 5.941445183616485e-06, + "loss": 1.803, + "mean_token_accuracy": 0.5977611541748047, + "num_tokens": 6610883739.0, + "step": 12931 + }, + { + "epoch": 3.497025419145484, + "grad_norm": 1.1378101110458374, + "learning_rate": 5.940141354350095e-06, + "loss": 1.8706, + "mean_token_accuracy": 0.5808666944503784, + "num_tokens": 6611384444.0, + "step": 12932 + }, + { + "epoch": 3.4972958355868036, + "grad_norm": 0.8970110416412354, + "learning_rate": 5.938837680335046e-06, + "loss": 1.7913, + "mean_token_accuracy": 0.5887401103973389, + "num_tokens": 6611908582.0, + "step": 12933 + }, + { + "epoch": 3.497566252028123, + "grad_norm": 0.9970220923423767, + "learning_rate": 5.93753416161134e-06, + "loss": 1.8692, + "mean_token_accuracy": 0.5901631712913513, + "num_tokens": 6612432857.0, + "step": 12934 + }, + { + "epoch": 3.497836668469443, + "grad_norm": 0.9170949459075928, + "learning_rate": 5.936230798218977e-06, + "loss": 1.8892, + "mean_token_accuracy": 0.575149416923523, + "num_tokens": 6612892455.0, + "step": 12935 + }, + { + "epoch": 3.4981070849107625, + "grad_norm": 0.9807799458503723, + "learning_rate": 5.934927590197945e-06, + "loss": 1.7424, + "mean_token_accuracy": 0.5965072512626648, + "num_tokens": 6613398828.0, + "step": 12936 + }, + { + "epoch": 3.498377501352082, + "grad_norm": 0.8983471989631653, + "learning_rate": 5.933624537588226e-06, + "loss": 1.8488, + "mean_token_accuracy": 0.5726140141487122, + "num_tokens": 6613922989.0, + "step": 12937 + }, + { + "epoch": 3.498647917793402, + "grad_norm": 0.8132741451263428, + "learning_rate": 5.9323216404298104e-06, + "loss": 1.7801, + "mean_token_accuracy": 0.583609402179718, + "num_tokens": 6614447257.0, + "step": 12938 + }, + { + "epoch": 3.4989183342347214, + "grad_norm": 0.8612746000289917, + "learning_rate": 5.931018898762668e-06, + "loss": 1.9421, + "mean_token_accuracy": 0.5700318813323975, + "num_tokens": 6614937044.0, + "step": 12939 + }, + { + "epoch": 3.499188750676041, + "grad_norm": 1.0143518447875977, + "learning_rate": 5.9297163126267705e-06, + "loss": 1.7898, + "mean_token_accuracy": 0.5837914347648621, + "num_tokens": 6615461218.0, + "step": 12940 + }, + { + "epoch": 3.4994591671173607, + "grad_norm": 0.3425493836402893, + "learning_rate": 5.928413882062091e-06, + "loss": 1.1309, + "mean_token_accuracy": 0.6933432221412659, + "num_tokens": 6615985445.0, + "step": 12941 + }, + { + "epoch": 3.4997295835586804, + "grad_norm": 1.0125237703323364, + "learning_rate": 5.927111607108587e-06, + "loss": 1.935, + "mean_token_accuracy": 0.5721540451049805, + "num_tokens": 6616426148.0, + "step": 12942 + }, + { + "epoch": 3.5, + "grad_norm": 0.9863653182983398, + "learning_rate": 5.925809487806215e-06, + "loss": 1.6809, + "mean_token_accuracy": 0.6001044511795044, + "num_tokens": 6616922858.0, + "step": 12943 + }, + { + "epoch": 3.5002704164413196, + "grad_norm": 0.9419193863868713, + "learning_rate": 5.924507524194932e-06, + "loss": 1.7762, + "mean_token_accuracy": 0.6009873151779175, + "num_tokens": 6617447076.0, + "step": 12944 + }, + { + "epoch": 3.5005408328826393, + "grad_norm": 0.8497679233551025, + "learning_rate": 5.923205716314682e-06, + "loss": 1.5799, + "mean_token_accuracy": 0.6413734555244446, + "num_tokens": 6617934461.0, + "step": 12945 + }, + { + "epoch": 3.500811249323959, + "grad_norm": 0.899267315864563, + "learning_rate": 5.9219040642054135e-06, + "loss": 1.9285, + "mean_token_accuracy": 0.5532706379890442, + "num_tokens": 6618458717.0, + "step": 12946 + }, + { + "epoch": 3.5010816657652786, + "grad_norm": 0.8548554182052612, + "learning_rate": 5.920602567907059e-06, + "loss": 1.7023, + "mean_token_accuracy": 0.6253461837768555, + "num_tokens": 6618983005.0, + "step": 12947 + }, + { + "epoch": 3.501352082206598, + "grad_norm": 0.7667780518531799, + "learning_rate": 5.919301227459558e-06, + "loss": 1.758, + "mean_token_accuracy": 0.5820272564888, + "num_tokens": 6619507263.0, + "step": 12948 + }, + { + "epoch": 3.501622498647918, + "grad_norm": 0.910281240940094, + "learning_rate": 5.918000042902834e-06, + "loss": 1.8705, + "mean_token_accuracy": 0.5823670625686646, + "num_tokens": 6620017085.0, + "step": 12949 + }, + { + "epoch": 3.5018929150892375, + "grad_norm": 0.8787257671356201, + "learning_rate": 5.916699014276817e-06, + "loss": 1.8435, + "mean_token_accuracy": 0.5558568835258484, + "num_tokens": 6620516172.0, + "step": 12950 + }, + { + "epoch": 3.502163331530557, + "grad_norm": 0.9929347038269043, + "learning_rate": 5.915398141621425e-06, + "loss": 1.9031, + "mean_token_accuracy": 0.558354377746582, + "num_tokens": 6621040453.0, + "step": 12951 + }, + { + "epoch": 3.502433747971877, + "grad_norm": 1.052330493927002, + "learning_rate": 5.914097424976564e-06, + "loss": 1.8947, + "mean_token_accuracy": 0.5837869644165039, + "num_tokens": 6621531663.0, + "step": 12952 + }, + { + "epoch": 3.5027041644131964, + "grad_norm": 0.9300360083580017, + "learning_rate": 5.912796864382155e-06, + "loss": 1.8608, + "mean_token_accuracy": 0.566227376461029, + "num_tokens": 6622055899.0, + "step": 12953 + }, + { + "epoch": 3.502974580854516, + "grad_norm": 0.9140810370445251, + "learning_rate": 5.911496459878101e-06, + "loss": 1.9136, + "mean_token_accuracy": 0.5663272738456726, + "num_tokens": 6622580150.0, + "step": 12954 + }, + { + "epoch": 3.5032449972958357, + "grad_norm": 0.8531609177589417, + "learning_rate": 5.910196211504294e-06, + "loss": 1.7257, + "mean_token_accuracy": 0.5888276100158691, + "num_tokens": 6623104320.0, + "step": 12955 + }, + { + "epoch": 3.5035154137371554, + "grad_norm": 1.0839166641235352, + "learning_rate": 5.90889611930064e-06, + "loss": 1.8783, + "mean_token_accuracy": 0.5669271349906921, + "num_tokens": 6623628564.0, + "step": 12956 + }, + { + "epoch": 3.503785830178475, + "grad_norm": 0.931039571762085, + "learning_rate": 5.907596183307021e-06, + "loss": 1.8505, + "mean_token_accuracy": 0.5780396461486816, + "num_tokens": 6624105105.0, + "step": 12957 + }, + { + "epoch": 3.5040562466197946, + "grad_norm": 1.0340598821640015, + "learning_rate": 5.9062964035633306e-06, + "loss": 1.8803, + "mean_token_accuracy": 0.5540797114372253, + "num_tokens": 6624629151.0, + "step": 12958 + }, + { + "epoch": 3.5043266630611143, + "grad_norm": 0.8938749432563782, + "learning_rate": 5.904996780109447e-06, + "loss": 1.7489, + "mean_token_accuracy": 0.5841134786605835, + "num_tokens": 6625143143.0, + "step": 12959 + }, + { + "epoch": 3.5045970795024335, + "grad_norm": 1.0128283500671387, + "learning_rate": 5.903697312985241e-06, + "loss": 1.8626, + "mean_token_accuracy": 0.5758397579193115, + "num_tokens": 6625667194.0, + "step": 12960 + }, + { + "epoch": 3.5048674959437536, + "grad_norm": 0.37143316864967346, + "learning_rate": 5.902398002230589e-06, + "loss": 1.1314, + "mean_token_accuracy": 0.7013002634048462, + "num_tokens": 6626161327.0, + "step": 12961 + }, + { + "epoch": 3.5051379123850728, + "grad_norm": 1.1571439504623413, + "learning_rate": 5.901098847885361e-06, + "loss": 1.8102, + "mean_token_accuracy": 0.5924717783927917, + "num_tokens": 6626662120.0, + "step": 12962 + }, + { + "epoch": 3.505408328826393, + "grad_norm": 1.0556581020355225, + "learning_rate": 5.899799849989411e-06, + "loss": 1.8246, + "mean_token_accuracy": 0.5926296710968018, + "num_tokens": 6627133041.0, + "step": 12963 + }, + { + "epoch": 3.505678745267712, + "grad_norm": 0.8756210803985596, + "learning_rate": 5.898501008582605e-06, + "loss": 1.8, + "mean_token_accuracy": 0.5806558728218079, + "num_tokens": 6627657249.0, + "step": 12964 + }, + { + "epoch": 3.505949161709032, + "grad_norm": 1.18375825881958, + "learning_rate": 5.897202323704789e-06, + "loss": 1.6523, + "mean_token_accuracy": 0.6013831496238708, + "num_tokens": 6628139269.0, + "step": 12965 + }, + { + "epoch": 3.5062195781503513, + "grad_norm": 1.1349818706512451, + "learning_rate": 5.89590379539581e-06, + "loss": 1.7083, + "mean_token_accuracy": 0.6060417890548706, + "num_tokens": 6628630684.0, + "step": 12966 + }, + { + "epoch": 3.5064899945916714, + "grad_norm": 1.0441666841506958, + "learning_rate": 5.894605423695517e-06, + "loss": 1.7417, + "mean_token_accuracy": 0.5808022022247314, + "num_tokens": 6629154802.0, + "step": 12967 + }, + { + "epoch": 3.5067604110329906, + "grad_norm": 0.9995533227920532, + "learning_rate": 5.8933072086437415e-06, + "loss": 1.8034, + "mean_token_accuracy": 0.5748150944709778, + "num_tokens": 6629678955.0, + "step": 12968 + }, + { + "epoch": 3.5070308274743103, + "grad_norm": 0.9150555729866028, + "learning_rate": 5.892009150280317e-06, + "loss": 1.865, + "mean_token_accuracy": 0.5819418430328369, + "num_tokens": 6630203102.0, + "step": 12969 + }, + { + "epoch": 3.50730124391563, + "grad_norm": 1.0791183710098267, + "learning_rate": 5.890711248645076e-06, + "loss": 1.8882, + "mean_token_accuracy": 0.5646302103996277, + "num_tokens": 6630727366.0, + "step": 12970 + }, + { + "epoch": 3.5075716603569496, + "grad_norm": 1.416555404663086, + "learning_rate": 5.889413503777839e-06, + "loss": 1.7846, + "mean_token_accuracy": 0.5670610666275024, + "num_tokens": 6631192001.0, + "step": 12971 + }, + { + "epoch": 3.507842076798269, + "grad_norm": 1.0181165933609009, + "learning_rate": 5.888115915718422e-06, + "loss": 1.8225, + "mean_token_accuracy": 0.5836979150772095, + "num_tokens": 6631667299.0, + "step": 12972 + }, + { + "epoch": 3.508112493239589, + "grad_norm": 0.8822299838066101, + "learning_rate": 5.886818484506644e-06, + "loss": 1.775, + "mean_token_accuracy": 0.581214964389801, + "num_tokens": 6632191371.0, + "step": 12973 + }, + { + "epoch": 3.5083829096809085, + "grad_norm": 1.1371707916259766, + "learning_rate": 5.88552121018231e-06, + "loss": 1.9283, + "mean_token_accuracy": 0.5432422161102295, + "num_tokens": 6632715631.0, + "step": 12974 + }, + { + "epoch": 3.508653326122228, + "grad_norm": 1.4169448614120483, + "learning_rate": 5.884224092785224e-06, + "loss": 1.9209, + "mean_token_accuracy": 0.5669207572937012, + "num_tokens": 6633239854.0, + "step": 12975 + }, + { + "epoch": 3.5089237425635478, + "grad_norm": 0.9714102745056152, + "learning_rate": 5.882927132355194e-06, + "loss": 1.8779, + "mean_token_accuracy": 0.579399824142456, + "num_tokens": 6633753293.0, + "step": 12976 + }, + { + "epoch": 3.5091941590048674, + "grad_norm": 0.9726839065551758, + "learning_rate": 5.881630328932004e-06, + "loss": 1.7587, + "mean_token_accuracy": 0.5839186310768127, + "num_tokens": 6634277527.0, + "step": 12977 + }, + { + "epoch": 3.509464575446187, + "grad_norm": 0.9528732299804688, + "learning_rate": 5.880333682555448e-06, + "loss": 1.8558, + "mean_token_accuracy": 0.5681378841400146, + "num_tokens": 6634801798.0, + "step": 12978 + }, + { + "epoch": 3.5097349918875067, + "grad_norm": 1.3912363052368164, + "learning_rate": 5.8790371932653116e-06, + "loss": 1.8785, + "mean_token_accuracy": 0.5783548951148987, + "num_tokens": 6635238939.0, + "step": 12979 + }, + { + "epoch": 3.5100054083288263, + "grad_norm": 0.9653294086456299, + "learning_rate": 5.877740861101371e-06, + "loss": 1.9129, + "mean_token_accuracy": 0.5603879690170288, + "num_tokens": 6635763205.0, + "step": 12980 + }, + { + "epoch": 3.510275824770146, + "grad_norm": 0.3378996253013611, + "learning_rate": 5.876444686103407e-06, + "loss": 1.0495, + "mean_token_accuracy": 0.7096922397613525, + "num_tokens": 6636287336.0, + "step": 12981 + }, + { + "epoch": 3.5105462412114656, + "grad_norm": 1.1871461868286133, + "learning_rate": 5.87514866831119e-06, + "loss": 1.7865, + "mean_token_accuracy": 0.6163528561592102, + "num_tokens": 6636733668.0, + "step": 12982 + }, + { + "epoch": 3.5108166576527853, + "grad_norm": 1.4338113069534302, + "learning_rate": 5.873852807764477e-06, + "loss": 1.8947, + "mean_token_accuracy": 0.571265459060669, + "num_tokens": 6637210995.0, + "step": 12983 + }, + { + "epoch": 3.511087074094105, + "grad_norm": 1.1869922876358032, + "learning_rate": 5.872557104503041e-06, + "loss": 1.9602, + "mean_token_accuracy": 0.5628913044929504, + "num_tokens": 6637673947.0, + "step": 12984 + }, + { + "epoch": 3.5113574905354246, + "grad_norm": 1.034508228302002, + "learning_rate": 5.8712615585666275e-06, + "loss": 1.8949, + "mean_token_accuracy": 0.5582200288772583, + "num_tokens": 6638198190.0, + "step": 12985 + }, + { + "epoch": 3.511627906976744, + "grad_norm": 1.0633963346481323, + "learning_rate": 5.869966169994993e-06, + "loss": 1.7569, + "mean_token_accuracy": 0.5911575555801392, + "num_tokens": 6638722463.0, + "step": 12986 + }, + { + "epoch": 3.511898323418064, + "grad_norm": 1.0344138145446777, + "learning_rate": 5.868670938827884e-06, + "loss": 1.7838, + "mean_token_accuracy": 0.6000741720199585, + "num_tokens": 6639208254.0, + "step": 12987 + }, + { + "epoch": 3.5121687398593835, + "grad_norm": 0.8715531229972839, + "learning_rate": 5.867375865105038e-06, + "loss": 1.8907, + "mean_token_accuracy": 0.5661745071411133, + "num_tokens": 6639732524.0, + "step": 12988 + }, + { + "epoch": 3.512439156300703, + "grad_norm": 0.8964660167694092, + "learning_rate": 5.8660809488661975e-06, + "loss": 1.7528, + "mean_token_accuracy": 0.5721985101699829, + "num_tokens": 6640256703.0, + "step": 12989 + }, + { + "epoch": 3.512709572742023, + "grad_norm": 0.9824574589729309, + "learning_rate": 5.864786190151088e-06, + "loss": 1.8351, + "mean_token_accuracy": 0.5842907428741455, + "num_tokens": 6640780843.0, + "step": 12990 + }, + { + "epoch": 3.5129799891833424, + "grad_norm": 0.9611212611198425, + "learning_rate": 5.863491588999444e-06, + "loss": 1.8885, + "mean_token_accuracy": 0.5744364857673645, + "num_tokens": 6641305119.0, + "step": 12991 + }, + { + "epoch": 3.513250405624662, + "grad_norm": 0.9991870522499084, + "learning_rate": 5.8621971454509795e-06, + "loss": 1.8158, + "mean_token_accuracy": 0.5922151803970337, + "num_tokens": 6641829321.0, + "step": 12992 + }, + { + "epoch": 3.5135208220659817, + "grad_norm": 0.9772365689277649, + "learning_rate": 5.86090285954542e-06, + "loss": 1.7746, + "mean_token_accuracy": 0.599950909614563, + "num_tokens": 6642340482.0, + "step": 12993 + }, + { + "epoch": 3.5137912385073014, + "grad_norm": 0.8908132314682007, + "learning_rate": 5.8596087313224724e-06, + "loss": 1.9616, + "mean_token_accuracy": 0.5662550330162048, + "num_tokens": 6642864735.0, + "step": 12994 + }, + { + "epoch": 3.514061654948621, + "grad_norm": 1.035903811454773, + "learning_rate": 5.8583147608218435e-06, + "loss": 1.9306, + "mean_token_accuracy": 0.5475022792816162, + "num_tokens": 6643354208.0, + "step": 12995 + }, + { + "epoch": 3.5143320713899406, + "grad_norm": 1.0330517292022705, + "learning_rate": 5.8570209480832415e-06, + "loss": 1.8913, + "mean_token_accuracy": 0.5645293593406677, + "num_tokens": 6643878366.0, + "step": 12996 + }, + { + "epoch": 3.5146024878312603, + "grad_norm": 1.0700650215148926, + "learning_rate": 5.8557272931463625e-06, + "loss": 1.8569, + "mean_token_accuracy": 0.5761188268661499, + "num_tokens": 6644402547.0, + "step": 12997 + }, + { + "epoch": 3.51487290427258, + "grad_norm": 1.079682469367981, + "learning_rate": 5.8544337960508936e-06, + "loss": 1.8475, + "mean_token_accuracy": 0.5702213048934937, + "num_tokens": 6644926741.0, + "step": 12998 + }, + { + "epoch": 3.5151433207138996, + "grad_norm": 1.2424544095993042, + "learning_rate": 5.853140456836532e-06, + "loss": 1.7449, + "mean_token_accuracy": 0.6050148010253906, + "num_tokens": 6645414301.0, + "step": 12999 + }, + { + "epoch": 3.515413737155219, + "grad_norm": 1.415329933166504, + "learning_rate": 5.851847275542957e-06, + "loss": 1.7686, + "mean_token_accuracy": 0.568270742893219, + "num_tokens": 6645938472.0, + "step": 13000 + }, + { + "epoch": 3.5156841535965384, + "grad_norm": 0.34221798181533813, + "learning_rate": 5.850554252209843e-06, + "loss": 1.0552, + "mean_token_accuracy": 0.7139130234718323, + "num_tokens": 6646462667.0, + "step": 13001 + }, + { + "epoch": 3.5159545700378585, + "grad_norm": 1.3645108938217163, + "learning_rate": 5.8492613868768734e-06, + "loss": 1.8273, + "mean_token_accuracy": 0.5803086757659912, + "num_tokens": 6646986746.0, + "step": 13002 + }, + { + "epoch": 3.5162249864791777, + "grad_norm": 1.1824266910552979, + "learning_rate": 5.8479686795837075e-06, + "loss": 1.9057, + "mean_token_accuracy": 0.5461108088493347, + "num_tokens": 6647511028.0, + "step": 13003 + }, + { + "epoch": 3.516495402920498, + "grad_norm": 1.0895583629608154, + "learning_rate": 5.8466761303700135e-06, + "loss": 1.9156, + "mean_token_accuracy": 0.5789953470230103, + "num_tokens": 6647983682.0, + "step": 13004 + }, + { + "epoch": 3.516765819361817, + "grad_norm": 0.9888282418251038, + "learning_rate": 5.845383739275454e-06, + "loss": 1.8197, + "mean_token_accuracy": 0.5687854886054993, + "num_tokens": 6648507950.0, + "step": 13005 + }, + { + "epoch": 3.517036235803137, + "grad_norm": 1.1187858581542969, + "learning_rate": 5.844091506339677e-06, + "loss": 1.8328, + "mean_token_accuracy": 0.5682967901229858, + "num_tokens": 6649032073.0, + "step": 13006 + }, + { + "epoch": 3.5173066522444563, + "grad_norm": 1.066838026046753, + "learning_rate": 5.842799431602337e-06, + "loss": 1.8149, + "mean_token_accuracy": 0.5774533748626709, + "num_tokens": 6649556219.0, + "step": 13007 + }, + { + "epoch": 3.5175770686857764, + "grad_norm": 0.973744809627533, + "learning_rate": 5.84150751510308e-06, + "loss": 1.8855, + "mean_token_accuracy": 0.5641525983810425, + "num_tokens": 6650080361.0, + "step": 13008 + }, + { + "epoch": 3.5178474851270956, + "grad_norm": 1.0455901622772217, + "learning_rate": 5.840215756881537e-06, + "loss": 1.7153, + "mean_token_accuracy": 0.6078248023986816, + "num_tokens": 6650604564.0, + "step": 13009 + }, + { + "epoch": 3.518117901568415, + "grad_norm": 1.1461540460586548, + "learning_rate": 5.8389241569773524e-06, + "loss": 1.805, + "mean_token_accuracy": 0.5898034572601318, + "num_tokens": 6651068972.0, + "step": 13010 + }, + { + "epoch": 3.518388318009735, + "grad_norm": 0.9622243046760559, + "learning_rate": 5.837632715430151e-06, + "loss": 1.7179, + "mean_token_accuracy": 0.5979390144348145, + "num_tokens": 6651593071.0, + "step": 13011 + }, + { + "epoch": 3.5186587344510545, + "grad_norm": 1.267812728881836, + "learning_rate": 5.836341432279559e-06, + "loss": 1.8515, + "mean_token_accuracy": 0.5638260245323181, + "num_tokens": 6652070120.0, + "step": 13012 + }, + { + "epoch": 3.518929150892374, + "grad_norm": 1.2354344129562378, + "learning_rate": 5.835050307565198e-06, + "loss": 1.7607, + "mean_token_accuracy": 0.5897657871246338, + "num_tokens": 6652594347.0, + "step": 13013 + }, + { + "epoch": 3.5191995673336938, + "grad_norm": 1.009220004081726, + "learning_rate": 5.833759341326683e-06, + "loss": 1.7645, + "mean_token_accuracy": 0.5885261297225952, + "num_tokens": 6653118535.0, + "step": 13014 + }, + { + "epoch": 3.5194699837750134, + "grad_norm": 1.0139002799987793, + "learning_rate": 5.832468533603623e-06, + "loss": 1.7819, + "mean_token_accuracy": 0.6000238656997681, + "num_tokens": 6653532419.0, + "step": 13015 + }, + { + "epoch": 3.519740400216333, + "grad_norm": 1.355819821357727, + "learning_rate": 5.831177884435626e-06, + "loss": 1.9135, + "mean_token_accuracy": 0.5633413195610046, + "num_tokens": 6654056697.0, + "step": 13016 + }, + { + "epoch": 3.5200108166576527, + "grad_norm": 1.159847378730774, + "learning_rate": 5.82988739386229e-06, + "loss": 1.8058, + "mean_token_accuracy": 0.5747482180595398, + "num_tokens": 6654580939.0, + "step": 13017 + }, + { + "epoch": 3.5202812330989723, + "grad_norm": 1.1621425151824951, + "learning_rate": 5.828597061923213e-06, + "loss": 1.6992, + "mean_token_accuracy": 0.6184910535812378, + "num_tokens": 6655105209.0, + "step": 13018 + }, + { + "epoch": 3.520551649540292, + "grad_norm": 1.0835431814193726, + "learning_rate": 5.827306888657988e-06, + "loss": 1.7652, + "mean_token_accuracy": 0.5864561796188354, + "num_tokens": 6655629288.0, + "step": 13019 + }, + { + "epoch": 3.5208220659816116, + "grad_norm": 1.1021981239318848, + "learning_rate": 5.826016874106202e-06, + "loss": 1.8188, + "mean_token_accuracy": 0.5718506574630737, + "num_tokens": 6656153552.0, + "step": 13020 + }, + { + "epoch": 3.5210924824229313, + "grad_norm": 0.378751277923584, + "learning_rate": 5.82472701830743e-06, + "loss": 1.1247, + "mean_token_accuracy": 0.7044860124588013, + "num_tokens": 6656657038.0, + "step": 13021 + }, + { + "epoch": 3.521362898864251, + "grad_norm": 1.4390223026275635, + "learning_rate": 5.823437321301255e-06, + "loss": 1.8588, + "mean_token_accuracy": 0.5847012996673584, + "num_tokens": 6657117148.0, + "step": 13022 + }, + { + "epoch": 3.5216333153055706, + "grad_norm": 1.2736003398895264, + "learning_rate": 5.822147783127248e-06, + "loss": 1.741, + "mean_token_accuracy": 0.5655233860015869, + "num_tokens": 6657641301.0, + "step": 13023 + }, + { + "epoch": 3.52190373174689, + "grad_norm": 0.9806120991706848, + "learning_rate": 5.820858403824967e-06, + "loss": 1.7501, + "mean_token_accuracy": 0.5924304127693176, + "num_tokens": 6658165484.0, + "step": 13024 + }, + { + "epoch": 3.52217414818821, + "grad_norm": 0.8866348266601562, + "learning_rate": 5.819569183433988e-06, + "loss": 1.7904, + "mean_token_accuracy": 0.5802282094955444, + "num_tokens": 6658689662.0, + "step": 13025 + }, + { + "epoch": 3.5224445646295295, + "grad_norm": 1.1697067022323608, + "learning_rate": 5.818280121993856e-06, + "loss": 1.891, + "mean_token_accuracy": 0.5770821571350098, + "num_tokens": 6659213853.0, + "step": 13026 + }, + { + "epoch": 3.522714981070849, + "grad_norm": 0.8686885833740234, + "learning_rate": 5.816991219544132e-06, + "loss": 1.8124, + "mean_token_accuracy": 0.5755792260169983, + "num_tokens": 6659733672.0, + "step": 13027 + }, + { + "epoch": 3.5229853975121688, + "grad_norm": 0.9312485456466675, + "learning_rate": 5.815702476124361e-06, + "loss": 1.7956, + "mean_token_accuracy": 0.5847642421722412, + "num_tokens": 6660239357.0, + "step": 13028 + }, + { + "epoch": 3.5232558139534884, + "grad_norm": 0.9224827885627747, + "learning_rate": 5.81441389177408e-06, + "loss": 1.9053, + "mean_token_accuracy": 0.5746525526046753, + "num_tokens": 6660763511.0, + "step": 13029 + }, + { + "epoch": 3.523526230394808, + "grad_norm": 1.103546380996704, + "learning_rate": 5.813125466532832e-06, + "loss": 1.915, + "mean_token_accuracy": 0.5598680973052979, + "num_tokens": 6661287794.0, + "step": 13030 + }, + { + "epoch": 3.5237966468361277, + "grad_norm": 0.842525839805603, + "learning_rate": 5.8118372004401465e-06, + "loss": 1.7867, + "mean_token_accuracy": 0.5901552438735962, + "num_tokens": 6661811993.0, + "step": 13031 + }, + { + "epoch": 3.5240670632774473, + "grad_norm": 0.8720224499702454, + "learning_rate": 5.810549093535557e-06, + "loss": 1.8594, + "mean_token_accuracy": 0.5791867971420288, + "num_tokens": 6662336178.0, + "step": 13032 + }, + { + "epoch": 3.524337479718767, + "grad_norm": 1.0196770429611206, + "learning_rate": 5.809261145858579e-06, + "loss": 1.8429, + "mean_token_accuracy": 0.5640180110931396, + "num_tokens": 6662860351.0, + "step": 13033 + }, + { + "epoch": 3.5246078961600866, + "grad_norm": 0.9514038562774658, + "learning_rate": 5.807973357448736e-06, + "loss": 1.7869, + "mean_token_accuracy": 0.5904736518859863, + "num_tokens": 6663384531.0, + "step": 13034 + }, + { + "epoch": 3.5248783126014063, + "grad_norm": 1.0531867742538452, + "learning_rate": 5.806685728345537e-06, + "loss": 1.7714, + "mean_token_accuracy": 0.5990086793899536, + "num_tokens": 6663878964.0, + "step": 13035 + }, + { + "epoch": 3.525148729042726, + "grad_norm": 0.9336692690849304, + "learning_rate": 5.8053982585884945e-06, + "loss": 1.9468, + "mean_token_accuracy": 0.5594379901885986, + "num_tokens": 6664403106.0, + "step": 13036 + }, + { + "epoch": 3.5254191454840456, + "grad_norm": 0.9919993877410889, + "learning_rate": 5.804110948217112e-06, + "loss": 1.8096, + "mean_token_accuracy": 0.5702265501022339, + "num_tokens": 6664927271.0, + "step": 13037 + }, + { + "epoch": 3.525689561925365, + "grad_norm": 0.9392915964126587, + "learning_rate": 5.8028237972708815e-06, + "loss": 1.8251, + "mean_token_accuracy": 0.5745741724967957, + "num_tokens": 6665451430.0, + "step": 13038 + }, + { + "epoch": 3.525959978366685, + "grad_norm": 0.8949126601219177, + "learning_rate": 5.801536805789303e-06, + "loss": 1.6507, + "mean_token_accuracy": 0.6085914373397827, + "num_tokens": 6665975570.0, + "step": 13039 + }, + { + "epoch": 3.5262303948080045, + "grad_norm": 0.9136037230491638, + "learning_rate": 5.800249973811865e-06, + "loss": 1.8417, + "mean_token_accuracy": 0.5943581461906433, + "num_tokens": 6666460486.0, + "step": 13040 + }, + { + "epoch": 3.526500811249324, + "grad_norm": 0.35022881627082825, + "learning_rate": 5.798963301378043e-06, + "loss": 1.0069, + "mean_token_accuracy": 0.7253736257553101, + "num_tokens": 6666984637.0, + "step": 13041 + }, + { + "epoch": 3.5267712276906433, + "grad_norm": 1.1315797567367554, + "learning_rate": 5.7976767885273285e-06, + "loss": 1.9467, + "mean_token_accuracy": 0.5473041534423828, + "num_tokens": 6667508746.0, + "step": 13042 + }, + { + "epoch": 3.5270416441319634, + "grad_norm": 1.1586005687713623, + "learning_rate": 5.7963904352991875e-06, + "loss": 1.7708, + "mean_token_accuracy": 0.5962967276573181, + "num_tokens": 6668032956.0, + "step": 13043 + }, + { + "epoch": 3.5273120605732826, + "grad_norm": 1.1032860279083252, + "learning_rate": 5.795104241733087e-06, + "loss": 1.9216, + "mean_token_accuracy": 0.5576088428497314, + "num_tokens": 6668556840.0, + "step": 13044 + }, + { + "epoch": 3.5275824770146027, + "grad_norm": 0.9543040990829468, + "learning_rate": 5.793818207868496e-06, + "loss": 1.9084, + "mean_token_accuracy": 0.5597995519638062, + "num_tokens": 6669081095.0, + "step": 13045 + }, + { + "epoch": 3.527852893455922, + "grad_norm": 1.028382420539856, + "learning_rate": 5.792532333744876e-06, + "loss": 1.8311, + "mean_token_accuracy": 0.5758463144302368, + "num_tokens": 6669605342.0, + "step": 13046 + }, + { + "epoch": 3.528123309897242, + "grad_norm": 1.2199715375900269, + "learning_rate": 5.791246619401674e-06, + "loss": 1.8689, + "mean_token_accuracy": 0.5654386878013611, + "num_tokens": 6670129591.0, + "step": 13047 + }, + { + "epoch": 3.528393726338561, + "grad_norm": 0.9951088428497314, + "learning_rate": 5.789961064878348e-06, + "loss": 1.7984, + "mean_token_accuracy": 0.5789644718170166, + "num_tokens": 6670605745.0, + "step": 13048 + }, + { + "epoch": 3.5286641427798813, + "grad_norm": 1.0919443368911743, + "learning_rate": 5.788675670214334e-06, + "loss": 1.8539, + "mean_token_accuracy": 0.5909978747367859, + "num_tokens": 6671076367.0, + "step": 13049 + }, + { + "epoch": 3.5289345592212005, + "grad_norm": 0.9246366024017334, + "learning_rate": 5.787390435449081e-06, + "loss": 1.7973, + "mean_token_accuracy": 0.5752096772193909, + "num_tokens": 6671600615.0, + "step": 13050 + }, + { + "epoch": 3.52920497566252, + "grad_norm": 1.0005199909210205, + "learning_rate": 5.786105360622017e-06, + "loss": 1.9117, + "mean_token_accuracy": 0.5461336970329285, + "num_tokens": 6672124819.0, + "step": 13051 + }, + { + "epoch": 3.5294753921038398, + "grad_norm": 0.8916866779327393, + "learning_rate": 5.78482044577257e-06, + "loss": 1.814, + "mean_token_accuracy": 0.57929927110672, + "num_tokens": 6672649023.0, + "step": 13052 + }, + { + "epoch": 3.5297458085451594, + "grad_norm": 0.8885443806648254, + "learning_rate": 5.783535690940174e-06, + "loss": 1.7539, + "mean_token_accuracy": 0.591392993927002, + "num_tokens": 6673173299.0, + "step": 13053 + }, + { + "epoch": 3.530016224986479, + "grad_norm": 1.056655764579773, + "learning_rate": 5.7822510961642415e-06, + "loss": 1.8957, + "mean_token_accuracy": 0.5489163398742676, + "num_tokens": 6673697515.0, + "step": 13054 + }, + { + "epoch": 3.5302866414277987, + "grad_norm": 1.133541226387024, + "learning_rate": 5.780966661484187e-06, + "loss": 1.8748, + "mean_token_accuracy": 0.5566906929016113, + "num_tokens": 6674221731.0, + "step": 13055 + }, + { + "epoch": 3.5305570578691183, + "grad_norm": 1.026625633239746, + "learning_rate": 5.779682386939428e-06, + "loss": 1.8164, + "mean_token_accuracy": 0.5683950185775757, + "num_tokens": 6674745779.0, + "step": 13056 + }, + { + "epoch": 3.530827474310438, + "grad_norm": 1.1112271547317505, + "learning_rate": 5.778398272569363e-06, + "loss": 1.681, + "mean_token_accuracy": 0.6152418851852417, + "num_tokens": 6675269856.0, + "step": 13057 + }, + { + "epoch": 3.5310978907517576, + "grad_norm": 0.9704968929290771, + "learning_rate": 5.777114318413393e-06, + "loss": 1.7777, + "mean_token_accuracy": 0.5655905604362488, + "num_tokens": 6675794114.0, + "step": 13058 + }, + { + "epoch": 3.5313683071930773, + "grad_norm": 1.4121240377426147, + "learning_rate": 5.775830524510919e-06, + "loss": 1.6341, + "mean_token_accuracy": 0.6115438938140869, + "num_tokens": 6676318372.0, + "step": 13059 + }, + { + "epoch": 3.531638723634397, + "grad_norm": 1.3226460218429565, + "learning_rate": 5.774546890901324e-06, + "loss": 1.855, + "mean_token_accuracy": 0.5685975551605225, + "num_tokens": 6676817819.0, + "step": 13060 + }, + { + "epoch": 3.5319091400757165, + "grad_norm": 0.3260459899902344, + "learning_rate": 5.773263417623997e-06, + "loss": 0.8794, + "mean_token_accuracy": 0.7402970790863037, + "num_tokens": 6677342054.0, + "step": 13061 + }, + { + "epoch": 3.532179556517036, + "grad_norm": 1.1408841609954834, + "learning_rate": 5.7719801047183235e-06, + "loss": 1.7414, + "mean_token_accuracy": 0.591833233833313, + "num_tokens": 6677866304.0, + "step": 13062 + }, + { + "epoch": 3.532449972958356, + "grad_norm": 1.2065191268920898, + "learning_rate": 5.770696952223676e-06, + "loss": 1.8076, + "mean_token_accuracy": 0.587560772895813, + "num_tokens": 6678390484.0, + "step": 13063 + }, + { + "epoch": 3.5327203893996755, + "grad_norm": 0.9831485152244568, + "learning_rate": 5.769413960179418e-06, + "loss": 1.9343, + "mean_token_accuracy": 0.5528703927993774, + "num_tokens": 6678914669.0, + "step": 13064 + }, + { + "epoch": 3.532990805840995, + "grad_norm": 0.964885413646698, + "learning_rate": 5.768131128624925e-06, + "loss": 1.925, + "mean_token_accuracy": 0.5472254753112793, + "num_tokens": 6679438935.0, + "step": 13065 + }, + { + "epoch": 3.5332612222823148, + "grad_norm": 1.088750958442688, + "learning_rate": 5.766848457599557e-06, + "loss": 1.9562, + "mean_token_accuracy": 0.5474923849105835, + "num_tokens": 6679963033.0, + "step": 13066 + }, + { + "epoch": 3.5335316387236344, + "grad_norm": 1.274719476699829, + "learning_rate": 5.765565947142664e-06, + "loss": 1.9091, + "mean_token_accuracy": 0.5727728605270386, + "num_tokens": 6680487210.0, + "step": 13067 + }, + { + "epoch": 3.533802055164954, + "grad_norm": 0.9491849541664124, + "learning_rate": 5.764283597293605e-06, + "loss": 1.881, + "mean_token_accuracy": 0.5892903804779053, + "num_tokens": 6680895639.0, + "step": 13068 + }, + { + "epoch": 3.5340724716062737, + "grad_norm": 1.1267344951629639, + "learning_rate": 5.763001408091723e-06, + "loss": 1.7971, + "mean_token_accuracy": 0.5748366713523865, + "num_tokens": 6681381564.0, + "step": 13069 + }, + { + "epoch": 3.5343428880475933, + "grad_norm": 1.2877594232559204, + "learning_rate": 5.761719379576354e-06, + "loss": 1.9221, + "mean_token_accuracy": 0.5618294477462769, + "num_tokens": 6681854880.0, + "step": 13070 + }, + { + "epoch": 3.534613304488913, + "grad_norm": 1.076206922531128, + "learning_rate": 5.760437511786843e-06, + "loss": 1.8073, + "mean_token_accuracy": 0.5872713327407837, + "num_tokens": 6682339218.0, + "step": 13071 + }, + { + "epoch": 3.5348837209302326, + "grad_norm": 0.9612404108047485, + "learning_rate": 5.759155804762515e-06, + "loss": 1.8117, + "mean_token_accuracy": 0.5639095306396484, + "num_tokens": 6682863311.0, + "step": 13072 + }, + { + "epoch": 3.5351541373715523, + "grad_norm": 1.192906379699707, + "learning_rate": 5.757874258542702e-06, + "loss": 1.8979, + "mean_token_accuracy": 0.563572108745575, + "num_tokens": 6683387398.0, + "step": 13073 + }, + { + "epoch": 3.535424553812872, + "grad_norm": 1.2058005332946777, + "learning_rate": 5.75659287316672e-06, + "loss": 1.7605, + "mean_token_accuracy": 0.5883275270462036, + "num_tokens": 6683911543.0, + "step": 13074 + }, + { + "epoch": 3.5356949702541915, + "grad_norm": 1.1843302249908447, + "learning_rate": 5.755311648673887e-06, + "loss": 1.8135, + "mean_token_accuracy": 0.5933212041854858, + "num_tokens": 6684403210.0, + "step": 13075 + }, + { + "epoch": 3.535965386695511, + "grad_norm": 0.8968179225921631, + "learning_rate": 5.754030585103522e-06, + "loss": 1.8068, + "mean_token_accuracy": 0.5839009284973145, + "num_tokens": 6684927404.0, + "step": 13076 + }, + { + "epoch": 3.536235803136831, + "grad_norm": 0.9264867305755615, + "learning_rate": 5.752749682494924e-06, + "loss": 1.7647, + "mean_token_accuracy": 0.5850076079368591, + "num_tokens": 6685451611.0, + "step": 13077 + }, + { + "epoch": 3.5365062195781505, + "grad_norm": 0.9943699240684509, + "learning_rate": 5.751468940887394e-06, + "loss": 1.5848, + "mean_token_accuracy": 0.6393392086029053, + "num_tokens": 6685937285.0, + "step": 13078 + }, + { + "epoch": 3.53677663601947, + "grad_norm": 1.1278769969940186, + "learning_rate": 5.750188360320236e-06, + "loss": 1.8881, + "mean_token_accuracy": 0.5756804943084717, + "num_tokens": 6686461469.0, + "step": 13079 + }, + { + "epoch": 3.5370470524607898, + "grad_norm": 1.0111392736434937, + "learning_rate": 5.7489079408327364e-06, + "loss": 1.8516, + "mean_token_accuracy": 0.5774108171463013, + "num_tokens": 6686942609.0, + "step": 13080 + }, + { + "epoch": 3.5373174689021094, + "grad_norm": 0.33337417244911194, + "learning_rate": 5.747627682464181e-06, + "loss": 1.1777, + "mean_token_accuracy": 0.679889440536499, + "num_tokens": 6687466748.0, + "step": 13081 + }, + { + "epoch": 3.537587885343429, + "grad_norm": 1.2629201412200928, + "learning_rate": 5.746347585253859e-06, + "loss": 1.8104, + "mean_token_accuracy": 0.6081457138061523, + "num_tokens": 6687990962.0, + "step": 13082 + }, + { + "epoch": 3.5378583017847482, + "grad_norm": 1.339820146560669, + "learning_rate": 5.745067649241042e-06, + "loss": 1.8767, + "mean_token_accuracy": 0.568226158618927, + "num_tokens": 6688515177.0, + "step": 13083 + }, + { + "epoch": 3.5381287182260683, + "grad_norm": 0.9877921938896179, + "learning_rate": 5.743787874464999e-06, + "loss": 1.7638, + "mean_token_accuracy": 0.5806041955947876, + "num_tokens": 6689008120.0, + "step": 13084 + }, + { + "epoch": 3.5383991346673875, + "grad_norm": 0.9157721996307373, + "learning_rate": 5.742508260965006e-06, + "loss": 1.8075, + "mean_token_accuracy": 0.565990686416626, + "num_tokens": 6689532294.0, + "step": 13085 + }, + { + "epoch": 3.5386695511087076, + "grad_norm": 1.2200340032577515, + "learning_rate": 5.741228808780321e-06, + "loss": 1.9775, + "mean_token_accuracy": 0.549994707107544, + "num_tokens": 6690056473.0, + "step": 13086 + }, + { + "epoch": 3.538939967550027, + "grad_norm": 1.0255738496780396, + "learning_rate": 5.739949517950197e-06, + "loss": 1.77, + "mean_token_accuracy": 0.5760245323181152, + "num_tokens": 6690580574.0, + "step": 13087 + }, + { + "epoch": 3.539210383991347, + "grad_norm": 0.9530889987945557, + "learning_rate": 5.738670388513892e-06, + "loss": 1.8076, + "mean_token_accuracy": 0.5915975570678711, + "num_tokens": 6691104707.0, + "step": 13088 + }, + { + "epoch": 3.539480800432666, + "grad_norm": 1.0926637649536133, + "learning_rate": 5.737391420510654e-06, + "loss": 1.8735, + "mean_token_accuracy": 0.5651357173919678, + "num_tokens": 6691628986.0, + "step": 13089 + }, + { + "epoch": 3.539751216873986, + "grad_norm": 1.094874382019043, + "learning_rate": 5.736112613979721e-06, + "loss": 1.931, + "mean_token_accuracy": 0.5757437944412231, + "num_tokens": 6692106719.0, + "step": 13090 + }, + { + "epoch": 3.5400216333153054, + "grad_norm": 0.9648391604423523, + "learning_rate": 5.734833968960336e-06, + "loss": 1.8155, + "mean_token_accuracy": 0.597266435623169, + "num_tokens": 6692561376.0, + "step": 13091 + }, + { + "epoch": 3.540292049756625, + "grad_norm": 0.9675382375717163, + "learning_rate": 5.733555485491729e-06, + "loss": 1.887, + "mean_token_accuracy": 0.5760549902915955, + "num_tokens": 6693085579.0, + "step": 13092 + }, + { + "epoch": 3.5405624661979447, + "grad_norm": 0.9241035580635071, + "learning_rate": 5.7322771636131235e-06, + "loss": 1.9467, + "mean_token_accuracy": 0.5660364031791687, + "num_tokens": 6693609807.0, + "step": 13093 + }, + { + "epoch": 3.5408328826392643, + "grad_norm": 0.9019832611083984, + "learning_rate": 5.730999003363749e-06, + "loss": 1.8848, + "mean_token_accuracy": 0.5706406831741333, + "num_tokens": 6694106328.0, + "step": 13094 + }, + { + "epoch": 3.541103299080584, + "grad_norm": 1.1013203859329224, + "learning_rate": 5.729721004782818e-06, + "loss": 1.7712, + "mean_token_accuracy": 0.601104736328125, + "num_tokens": 6694630581.0, + "step": 13095 + }, + { + "epoch": 3.5413737155219036, + "grad_norm": 1.1086143255233765, + "learning_rate": 5.728443167909548e-06, + "loss": 1.876, + "mean_token_accuracy": 0.5757430195808411, + "num_tokens": 6695154623.0, + "step": 13096 + }, + { + "epoch": 3.5416441319632233, + "grad_norm": 0.9640588164329529, + "learning_rate": 5.727165492783144e-06, + "loss": 1.8952, + "mean_token_accuracy": 0.564534068107605, + "num_tokens": 6695678841.0, + "step": 13097 + }, + { + "epoch": 3.541914548404543, + "grad_norm": 0.9667514562606812, + "learning_rate": 5.725887979442807e-06, + "loss": 1.6972, + "mean_token_accuracy": 0.5867735147476196, + "num_tokens": 6696202945.0, + "step": 13098 + }, + { + "epoch": 3.5421849648458625, + "grad_norm": 1.0093547105789185, + "learning_rate": 5.72461062792774e-06, + "loss": 1.768, + "mean_token_accuracy": 0.5948636531829834, + "num_tokens": 6696727170.0, + "step": 13099 + }, + { + "epoch": 3.542455381287182, + "grad_norm": 1.105151653289795, + "learning_rate": 5.7233334382771325e-06, + "loss": 1.7621, + "mean_token_accuracy": 0.5933143496513367, + "num_tokens": 6697251442.0, + "step": 13100 + }, + { + "epoch": 3.542725797728502, + "grad_norm": 0.45148319005966187, + "learning_rate": 5.722056410530169e-06, + "loss": 1.1451, + "mean_token_accuracy": 0.6858856678009033, + "num_tokens": 6697775514.0, + "step": 13101 + }, + { + "epoch": 3.5429962141698215, + "grad_norm": 1.1337031126022339, + "learning_rate": 5.720779544726039e-06, + "loss": 1.889, + "mean_token_accuracy": 0.5712412595748901, + "num_tokens": 6698299641.0, + "step": 13102 + }, + { + "epoch": 3.543266630611141, + "grad_norm": 1.111572265625, + "learning_rate": 5.719502840903916e-06, + "loss": 1.7853, + "mean_token_accuracy": 0.6090573072433472, + "num_tokens": 6698764357.0, + "step": 13103 + }, + { + "epoch": 3.5435370470524608, + "grad_norm": 0.9799730181694031, + "learning_rate": 5.718226299102973e-06, + "loss": 1.9084, + "mean_token_accuracy": 0.55446457862854, + "num_tokens": 6699288462.0, + "step": 13104 + }, + { + "epoch": 3.5438074634937804, + "grad_norm": 0.7806255221366882, + "learning_rate": 5.716949919362384e-06, + "loss": 1.7388, + "mean_token_accuracy": 0.5997902750968933, + "num_tokens": 6699812741.0, + "step": 13105 + }, + { + "epoch": 3.5440778799351, + "grad_norm": 1.031692624092102, + "learning_rate": 5.715673701721309e-06, + "loss": 1.816, + "mean_token_accuracy": 0.5851250886917114, + "num_tokens": 6700282187.0, + "step": 13106 + }, + { + "epoch": 3.5443482963764197, + "grad_norm": 1.0948675870895386, + "learning_rate": 5.714397646218901e-06, + "loss": 1.8388, + "mean_token_accuracy": 0.5859696865081787, + "num_tokens": 6700806404.0, + "step": 13107 + }, + { + "epoch": 3.5446187128177393, + "grad_norm": 1.0833206176757812, + "learning_rate": 5.713121752894319e-06, + "loss": 1.8467, + "mean_token_accuracy": 0.5664889216423035, + "num_tokens": 6701305555.0, + "step": 13108 + }, + { + "epoch": 3.544889129259059, + "grad_norm": 1.1566953659057617, + "learning_rate": 5.711846021786711e-06, + "loss": 1.8195, + "mean_token_accuracy": 0.5806529521942139, + "num_tokens": 6701829765.0, + "step": 13109 + }, + { + "epoch": 3.5451595457003786, + "grad_norm": 1.1100695133209229, + "learning_rate": 5.710570452935216e-06, + "loss": 1.7804, + "mean_token_accuracy": 0.5882520079612732, + "num_tokens": 6702286151.0, + "step": 13110 + }, + { + "epoch": 3.5454299621416983, + "grad_norm": 0.9488251209259033, + "learning_rate": 5.709295046378975e-06, + "loss": 1.6956, + "mean_token_accuracy": 0.5903814435005188, + "num_tokens": 6702810393.0, + "step": 13111 + }, + { + "epoch": 3.545700378583018, + "grad_norm": 0.8389283418655396, + "learning_rate": 5.7080198021571245e-06, + "loss": 1.8614, + "mean_token_accuracy": 0.5721150636672974, + "num_tokens": 6703334621.0, + "step": 13112 + }, + { + "epoch": 3.5459707950243375, + "grad_norm": 0.8389859795570374, + "learning_rate": 5.706744720308784e-06, + "loss": 1.7569, + "mean_token_accuracy": 0.5897514820098877, + "num_tokens": 6703858849.0, + "step": 13113 + }, + { + "epoch": 3.546241211465657, + "grad_norm": 0.8337829113006592, + "learning_rate": 5.705469800873085e-06, + "loss": 1.8069, + "mean_token_accuracy": 0.5891844630241394, + "num_tokens": 6704383017.0, + "step": 13114 + }, + { + "epoch": 3.546511627906977, + "grad_norm": 0.939822256565094, + "learning_rate": 5.704195043889139e-06, + "loss": 1.9268, + "mean_token_accuracy": 0.5479317307472229, + "num_tokens": 6704907272.0, + "step": 13115 + }, + { + "epoch": 3.5467820443482965, + "grad_norm": 0.8896676898002625, + "learning_rate": 5.7029204493960675e-06, + "loss": 1.8464, + "mean_token_accuracy": 0.5518672466278076, + "num_tokens": 6705431494.0, + "step": 13116 + }, + { + "epoch": 3.547052460789616, + "grad_norm": 1.01007878780365, + "learning_rate": 5.701646017432971e-06, + "loss": 1.8794, + "mean_token_accuracy": 0.577953040599823, + "num_tokens": 6705939116.0, + "step": 13117 + }, + { + "epoch": 3.5473228772309358, + "grad_norm": 0.9064562916755676, + "learning_rate": 5.700371748038955e-06, + "loss": 1.7571, + "mean_token_accuracy": 0.589343786239624, + "num_tokens": 6706463268.0, + "step": 13118 + }, + { + "epoch": 3.5475932936722554, + "grad_norm": 0.8678848147392273, + "learning_rate": 5.699097641253124e-06, + "loss": 1.8215, + "mean_token_accuracy": 0.5786364078521729, + "num_tokens": 6706948707.0, + "step": 13119 + }, + { + "epoch": 3.547863710113575, + "grad_norm": 0.9203673601150513, + "learning_rate": 5.697823697114564e-06, + "loss": 1.7965, + "mean_token_accuracy": 0.5869745016098022, + "num_tokens": 6707472983.0, + "step": 13120 + }, + { + "epoch": 3.5481341265548947, + "grad_norm": 0.3407624065876007, + "learning_rate": 5.696549915662364e-06, + "loss": 1.0331, + "mean_token_accuracy": 0.7168724536895752, + "num_tokens": 6707997108.0, + "step": 13121 + }, + { + "epoch": 3.5484045429962143, + "grad_norm": 0.9709087610244751, + "learning_rate": 5.695276296935612e-06, + "loss": 1.8023, + "mean_token_accuracy": 0.5710055232048035, + "num_tokens": 6708521171.0, + "step": 13122 + }, + { + "epoch": 3.548674959437534, + "grad_norm": 1.0969518423080444, + "learning_rate": 5.694002840973384e-06, + "loss": 1.8609, + "mean_token_accuracy": 0.5758349895477295, + "num_tokens": 6709045389.0, + "step": 13123 + }, + { + "epoch": 3.548945375878853, + "grad_norm": 0.8142167925834656, + "learning_rate": 5.6927295478147486e-06, + "loss": 1.7107, + "mean_token_accuracy": 0.5862213373184204, + "num_tokens": 6709544361.0, + "step": 13124 + }, + { + "epoch": 3.5492157923201733, + "grad_norm": 0.8317261338233948, + "learning_rate": 5.691456417498781e-06, + "loss": 1.5644, + "mean_token_accuracy": 0.6155681610107422, + "num_tokens": 6710068452.0, + "step": 13125 + }, + { + "epoch": 3.5494862087614925, + "grad_norm": 1.1893221139907837, + "learning_rate": 5.690183450064544e-06, + "loss": 1.8883, + "mean_token_accuracy": 0.5762394070625305, + "num_tokens": 6710592605.0, + "step": 13126 + }, + { + "epoch": 3.5497566252028125, + "grad_norm": 1.0792258977890015, + "learning_rate": 5.688910645551089e-06, + "loss": 1.9596, + "mean_token_accuracy": 0.5595484972000122, + "num_tokens": 6711116870.0, + "step": 13127 + }, + { + "epoch": 3.5500270416441317, + "grad_norm": 0.8940653800964355, + "learning_rate": 5.687638003997477e-06, + "loss": 1.7065, + "mean_token_accuracy": 0.6030206680297852, + "num_tokens": 6711641035.0, + "step": 13128 + }, + { + "epoch": 3.550297458085452, + "grad_norm": 0.978069007396698, + "learning_rate": 5.686365525442754e-06, + "loss": 1.8529, + "mean_token_accuracy": 0.5659028887748718, + "num_tokens": 6712165181.0, + "step": 13129 + }, + { + "epoch": 3.550567874526771, + "grad_norm": 0.9434671401977539, + "learning_rate": 5.6850932099259606e-06, + "loss": 1.8123, + "mean_token_accuracy": 0.5767021179199219, + "num_tokens": 6712689351.0, + "step": 13130 + }, + { + "epoch": 3.550838290968091, + "grad_norm": 0.8694702386856079, + "learning_rate": 5.683821057486139e-06, + "loss": 1.7445, + "mean_token_accuracy": 0.5816171169281006, + "num_tokens": 6713213541.0, + "step": 13131 + }, + { + "epoch": 3.5511087074094103, + "grad_norm": 1.2172553539276123, + "learning_rate": 5.682549068162322e-06, + "loss": 1.8778, + "mean_token_accuracy": 0.5733376741409302, + "num_tokens": 6713689574.0, + "step": 13132 + }, + { + "epoch": 3.5513791238507304, + "grad_norm": 0.9891777634620667, + "learning_rate": 5.681277241993535e-06, + "loss": 1.8011, + "mean_token_accuracy": 0.5630885362625122, + "num_tokens": 6714213839.0, + "step": 13133 + }, + { + "epoch": 3.5516495402920496, + "grad_norm": 0.8333332538604736, + "learning_rate": 5.680005579018806e-06, + "loss": 1.8501, + "mean_token_accuracy": 0.5851210951805115, + "num_tokens": 6714738026.0, + "step": 13134 + }, + { + "epoch": 3.5519199567333692, + "grad_norm": 1.179933786392212, + "learning_rate": 5.6787340792771526e-06, + "loss": 1.8931, + "mean_token_accuracy": 0.5709351301193237, + "num_tokens": 6715251842.0, + "step": 13135 + }, + { + "epoch": 3.552190373174689, + "grad_norm": 1.0143635272979736, + "learning_rate": 5.677462742807582e-06, + "loss": 1.8259, + "mean_token_accuracy": 0.5900535583496094, + "num_tokens": 6715712487.0, + "step": 13136 + }, + { + "epoch": 3.5524607896160085, + "grad_norm": 1.1360304355621338, + "learning_rate": 5.6761915696491095e-06, + "loss": 1.7647, + "mean_token_accuracy": 0.5737630724906921, + "num_tokens": 6716205232.0, + "step": 13137 + }, + { + "epoch": 3.552731206057328, + "grad_norm": 1.0747277736663818, + "learning_rate": 5.674920559840737e-06, + "loss": 1.9295, + "mean_token_accuracy": 0.5479801297187805, + "num_tokens": 6716729451.0, + "step": 13138 + }, + { + "epoch": 3.553001622498648, + "grad_norm": 0.9198748469352722, + "learning_rate": 5.673649713421459e-06, + "loss": 1.7405, + "mean_token_accuracy": 0.5705029964447021, + "num_tokens": 6717253681.0, + "step": 13139 + }, + { + "epoch": 3.5532720389399675, + "grad_norm": 0.8899161219596863, + "learning_rate": 5.672379030430274e-06, + "loss": 1.8765, + "mean_token_accuracy": 0.5670827627182007, + "num_tokens": 6717777960.0, + "step": 13140 + }, + { + "epoch": 3.553542455381287, + "grad_norm": 0.358682781457901, + "learning_rate": 5.671108510906165e-06, + "loss": 1.1447, + "mean_token_accuracy": 0.6945980787277222, + "num_tokens": 6718302171.0, + "step": 13141 + }, + { + "epoch": 3.5538128718226067, + "grad_norm": 1.2628624439239502, + "learning_rate": 5.669838154888121e-06, + "loss": 1.8306, + "mean_token_accuracy": 0.5742689371109009, + "num_tokens": 6718826435.0, + "step": 13142 + }, + { + "epoch": 3.5540832882639264, + "grad_norm": 1.2539526224136353, + "learning_rate": 5.668567962415117e-06, + "loss": 1.8229, + "mean_token_accuracy": 0.5639635920524597, + "num_tokens": 6719350493.0, + "step": 13143 + }, + { + "epoch": 3.554353704705246, + "grad_norm": 0.9303102493286133, + "learning_rate": 5.667297933526126e-06, + "loss": 1.8344, + "mean_token_accuracy": 0.584761917591095, + "num_tokens": 6719874684.0, + "step": 13144 + }, + { + "epoch": 3.5546241211465657, + "grad_norm": 0.9826107025146484, + "learning_rate": 5.666028068260115e-06, + "loss": 1.7936, + "mean_token_accuracy": 0.5883647203445435, + "num_tokens": 6720341688.0, + "step": 13145 + }, + { + "epoch": 3.5548945375878853, + "grad_norm": 1.3458501100540161, + "learning_rate": 5.66475836665605e-06, + "loss": 1.6634, + "mean_token_accuracy": 0.6080806255340576, + "num_tokens": 6720860299.0, + "step": 13146 + }, + { + "epoch": 3.555164954029205, + "grad_norm": 1.1035903692245483, + "learning_rate": 5.6634888287528876e-06, + "loss": 1.8791, + "mean_token_accuracy": 0.5767550468444824, + "num_tokens": 6721330868.0, + "step": 13147 + }, + { + "epoch": 3.5554353704705246, + "grad_norm": 1.0724551677703857, + "learning_rate": 5.662219454589585e-06, + "loss": 1.7731, + "mean_token_accuracy": 0.5990163087844849, + "num_tokens": 6721832037.0, + "step": 13148 + }, + { + "epoch": 3.5557057869118442, + "grad_norm": 1.2134978771209717, + "learning_rate": 5.660950244205085e-06, + "loss": 1.9629, + "mean_token_accuracy": 0.5424216985702515, + "num_tokens": 6722356205.0, + "step": 13149 + }, + { + "epoch": 3.555976203353164, + "grad_norm": 1.1880967617034912, + "learning_rate": 5.659681197638331e-06, + "loss": 1.9093, + "mean_token_accuracy": 0.5684385895729065, + "num_tokens": 6722815693.0, + "step": 13150 + }, + { + "epoch": 3.5562466197944835, + "grad_norm": 1.0179357528686523, + "learning_rate": 5.658412314928265e-06, + "loss": 1.9367, + "mean_token_accuracy": 0.5630016326904297, + "num_tokens": 6723302014.0, + "step": 13151 + }, + { + "epoch": 3.556517036235803, + "grad_norm": 1.10154390335083, + "learning_rate": 5.657143596113821e-06, + "loss": 1.8129, + "mean_token_accuracy": 0.5820317268371582, + "num_tokens": 6723826075.0, + "step": 13152 + }, + { + "epoch": 3.556787452677123, + "grad_norm": 1.3227237462997437, + "learning_rate": 5.655875041233916e-06, + "loss": 1.8046, + "mean_token_accuracy": 0.5865176916122437, + "num_tokens": 6724350347.0, + "step": 13153 + }, + { + "epoch": 3.5570578691184425, + "grad_norm": 1.4248547554016113, + "learning_rate": 5.654606650327487e-06, + "loss": 1.8824, + "mean_token_accuracy": 0.5591886639595032, + "num_tokens": 6724854171.0, + "step": 13154 + }, + { + "epoch": 3.557328285559762, + "grad_norm": 0.8894221186637878, + "learning_rate": 5.653338423433444e-06, + "loss": 1.8704, + "mean_token_accuracy": 0.5556737184524536, + "num_tokens": 6725378393.0, + "step": 13155 + }, + { + "epoch": 3.5575987020010817, + "grad_norm": 1.0080958604812622, + "learning_rate": 5.6520703605907e-06, + "loss": 1.7934, + "mean_token_accuracy": 0.560625433921814, + "num_tokens": 6725855486.0, + "step": 13156 + }, + { + "epoch": 3.5578691184424014, + "grad_norm": 1.0865472555160522, + "learning_rate": 5.650802461838166e-06, + "loss": 1.9353, + "mean_token_accuracy": 0.5719700455665588, + "num_tokens": 6726323204.0, + "step": 13157 + }, + { + "epoch": 3.558139534883721, + "grad_norm": 0.9486451148986816, + "learning_rate": 5.649534727214741e-06, + "loss": 1.8699, + "mean_token_accuracy": 0.5728062391281128, + "num_tokens": 6726787810.0, + "step": 13158 + }, + { + "epoch": 3.5584099513250407, + "grad_norm": 0.876427173614502, + "learning_rate": 5.648267156759329e-06, + "loss": 1.7856, + "mean_token_accuracy": 0.5873755216598511, + "num_tokens": 6727312066.0, + "step": 13159 + }, + { + "epoch": 3.5586803677663603, + "grad_norm": 1.0202769041061401, + "learning_rate": 5.6469997505108155e-06, + "loss": 1.7842, + "mean_token_accuracy": 0.5839327573776245, + "num_tokens": 6727792849.0, + "step": 13160 + }, + { + "epoch": 3.55895078420768, + "grad_norm": 0.3930964767932892, + "learning_rate": 5.645732508508095e-06, + "loss": 1.068, + "mean_token_accuracy": 0.7193616628646851, + "num_tokens": 6728316939.0, + "step": 13161 + }, + { + "epoch": 3.5592212006489996, + "grad_norm": 1.252325415611267, + "learning_rate": 5.644465430790045e-06, + "loss": 1.8933, + "mean_token_accuracy": 0.5817428827285767, + "num_tokens": 6728759344.0, + "step": 13162 + }, + { + "epoch": 3.5594916170903192, + "grad_norm": 1.1163681745529175, + "learning_rate": 5.643198517395546e-06, + "loss": 1.8284, + "mean_token_accuracy": 0.5726838707923889, + "num_tokens": 6729254642.0, + "step": 13163 + }, + { + "epoch": 3.559762033531639, + "grad_norm": 1.18350350856781, + "learning_rate": 5.641931768363469e-06, + "loss": 1.9574, + "mean_token_accuracy": 0.5505191087722778, + "num_tokens": 6729778917.0, + "step": 13164 + }, + { + "epoch": 3.560032449972958, + "grad_norm": 1.204132080078125, + "learning_rate": 5.640665183732683e-06, + "loss": 1.7965, + "mean_token_accuracy": 0.5932393074035645, + "num_tokens": 6730303108.0, + "step": 13165 + }, + { + "epoch": 3.560302866414278, + "grad_norm": 1.1354615688323975, + "learning_rate": 5.639398763542052e-06, + "loss": 1.8739, + "mean_token_accuracy": 0.5580383539199829, + "num_tokens": 6730827387.0, + "step": 13166 + }, + { + "epoch": 3.5605732828555974, + "grad_norm": 0.8802880644798279, + "learning_rate": 5.6381325078304274e-06, + "loss": 1.9236, + "mean_token_accuracy": 0.5588470697402954, + "num_tokens": 6731351519.0, + "step": 13167 + }, + { + "epoch": 3.5608436992969175, + "grad_norm": 1.1310372352600098, + "learning_rate": 5.636866416636669e-06, + "loss": 1.8152, + "mean_token_accuracy": 0.5674906969070435, + "num_tokens": 6731875595.0, + "step": 13168 + }, + { + "epoch": 3.5611141157382367, + "grad_norm": 1.218532681465149, + "learning_rate": 5.635600489999622e-06, + "loss": 1.8362, + "mean_token_accuracy": 0.5748946666717529, + "num_tokens": 6732369542.0, + "step": 13169 + }, + { + "epoch": 3.5613845321795568, + "grad_norm": 0.9137813448905945, + "learning_rate": 5.634334727958123e-06, + "loss": 1.8134, + "mean_token_accuracy": 0.5693442821502686, + "num_tokens": 6732870966.0, + "step": 13170 + }, + { + "epoch": 3.561654948620876, + "grad_norm": 1.1209181547164917, + "learning_rate": 5.633069130551017e-06, + "loss": 1.8635, + "mean_token_accuracy": 0.5760577917098999, + "num_tokens": 6733395232.0, + "step": 13171 + }, + { + "epoch": 3.561925365062196, + "grad_norm": 1.0753096342086792, + "learning_rate": 5.631803697817133e-06, + "loss": 1.8841, + "mean_token_accuracy": 0.5848932266235352, + "num_tokens": 6733904287.0, + "step": 13172 + }, + { + "epoch": 3.5621957815035152, + "grad_norm": 1.0153571367263794, + "learning_rate": 5.630538429795297e-06, + "loss": 1.9602, + "mean_token_accuracy": 0.5354214906692505, + "num_tokens": 6734428460.0, + "step": 13173 + }, + { + "epoch": 3.5624661979448353, + "grad_norm": 1.123401165008545, + "learning_rate": 5.629273326524332e-06, + "loss": 1.7943, + "mean_token_accuracy": 0.5810340642929077, + "num_tokens": 6734952736.0, + "step": 13174 + }, + { + "epoch": 3.5627366143861545, + "grad_norm": 0.8466899394989014, + "learning_rate": 5.628008388043058e-06, + "loss": 1.8055, + "mean_token_accuracy": 0.5763462781906128, + "num_tokens": 6735476993.0, + "step": 13175 + }, + { + "epoch": 3.563007030827474, + "grad_norm": 0.9954919815063477, + "learning_rate": 5.626743614390282e-06, + "loss": 1.8459, + "mean_token_accuracy": 0.597476601600647, + "num_tokens": 6735936848.0, + "step": 13176 + }, + { + "epoch": 3.563277447268794, + "grad_norm": 1.195015788078308, + "learning_rate": 5.625479005604816e-06, + "loss": 1.9176, + "mean_token_accuracy": 0.5520017743110657, + "num_tokens": 6736461090.0, + "step": 13177 + }, + { + "epoch": 3.5635478637101135, + "grad_norm": 1.0905646085739136, + "learning_rate": 5.624214561725461e-06, + "loss": 1.8424, + "mean_token_accuracy": 0.5844473838806152, + "num_tokens": 6736924891.0, + "step": 13178 + }, + { + "epoch": 3.563818280151433, + "grad_norm": 0.9951016306877136, + "learning_rate": 5.622950282791007e-06, + "loss": 1.8439, + "mean_token_accuracy": 0.5602327585220337, + "num_tokens": 6737449124.0, + "step": 13179 + }, + { + "epoch": 3.5640886965927527, + "grad_norm": 1.029337763786316, + "learning_rate": 5.6216861688402555e-06, + "loss": 1.9311, + "mean_token_accuracy": 0.5678889751434326, + "num_tokens": 6737973409.0, + "step": 13180 + }, + { + "epoch": 3.5643591130340724, + "grad_norm": 0.36264556646347046, + "learning_rate": 5.620422219911989e-06, + "loss": 1.0113, + "mean_token_accuracy": 0.7333086729049683, + "num_tokens": 6738497544.0, + "step": 13181 + }, + { + "epoch": 3.564629529475392, + "grad_norm": 1.0451349020004272, + "learning_rate": 5.619158436044987e-06, + "loss": 1.8683, + "mean_token_accuracy": 0.584366500377655, + "num_tokens": 6739021688.0, + "step": 13182 + }, + { + "epoch": 3.5648999459167117, + "grad_norm": 1.0904797315597534, + "learning_rate": 5.617894817278029e-06, + "loss": 1.8662, + "mean_token_accuracy": 0.5538111329078674, + "num_tokens": 6739545863.0, + "step": 13183 + }, + { + "epoch": 3.5651703623580313, + "grad_norm": 0.9604141116142273, + "learning_rate": 5.616631363649888e-06, + "loss": 1.7259, + "mean_token_accuracy": 0.5926237106323242, + "num_tokens": 6740070092.0, + "step": 13184 + }, + { + "epoch": 3.565440778799351, + "grad_norm": 0.9715603590011597, + "learning_rate": 5.6153680751993255e-06, + "loss": 1.778, + "mean_token_accuracy": 0.5964140892028809, + "num_tokens": 6740534165.0, + "step": 13185 + }, + { + "epoch": 3.5657111952406706, + "grad_norm": 1.0278308391571045, + "learning_rate": 5.614104951965107e-06, + "loss": 1.7752, + "mean_token_accuracy": 0.5868839025497437, + "num_tokens": 6741058316.0, + "step": 13186 + }, + { + "epoch": 3.5659816116819902, + "grad_norm": 0.9152944087982178, + "learning_rate": 5.6128419939859845e-06, + "loss": 1.9025, + "mean_token_accuracy": 0.5449856519699097, + "num_tokens": 6741582500.0, + "step": 13187 + }, + { + "epoch": 3.56625202812331, + "grad_norm": 0.8864043354988098, + "learning_rate": 5.611579201300712e-06, + "loss": 1.8293, + "mean_token_accuracy": 0.5849785208702087, + "num_tokens": 6742106668.0, + "step": 13188 + }, + { + "epoch": 3.5665224445646295, + "grad_norm": 1.0118521451950073, + "learning_rate": 5.610316573948039e-06, + "loss": 1.8746, + "mean_token_accuracy": 0.5628396272659302, + "num_tokens": 6742630876.0, + "step": 13189 + }, + { + "epoch": 3.566792861005949, + "grad_norm": 0.8513004779815674, + "learning_rate": 5.609054111966701e-06, + "loss": 1.7767, + "mean_token_accuracy": 0.5857358574867249, + "num_tokens": 6743143978.0, + "step": 13190 + }, + { + "epoch": 3.567063277447269, + "grad_norm": 1.08806574344635, + "learning_rate": 5.60779181539544e-06, + "loss": 1.9316, + "mean_token_accuracy": 0.5611810088157654, + "num_tokens": 6743643086.0, + "step": 13191 + }, + { + "epoch": 3.5673336938885885, + "grad_norm": 1.0282195806503296, + "learning_rate": 5.6065296842729844e-06, + "loss": 1.9046, + "mean_token_accuracy": 0.5583250522613525, + "num_tokens": 6744167305.0, + "step": 13192 + }, + { + "epoch": 3.567604110329908, + "grad_norm": 0.9442831873893738, + "learning_rate": 5.605267718638054e-06, + "loss": 1.7233, + "mean_token_accuracy": 0.599226713180542, + "num_tokens": 6744691584.0, + "step": 13193 + }, + { + "epoch": 3.5678745267712277, + "grad_norm": 0.9600860476493835, + "learning_rate": 5.604005918529381e-06, + "loss": 1.8362, + "mean_token_accuracy": 0.5704421401023865, + "num_tokens": 6745215763.0, + "step": 13194 + }, + { + "epoch": 3.5681449432125474, + "grad_norm": 1.1006956100463867, + "learning_rate": 5.602744283985671e-06, + "loss": 1.8369, + "mean_token_accuracy": 0.6056142449378967, + "num_tokens": 6745659734.0, + "step": 13195 + }, + { + "epoch": 3.568415359653867, + "grad_norm": 0.9430850744247437, + "learning_rate": 5.601482815045638e-06, + "loss": 1.7524, + "mean_token_accuracy": 0.5924761295318604, + "num_tokens": 6746183973.0, + "step": 13196 + }, + { + "epoch": 3.5686857760951867, + "grad_norm": 1.0676060914993286, + "learning_rate": 5.600221511747992e-06, + "loss": 1.8856, + "mean_token_accuracy": 0.562747597694397, + "num_tokens": 6746649875.0, + "step": 13197 + }, + { + "epoch": 3.5689561925365063, + "grad_norm": 0.9721649885177612, + "learning_rate": 5.598960374131427e-06, + "loss": 1.7953, + "mean_token_accuracy": 0.5674325227737427, + "num_tokens": 6747174128.0, + "step": 13198 + }, + { + "epoch": 3.569226608977826, + "grad_norm": 1.0731005668640137, + "learning_rate": 5.59769940223464e-06, + "loss": 1.8345, + "mean_token_accuracy": 0.573312520980835, + "num_tokens": 6747681686.0, + "step": 13199 + }, + { + "epoch": 3.5694970254191456, + "grad_norm": 1.0173964500427246, + "learning_rate": 5.596438596096326e-06, + "loss": 1.8983, + "mean_token_accuracy": 0.575821042060852, + "num_tokens": 6748205673.0, + "step": 13200 + }, + { + "epoch": 3.5697674418604652, + "grad_norm": 0.359904021024704, + "learning_rate": 5.595177955755162e-06, + "loss": 1.1667, + "mean_token_accuracy": 0.6917765140533447, + "num_tokens": 6748729808.0, + "step": 13201 + }, + { + "epoch": 3.570037858301785, + "grad_norm": 1.10433030128479, + "learning_rate": 5.593917481249837e-06, + "loss": 1.8056, + "mean_token_accuracy": 0.5761616230010986, + "num_tokens": 6749228134.0, + "step": 13202 + }, + { + "epoch": 3.5703082747431045, + "grad_norm": 1.0954484939575195, + "learning_rate": 5.592657172619018e-06, + "loss": 1.8093, + "mean_token_accuracy": 0.5932121872901917, + "num_tokens": 6749678260.0, + "step": 13203 + }, + { + "epoch": 3.570578691184424, + "grad_norm": 1.0062594413757324, + "learning_rate": 5.591397029901384e-06, + "loss": 1.8402, + "mean_token_accuracy": 0.5840487480163574, + "num_tokens": 6750173285.0, + "step": 13204 + }, + { + "epoch": 3.570849107625744, + "grad_norm": 0.999824047088623, + "learning_rate": 5.590137053135589e-06, + "loss": 1.76, + "mean_token_accuracy": 0.5878422260284424, + "num_tokens": 6750697559.0, + "step": 13205 + }, + { + "epoch": 3.571119524067063, + "grad_norm": 0.9156564474105835, + "learning_rate": 5.5888772423603035e-06, + "loss": 1.8807, + "mean_token_accuracy": 0.5692746639251709, + "num_tokens": 6751221797.0, + "step": 13206 + }, + { + "epoch": 3.571389940508383, + "grad_norm": 1.6561753749847412, + "learning_rate": 5.587617597614177e-06, + "loss": 1.5404, + "mean_token_accuracy": 0.6198151111602783, + "num_tokens": 6751745978.0, + "step": 13207 + }, + { + "epoch": 3.5716603569497023, + "grad_norm": 1.0309983491897583, + "learning_rate": 5.5863581189358565e-06, + "loss": 1.7726, + "mean_token_accuracy": 0.5731748342514038, + "num_tokens": 6752270167.0, + "step": 13208 + }, + { + "epoch": 3.5719307733910224, + "grad_norm": 1.118079662322998, + "learning_rate": 5.5850988063639915e-06, + "loss": 1.7954, + "mean_token_accuracy": 0.5890711545944214, + "num_tokens": 6752777662.0, + "step": 13209 + }, + { + "epoch": 3.5722011898323416, + "grad_norm": 1.110238790512085, + "learning_rate": 5.583839659937217e-06, + "loss": 1.8869, + "mean_token_accuracy": 0.5631115436553955, + "num_tokens": 6753299592.0, + "step": 13210 + }, + { + "epoch": 3.5724716062736617, + "grad_norm": 0.87257319688797, + "learning_rate": 5.582580679694172e-06, + "loss": 1.8327, + "mean_token_accuracy": 0.5778327584266663, + "num_tokens": 6753823865.0, + "step": 13211 + }, + { + "epoch": 3.572742022714981, + "grad_norm": 0.9581620097160339, + "learning_rate": 5.581321865673484e-06, + "loss": 1.9132, + "mean_token_accuracy": 0.5540763139724731, + "num_tokens": 6754348035.0, + "step": 13212 + }, + { + "epoch": 3.573012439156301, + "grad_norm": 1.0325418710708618, + "learning_rate": 5.580063217913772e-06, + "loss": 1.6447, + "mean_token_accuracy": 0.5931473970413208, + "num_tokens": 6754821019.0, + "step": 13213 + }, + { + "epoch": 3.57328285559762, + "grad_norm": 0.9507797956466675, + "learning_rate": 5.578804736453665e-06, + "loss": 1.7979, + "mean_token_accuracy": 0.5883013010025024, + "num_tokens": 6755345177.0, + "step": 13214 + }, + { + "epoch": 3.5735532720389402, + "grad_norm": 0.7716845870018005, + "learning_rate": 5.577546421331768e-06, + "loss": 1.8088, + "mean_token_accuracy": 0.5730504989624023, + "num_tokens": 6755869235.0, + "step": 13215 + }, + { + "epoch": 3.5738236884802594, + "grad_norm": 0.9783807992935181, + "learning_rate": 5.576288272586694e-06, + "loss": 1.8799, + "mean_token_accuracy": 0.5705251693725586, + "num_tokens": 6756393332.0, + "step": 13216 + }, + { + "epoch": 3.574094104921579, + "grad_norm": 1.101300835609436, + "learning_rate": 5.575030290257044e-06, + "loss": 1.8445, + "mean_token_accuracy": 0.5848129987716675, + "num_tokens": 6756917574.0, + "step": 13217 + }, + { + "epoch": 3.5743645213628987, + "grad_norm": 0.912635087966919, + "learning_rate": 5.573772474381421e-06, + "loss": 1.8139, + "mean_token_accuracy": 0.5762344002723694, + "num_tokens": 6757441839.0, + "step": 13218 + }, + { + "epoch": 3.5746349378042184, + "grad_norm": 1.1133440732955933, + "learning_rate": 5.572514824998414e-06, + "loss": 1.7657, + "mean_token_accuracy": 0.5806224346160889, + "num_tokens": 6757966061.0, + "step": 13219 + }, + { + "epoch": 3.574905354245538, + "grad_norm": 0.9370958209037781, + "learning_rate": 5.571257342146616e-06, + "loss": 1.7564, + "mean_token_accuracy": 0.6014721393585205, + "num_tokens": 6758382786.0, + "step": 13220 + }, + { + "epoch": 3.5751757706868577, + "grad_norm": 0.37351781129837036, + "learning_rate": 5.5700000258646085e-06, + "loss": 1.079, + "mean_token_accuracy": 0.7133539319038391, + "num_tokens": 6758906980.0, + "step": 13221 + }, + { + "epoch": 3.5754461871281773, + "grad_norm": 1.2470234632492065, + "learning_rate": 5.568742876190964e-06, + "loss": 1.9239, + "mean_token_accuracy": 0.5769495964050293, + "num_tokens": 6759431232.0, + "step": 13222 + }, + { + "epoch": 3.575716603569497, + "grad_norm": 0.9940400719642639, + "learning_rate": 5.567485893164265e-06, + "loss": 1.5773, + "mean_token_accuracy": 0.6292933225631714, + "num_tokens": 6759873377.0, + "step": 13223 + }, + { + "epoch": 3.5759870200108166, + "grad_norm": 1.043290615081787, + "learning_rate": 5.566229076823076e-06, + "loss": 1.8552, + "mean_token_accuracy": 0.5790482759475708, + "num_tokens": 6760394997.0, + "step": 13224 + }, + { + "epoch": 3.5762574364521362, + "grad_norm": 1.0702403783798218, + "learning_rate": 5.564972427205951e-06, + "loss": 1.8261, + "mean_token_accuracy": 0.5854941010475159, + "num_tokens": 6760919167.0, + "step": 13225 + }, + { + "epoch": 3.576527852893456, + "grad_norm": 1.0228348970413208, + "learning_rate": 5.563715944351462e-06, + "loss": 1.7735, + "mean_token_accuracy": 0.5935931205749512, + "num_tokens": 6761387807.0, + "step": 13226 + }, + { + "epoch": 3.5767982693347755, + "grad_norm": 0.8678528070449829, + "learning_rate": 5.562459628298153e-06, + "loss": 1.7693, + "mean_token_accuracy": 0.59824538230896, + "num_tokens": 6761911843.0, + "step": 13227 + }, + { + "epoch": 3.577068685776095, + "grad_norm": 1.2935569286346436, + "learning_rate": 5.561203479084572e-06, + "loss": 1.8872, + "mean_token_accuracy": 0.5687286853790283, + "num_tokens": 6762435975.0, + "step": 13228 + }, + { + "epoch": 3.577339102217415, + "grad_norm": 1.2432676553726196, + "learning_rate": 5.559947496749265e-06, + "loss": 2.0577, + "mean_token_accuracy": 0.5437743663787842, + "num_tokens": 6762921572.0, + "step": 13229 + }, + { + "epoch": 3.5776095186587344, + "grad_norm": 0.9255717992782593, + "learning_rate": 5.5586916813307634e-06, + "loss": 1.8531, + "mean_token_accuracy": 0.5919590592384338, + "num_tokens": 6763382287.0, + "step": 13230 + }, + { + "epoch": 3.577879935100054, + "grad_norm": 1.0359433889389038, + "learning_rate": 5.557436032867602e-06, + "loss": 1.8399, + "mean_token_accuracy": 0.5717638731002808, + "num_tokens": 6763881152.0, + "step": 13231 + }, + { + "epoch": 3.5781503515413737, + "grad_norm": 1.4567697048187256, + "learning_rate": 5.556180551398314e-06, + "loss": 1.7835, + "mean_token_accuracy": 0.5640168786048889, + "num_tokens": 6764405407.0, + "step": 13232 + }, + { + "epoch": 3.5784207679826934, + "grad_norm": 1.3443025350570679, + "learning_rate": 5.5549252369614125e-06, + "loss": 1.8254, + "mean_token_accuracy": 0.5800572633743286, + "num_tokens": 6764886155.0, + "step": 13233 + }, + { + "epoch": 3.578691184424013, + "grad_norm": 1.0392299890518188, + "learning_rate": 5.55367008959542e-06, + "loss": 1.8098, + "mean_token_accuracy": 0.5871775150299072, + "num_tokens": 6765373111.0, + "step": 13234 + }, + { + "epoch": 3.5789616008653327, + "grad_norm": 0.9820590019226074, + "learning_rate": 5.552415109338845e-06, + "loss": 1.9253, + "mean_token_accuracy": 0.5592383146286011, + "num_tokens": 6765897240.0, + "step": 13235 + }, + { + "epoch": 3.5792320173066523, + "grad_norm": 1.0035291910171509, + "learning_rate": 5.551160296230193e-06, + "loss": 1.8091, + "mean_token_accuracy": 0.5822749137878418, + "num_tokens": 6766386662.0, + "step": 13236 + }, + { + "epoch": 3.579502433747972, + "grad_norm": 0.9435518980026245, + "learning_rate": 5.549905650307971e-06, + "loss": 1.7861, + "mean_token_accuracy": 0.590643048286438, + "num_tokens": 6766910747.0, + "step": 13237 + }, + { + "epoch": 3.5797728501892916, + "grad_norm": 0.9603638052940369, + "learning_rate": 5.548651171610669e-06, + "loss": 1.8927, + "mean_token_accuracy": 0.552958607673645, + "num_tokens": 6767434975.0, + "step": 13238 + }, + { + "epoch": 3.5800432666306112, + "grad_norm": 1.070791482925415, + "learning_rate": 5.547396860176779e-06, + "loss": 1.9207, + "mean_token_accuracy": 0.5680587291717529, + "num_tokens": 6767909045.0, + "step": 13239 + }, + { + "epoch": 3.580313683071931, + "grad_norm": 0.9671128988265991, + "learning_rate": 5.546142716044791e-06, + "loss": 1.9352, + "mean_token_accuracy": 0.5696901679039001, + "num_tokens": 6768372320.0, + "step": 13240 + }, + { + "epoch": 3.5805840995132505, + "grad_norm": 0.39699703454971313, + "learning_rate": 5.544888739253182e-06, + "loss": 1.0593, + "mean_token_accuracy": 0.7234600186347961, + "num_tokens": 6768877426.0, + "step": 13241 + }, + { + "epoch": 3.58085451595457, + "grad_norm": 1.1422678232192993, + "learning_rate": 5.543634929840428e-06, + "loss": 1.8472, + "mean_token_accuracy": 0.5637620687484741, + "num_tokens": 6769401553.0, + "step": 13242 + }, + { + "epoch": 3.58112493239589, + "grad_norm": 1.3329575061798096, + "learning_rate": 5.542381287845003e-06, + "loss": 1.9403, + "mean_token_accuracy": 0.5606253743171692, + "num_tokens": 6769874068.0, + "step": 13243 + }, + { + "epoch": 3.5813953488372094, + "grad_norm": 1.1135610342025757, + "learning_rate": 5.541127813305365e-06, + "loss": 1.8549, + "mean_token_accuracy": 0.5822696685791016, + "num_tokens": 6770329061.0, + "step": 13244 + }, + { + "epoch": 3.581665765278529, + "grad_norm": 0.8914870023727417, + "learning_rate": 5.539874506259981e-06, + "loss": 1.8846, + "mean_token_accuracy": 0.5627370476722717, + "num_tokens": 6770853295.0, + "step": 13245 + }, + { + "epoch": 3.5819361817198487, + "grad_norm": 1.120456576347351, + "learning_rate": 5.538621366747306e-06, + "loss": 1.7806, + "mean_token_accuracy": 0.5860466957092285, + "num_tokens": 6771377476.0, + "step": 13246 + }, + { + "epoch": 3.582206598161168, + "grad_norm": 1.0673326253890991, + "learning_rate": 5.537368394805791e-06, + "loss": 1.7727, + "mean_token_accuracy": 0.590590238571167, + "num_tokens": 6771870884.0, + "step": 13247 + }, + { + "epoch": 3.582477014602488, + "grad_norm": 0.9829912185668945, + "learning_rate": 5.536115590473873e-06, + "loss": 1.8729, + "mean_token_accuracy": 0.5824183821678162, + "num_tokens": 6772395158.0, + "step": 13248 + }, + { + "epoch": 3.582747431043807, + "grad_norm": 0.8583850264549255, + "learning_rate": 5.534862953790001e-06, + "loss": 1.8601, + "mean_token_accuracy": 0.5907856225967407, + "num_tokens": 6772858783.0, + "step": 13249 + }, + { + "epoch": 3.5830178474851273, + "grad_norm": 0.9871848225593567, + "learning_rate": 5.5336104847926045e-06, + "loss": 1.8002, + "mean_token_accuracy": 0.5949260592460632, + "num_tokens": 6773382980.0, + "step": 13250 + }, + { + "epoch": 3.5832882639264465, + "grad_norm": 1.2751567363739014, + "learning_rate": 5.53235818352011e-06, + "loss": 1.7517, + "mean_token_accuracy": 0.6157523393630981, + "num_tokens": 6773843183.0, + "step": 13251 + }, + { + "epoch": 3.5835586803677666, + "grad_norm": 1.1338449716567993, + "learning_rate": 5.5311060500109504e-06, + "loss": 1.8226, + "mean_token_accuracy": 0.5741415619850159, + "num_tokens": 6774367408.0, + "step": 13252 + }, + { + "epoch": 3.583829096809086, + "grad_norm": 0.9712970852851868, + "learning_rate": 5.529854084303541e-06, + "loss": 1.7577, + "mean_token_accuracy": 0.5888038873672485, + "num_tokens": 6774853374.0, + "step": 13253 + }, + { + "epoch": 3.584099513250406, + "grad_norm": 0.9462721943855286, + "learning_rate": 5.52860228643629e-06, + "loss": 1.826, + "mean_token_accuracy": 0.5808100700378418, + "num_tokens": 6775377653.0, + "step": 13254 + }, + { + "epoch": 3.584369929691725, + "grad_norm": 0.9627392292022705, + "learning_rate": 5.527350656447614e-06, + "loss": 1.8545, + "mean_token_accuracy": 0.5592666864395142, + "num_tokens": 6775901797.0, + "step": 13255 + }, + { + "epoch": 3.584640346133045, + "grad_norm": 0.9825900793075562, + "learning_rate": 5.5260991943759155e-06, + "loss": 1.7319, + "mean_token_accuracy": 0.591940701007843, + "num_tokens": 6776425747.0, + "step": 13256 + }, + { + "epoch": 3.5849107625743644, + "grad_norm": 1.0574836730957031, + "learning_rate": 5.524847900259588e-06, + "loss": 1.9245, + "mean_token_accuracy": 0.5631812214851379, + "num_tokens": 6776949874.0, + "step": 13257 + }, + { + "epoch": 3.585181179015684, + "grad_norm": 0.8797606825828552, + "learning_rate": 5.523596774137027e-06, + "loss": 1.7542, + "mean_token_accuracy": 0.5988049507141113, + "num_tokens": 6777474136.0, + "step": 13258 + }, + { + "epoch": 3.5854515954570036, + "grad_norm": 1.0433629751205444, + "learning_rate": 5.522345816046627e-06, + "loss": 1.7583, + "mean_token_accuracy": 0.5981356501579285, + "num_tokens": 6777998356.0, + "step": 13259 + }, + { + "epoch": 3.5857220118983233, + "grad_norm": 0.9161161780357361, + "learning_rate": 5.521095026026762e-06, + "loss": 1.856, + "mean_token_accuracy": 0.5692124366760254, + "num_tokens": 6778522637.0, + "step": 13260 + }, + { + "epoch": 3.585992428339643, + "grad_norm": 0.37327146530151367, + "learning_rate": 5.519844404115816e-06, + "loss": 1.0329, + "mean_token_accuracy": 0.7093861103057861, + "num_tokens": 6779046879.0, + "step": 13261 + }, + { + "epoch": 3.5862628447809626, + "grad_norm": 1.111360788345337, + "learning_rate": 5.518593950352158e-06, + "loss": 1.7746, + "mean_token_accuracy": 0.5844938158988953, + "num_tokens": 6779571033.0, + "step": 13262 + }, + { + "epoch": 3.586533261222282, + "grad_norm": 1.1774858236312866, + "learning_rate": 5.5173436647741615e-06, + "loss": 1.8022, + "mean_token_accuracy": 0.5813588500022888, + "num_tokens": 6780095305.0, + "step": 13263 + }, + { + "epoch": 3.586803677663602, + "grad_norm": 1.0000121593475342, + "learning_rate": 5.5160935474201826e-06, + "loss": 1.8758, + "mean_token_accuracy": 0.5565711855888367, + "num_tokens": 6780619518.0, + "step": 13264 + }, + { + "epoch": 3.5870740941049215, + "grad_norm": 0.8009787201881409, + "learning_rate": 5.5148435983285786e-06, + "loss": 1.8353, + "mean_token_accuracy": 0.5749210119247437, + "num_tokens": 6781143792.0, + "step": 13265 + }, + { + "epoch": 3.587344510546241, + "grad_norm": 0.9941043853759766, + "learning_rate": 5.513593817537708e-06, + "loss": 1.7451, + "mean_token_accuracy": 0.5933113694190979, + "num_tokens": 6781621668.0, + "step": 13266 + }, + { + "epoch": 3.587614926987561, + "grad_norm": 0.9574931263923645, + "learning_rate": 5.512344205085911e-06, + "loss": 1.8398, + "mean_token_accuracy": 0.580086886882782, + "num_tokens": 6782145863.0, + "step": 13267 + }, + { + "epoch": 3.5878853434288804, + "grad_norm": 1.0355607271194458, + "learning_rate": 5.51109476101153e-06, + "loss": 1.9671, + "mean_token_accuracy": 0.5404533743858337, + "num_tokens": 6782670140.0, + "step": 13268 + }, + { + "epoch": 3.5881557598702, + "grad_norm": 0.9947044849395752, + "learning_rate": 5.509845485352907e-06, + "loss": 1.8856, + "mean_token_accuracy": 0.5710676908493042, + "num_tokens": 6783194370.0, + "step": 13269 + }, + { + "epoch": 3.5884261763115197, + "grad_norm": 1.2585300207138062, + "learning_rate": 5.508596378148371e-06, + "loss": 1.9336, + "mean_token_accuracy": 0.5380138158798218, + "num_tokens": 6783718521.0, + "step": 13270 + }, + { + "epoch": 3.5886965927528394, + "grad_norm": 1.03528892993927, + "learning_rate": 5.507347439436244e-06, + "loss": 1.9468, + "mean_token_accuracy": 0.5665429830551147, + "num_tokens": 6784242685.0, + "step": 13271 + }, + { + "epoch": 3.588967009194159, + "grad_norm": 1.0529396533966064, + "learning_rate": 5.5060986692548526e-06, + "loss": 1.8095, + "mean_token_accuracy": 0.5817147493362427, + "num_tokens": 6784766901.0, + "step": 13272 + }, + { + "epoch": 3.5892374256354787, + "grad_norm": 1.0425574779510498, + "learning_rate": 5.5048500676425065e-06, + "loss": 1.836, + "mean_token_accuracy": 0.5800996422767639, + "num_tokens": 6785290915.0, + "step": 13273 + }, + { + "epoch": 3.5895078420767983, + "grad_norm": 1.2295498847961426, + "learning_rate": 5.503601634637522e-06, + "loss": 1.7296, + "mean_token_accuracy": 0.5916198492050171, + "num_tokens": 6785802978.0, + "step": 13274 + }, + { + "epoch": 3.589778258518118, + "grad_norm": 1.03544282913208, + "learning_rate": 5.502353370278205e-06, + "loss": 1.8444, + "mean_token_accuracy": 0.5822567939758301, + "num_tokens": 6786327258.0, + "step": 13275 + }, + { + "epoch": 3.5900486749594376, + "grad_norm": 1.1504673957824707, + "learning_rate": 5.501105274602856e-06, + "loss": 2.0202, + "mean_token_accuracy": 0.534378170967102, + "num_tokens": 6786851375.0, + "step": 13276 + }, + { + "epoch": 3.5903190914007572, + "grad_norm": 0.9517377614974976, + "learning_rate": 5.499857347649764e-06, + "loss": 1.7826, + "mean_token_accuracy": 0.585495114326477, + "num_tokens": 6787316816.0, + "step": 13277 + }, + { + "epoch": 3.590589507842077, + "grad_norm": 1.0988155603408813, + "learning_rate": 5.498609589457227e-06, + "loss": 1.9334, + "mean_token_accuracy": 0.5567730069160461, + "num_tokens": 6787840987.0, + "step": 13278 + }, + { + "epoch": 3.5908599242833965, + "grad_norm": 0.8969082832336426, + "learning_rate": 5.497362000063526e-06, + "loss": 1.7798, + "mean_token_accuracy": 0.5837430953979492, + "num_tokens": 6788365152.0, + "step": 13279 + }, + { + "epoch": 3.591130340724716, + "grad_norm": 0.896094024181366, + "learning_rate": 5.496114579506937e-06, + "loss": 1.8402, + "mean_token_accuracy": 0.5792521238327026, + "num_tokens": 6788837288.0, + "step": 13280 + }, + { + "epoch": 3.591400757166036, + "grad_norm": 0.3556397557258606, + "learning_rate": 5.494867327825744e-06, + "loss": 1.0826, + "mean_token_accuracy": 0.7172052264213562, + "num_tokens": 6789361401.0, + "step": 13281 + }, + { + "epoch": 3.5916711736073554, + "grad_norm": 1.374679446220398, + "learning_rate": 5.493620245058206e-06, + "loss": 1.6805, + "mean_token_accuracy": 0.6182070970535278, + "num_tokens": 6789819196.0, + "step": 13282 + }, + { + "epoch": 3.591941590048675, + "grad_norm": 1.273014783859253, + "learning_rate": 5.492373331242595e-06, + "loss": 1.8371, + "mean_token_accuracy": 0.5750471353530884, + "num_tokens": 6790342373.0, + "step": 13283 + }, + { + "epoch": 3.5922120064899947, + "grad_norm": 1.1813713312149048, + "learning_rate": 5.491126586417168e-06, + "loss": 1.9417, + "mean_token_accuracy": 0.5607553720474243, + "num_tokens": 6790866648.0, + "step": 13284 + }, + { + "epoch": 3.5924824229313144, + "grad_norm": 0.9508567452430725, + "learning_rate": 5.4898800106201745e-06, + "loss": 1.7756, + "mean_token_accuracy": 0.5870428085327148, + "num_tokens": 6791390912.0, + "step": 13285 + }, + { + "epoch": 3.592752839372634, + "grad_norm": 1.2243592739105225, + "learning_rate": 5.48863360388987e-06, + "loss": 1.8609, + "mean_token_accuracy": 0.5498710870742798, + "num_tokens": 6791915102.0, + "step": 13286 + }, + { + "epoch": 3.5930232558139537, + "grad_norm": 1.4610167741775513, + "learning_rate": 5.487387366264491e-06, + "loss": 1.8224, + "mean_token_accuracy": 0.5774844884872437, + "num_tokens": 6792439354.0, + "step": 13287 + }, + { + "epoch": 3.593293672255273, + "grad_norm": 1.237243890762329, + "learning_rate": 5.486141297782279e-06, + "loss": 1.9478, + "mean_token_accuracy": 0.551069438457489, + "num_tokens": 6792963579.0, + "step": 13288 + }, + { + "epoch": 3.593564088696593, + "grad_norm": 0.8072181940078735, + "learning_rate": 5.48489539848147e-06, + "loss": 1.8655, + "mean_token_accuracy": 0.5785784721374512, + "num_tokens": 6793487859.0, + "step": 13289 + }, + { + "epoch": 3.593834505137912, + "grad_norm": 1.311249852180481, + "learning_rate": 5.48364966840029e-06, + "loss": 1.8533, + "mean_token_accuracy": 0.5586221218109131, + "num_tokens": 6794012138.0, + "step": 13290 + }, + { + "epoch": 3.5941049215792322, + "grad_norm": 1.3849866390228271, + "learning_rate": 5.4824041075769565e-06, + "loss": 1.9835, + "mean_token_accuracy": 0.563598096370697, + "num_tokens": 6794536341.0, + "step": 13291 + }, + { + "epoch": 3.5943753380205514, + "grad_norm": 1.3118560314178467, + "learning_rate": 5.4811587160496945e-06, + "loss": 1.8736, + "mean_token_accuracy": 0.5745308995246887, + "num_tokens": 6795024324.0, + "step": 13292 + }, + { + "epoch": 3.5946457544618715, + "grad_norm": 1.13895583152771, + "learning_rate": 5.479913493856713e-06, + "loss": 1.9108, + "mean_token_accuracy": 0.543938159942627, + "num_tokens": 6795548598.0, + "step": 13293 + }, + { + "epoch": 3.5949161709031907, + "grad_norm": 0.943145215511322, + "learning_rate": 5.4786684410362165e-06, + "loss": 1.8612, + "mean_token_accuracy": 0.5650181174278259, + "num_tokens": 6796072877.0, + "step": 13294 + }, + { + "epoch": 3.595186587344511, + "grad_norm": 1.3286340236663818, + "learning_rate": 5.477423557626413e-06, + "loss": 1.7983, + "mean_token_accuracy": 0.595098614692688, + "num_tokens": 6796597085.0, + "step": 13295 + }, + { + "epoch": 3.59545700378583, + "grad_norm": 1.4201160669326782, + "learning_rate": 5.476178843665494e-06, + "loss": 1.9156, + "mean_token_accuracy": 0.5680512189865112, + "num_tokens": 6797083742.0, + "step": 13296 + }, + { + "epoch": 3.59572742022715, + "grad_norm": 1.257501482963562, + "learning_rate": 5.4749342991916496e-06, + "loss": 1.9042, + "mean_token_accuracy": 0.5607227087020874, + "num_tokens": 6797607985.0, + "step": 13297 + }, + { + "epoch": 3.5959978366684693, + "grad_norm": 0.9255199432373047, + "learning_rate": 5.473689924243074e-06, + "loss": 1.8856, + "mean_token_accuracy": 0.5808514356613159, + "num_tokens": 6798080065.0, + "step": 13298 + }, + { + "epoch": 3.596268253109789, + "grad_norm": 1.2015982866287231, + "learning_rate": 5.472445718857941e-06, + "loss": 1.922, + "mean_token_accuracy": 0.552907407283783, + "num_tokens": 6798604263.0, + "step": 13299 + }, + { + "epoch": 3.5965386695511086, + "grad_norm": 1.0935240983963013, + "learning_rate": 5.471201683074428e-06, + "loss": 1.8998, + "mean_token_accuracy": 0.5701419711112976, + "num_tokens": 6799128539.0, + "step": 13300 + }, + { + "epoch": 3.596809085992428, + "grad_norm": 0.38812997937202454, + "learning_rate": 5.469957816930704e-06, + "loss": 1.0764, + "mean_token_accuracy": 0.7161040306091309, + "num_tokens": 6799622770.0, + "step": 13301 + }, + { + "epoch": 3.597079502433748, + "grad_norm": 1.160639762878418, + "learning_rate": 5.468714120464942e-06, + "loss": 1.9239, + "mean_token_accuracy": 0.5479199290275574, + "num_tokens": 6800147046.0, + "step": 13302 + }, + { + "epoch": 3.5973499188750675, + "grad_norm": 1.3221114873886108, + "learning_rate": 5.467470593715294e-06, + "loss": 1.8547, + "mean_token_accuracy": 0.5838545560836792, + "num_tokens": 6800630041.0, + "step": 13303 + }, + { + "epoch": 3.597620335316387, + "grad_norm": 1.03388249874115, + "learning_rate": 5.466227236719921e-06, + "loss": 1.9012, + "mean_token_accuracy": 0.5604625940322876, + "num_tokens": 6801154276.0, + "step": 13304 + }, + { + "epoch": 3.597890751757707, + "grad_norm": 0.9203280210494995, + "learning_rate": 5.464984049516968e-06, + "loss": 1.8123, + "mean_token_accuracy": 0.5774226784706116, + "num_tokens": 6801678545.0, + "step": 13305 + }, + { + "epoch": 3.5981611681990264, + "grad_norm": 0.9930556416511536, + "learning_rate": 5.463741032144583e-06, + "loss": 1.8161, + "mean_token_accuracy": 0.5695017576217651, + "num_tokens": 6802202817.0, + "step": 13306 + }, + { + "epoch": 3.598431584640346, + "grad_norm": 1.043691873550415, + "learning_rate": 5.462498184640905e-06, + "loss": 1.8849, + "mean_token_accuracy": 0.5593857765197754, + "num_tokens": 6802726928.0, + "step": 13307 + }, + { + "epoch": 3.5987020010816657, + "grad_norm": 0.8674078583717346, + "learning_rate": 5.461255507044063e-06, + "loss": 1.7751, + "mean_token_accuracy": 0.5791651606559753, + "num_tokens": 6803251201.0, + "step": 13308 + }, + { + "epoch": 3.5989724175229854, + "grad_norm": 0.8692159652709961, + "learning_rate": 5.460012999392197e-06, + "loss": 1.795, + "mean_token_accuracy": 0.5727647542953491, + "num_tokens": 6803775467.0, + "step": 13309 + }, + { + "epoch": 3.599242833964305, + "grad_norm": 0.878684401512146, + "learning_rate": 5.458770661723421e-06, + "loss": 1.7571, + "mean_token_accuracy": 0.6010785102844238, + "num_tokens": 6804299687.0, + "step": 13310 + }, + { + "epoch": 3.5995132504056246, + "grad_norm": 1.1456784009933472, + "learning_rate": 5.457528494075855e-06, + "loss": 1.9049, + "mean_token_accuracy": 0.5665957927703857, + "num_tokens": 6804804841.0, + "step": 13311 + }, + { + "epoch": 3.5997836668469443, + "grad_norm": 0.8910924196243286, + "learning_rate": 5.456286496487617e-06, + "loss": 1.8383, + "mean_token_accuracy": 0.5675348043441772, + "num_tokens": 6805328957.0, + "step": 13312 + }, + { + "epoch": 3.600054083288264, + "grad_norm": 0.9604012966156006, + "learning_rate": 5.455044668996813e-06, + "loss": 1.7906, + "mean_token_accuracy": 0.5896315574645996, + "num_tokens": 6805853175.0, + "step": 13313 + }, + { + "epoch": 3.6003244997295836, + "grad_norm": 0.9082571864128113, + "learning_rate": 5.4538030116415404e-06, + "loss": 1.9517, + "mean_token_accuracy": 0.5469788312911987, + "num_tokens": 6806377446.0, + "step": 13314 + }, + { + "epoch": 3.600594916170903, + "grad_norm": 0.9053487181663513, + "learning_rate": 5.452561524459903e-06, + "loss": 1.7173, + "mean_token_accuracy": 0.6236341595649719, + "num_tokens": 6806901558.0, + "step": 13315 + }, + { + "epoch": 3.600865332612223, + "grad_norm": 1.0275160074234009, + "learning_rate": 5.451320207489995e-06, + "loss": 1.9077, + "mean_token_accuracy": 0.5652254223823547, + "num_tokens": 6807425821.0, + "step": 13316 + }, + { + "epoch": 3.6011357490535425, + "grad_norm": 1.129567265510559, + "learning_rate": 5.450079060769898e-06, + "loss": 1.8733, + "mean_token_accuracy": 0.5683857202529907, + "num_tokens": 6807950037.0, + "step": 13317 + }, + { + "epoch": 3.601406165494862, + "grad_norm": 1.0467783212661743, + "learning_rate": 5.4488380843377e-06, + "loss": 1.9088, + "mean_token_accuracy": 0.5644223690032959, + "num_tokens": 6808474214.0, + "step": 13318 + }, + { + "epoch": 3.601676581936182, + "grad_norm": 0.8431773781776428, + "learning_rate": 5.447597278231474e-06, + "loss": 1.8682, + "mean_token_accuracy": 0.5693514347076416, + "num_tokens": 6808998457.0, + "step": 13319 + }, + { + "epoch": 3.6019469983775014, + "grad_norm": 0.9440701603889465, + "learning_rate": 5.446356642489288e-06, + "loss": 1.841, + "mean_token_accuracy": 0.5850223302841187, + "num_tokens": 6809499069.0, + "step": 13320 + }, + { + "epoch": 3.602217414818821, + "grad_norm": 0.35145023465156555, + "learning_rate": 5.445116177149215e-06, + "loss": 1.1263, + "mean_token_accuracy": 0.6951079368591309, + "num_tokens": 6810023271.0, + "step": 13321 + }, + { + "epoch": 3.6024878312601407, + "grad_norm": 1.035613775253296, + "learning_rate": 5.443875882249314e-06, + "loss": 1.9102, + "mean_token_accuracy": 0.5324110984802246, + "num_tokens": 6810547418.0, + "step": 13322 + }, + { + "epoch": 3.6027582477014604, + "grad_norm": 1.3065072298049927, + "learning_rate": 5.442635757827639e-06, + "loss": 1.869, + "mean_token_accuracy": 0.562659740447998, + "num_tokens": 6811053024.0, + "step": 13323 + }, + { + "epoch": 3.60302866414278, + "grad_norm": 1.0205494165420532, + "learning_rate": 5.441395803922245e-06, + "loss": 1.8795, + "mean_token_accuracy": 0.5755302309989929, + "num_tokens": 6811577282.0, + "step": 13324 + }, + { + "epoch": 3.6032990805840996, + "grad_norm": 0.951820433139801, + "learning_rate": 5.440156020571173e-06, + "loss": 1.9219, + "mean_token_accuracy": 0.5579586029052734, + "num_tokens": 6812101560.0, + "step": 13325 + }, + { + "epoch": 3.6035694970254193, + "grad_norm": 1.1001160144805908, + "learning_rate": 5.438916407812463e-06, + "loss": 1.9079, + "mean_token_accuracy": 0.569676399230957, + "num_tokens": 6812563767.0, + "step": 13326 + }, + { + "epoch": 3.603839913466739, + "grad_norm": 0.8900473117828369, + "learning_rate": 5.437676965684153e-06, + "loss": 1.5971, + "mean_token_accuracy": 0.6022011637687683, + "num_tokens": 6813087975.0, + "step": 13327 + }, + { + "epoch": 3.6041103299080586, + "grad_norm": 1.0799578428268433, + "learning_rate": 5.4364376942242705e-06, + "loss": 1.8067, + "mean_token_accuracy": 0.571927547454834, + "num_tokens": 6813612004.0, + "step": 13328 + }, + { + "epoch": 3.6043807463493778, + "grad_norm": 1.1073188781738281, + "learning_rate": 5.435198593470843e-06, + "loss": 1.8264, + "mean_token_accuracy": 0.5847784280776978, + "num_tokens": 6814136192.0, + "step": 13329 + }, + { + "epoch": 3.604651162790698, + "grad_norm": 1.1139955520629883, + "learning_rate": 5.433959663461884e-06, + "loss": 1.7795, + "mean_token_accuracy": 0.5845433473587036, + "num_tokens": 6814635226.0, + "step": 13330 + }, + { + "epoch": 3.604921579232017, + "grad_norm": 1.05580735206604, + "learning_rate": 5.432720904235411e-06, + "loss": 1.8435, + "mean_token_accuracy": 0.5708386898040771, + "num_tokens": 6815159399.0, + "step": 13331 + }, + { + "epoch": 3.605191995673337, + "grad_norm": 1.1598615646362305, + "learning_rate": 5.4314823158294374e-06, + "loss": 1.8038, + "mean_token_accuracy": 0.5889842510223389, + "num_tokens": 6815625756.0, + "step": 13332 + }, + { + "epoch": 3.6054624121146563, + "grad_norm": 0.9520301222801208, + "learning_rate": 5.430243898281962e-06, + "loss": 1.8453, + "mean_token_accuracy": 0.5726236701011658, + "num_tokens": 6816149848.0, + "step": 13333 + }, + { + "epoch": 3.6057328285559764, + "grad_norm": 1.0033177137374878, + "learning_rate": 5.42900565163098e-06, + "loss": 1.7861, + "mean_token_accuracy": 0.5709244608879089, + "num_tokens": 6816674041.0, + "step": 13334 + }, + { + "epoch": 3.6060032449972956, + "grad_norm": 1.1393063068389893, + "learning_rate": 5.427767575914491e-06, + "loss": 1.7912, + "mean_token_accuracy": 0.5860545635223389, + "num_tokens": 6817198325.0, + "step": 13335 + }, + { + "epoch": 3.6062736614386157, + "grad_norm": 1.1504331827163696, + "learning_rate": 5.42652967117048e-06, + "loss": 1.8351, + "mean_token_accuracy": 0.5636905431747437, + "num_tokens": 6817722543.0, + "step": 13336 + }, + { + "epoch": 3.606544077879935, + "grad_norm": 1.0817886590957642, + "learning_rate": 5.425291937436925e-06, + "loss": 1.8432, + "mean_token_accuracy": 0.5704582929611206, + "num_tokens": 6818246780.0, + "step": 13337 + }, + { + "epoch": 3.606814494321255, + "grad_norm": 1.0537821054458618, + "learning_rate": 5.4240543747518105e-06, + "loss": 1.8089, + "mean_token_accuracy": 0.5698739290237427, + "num_tokens": 6818771052.0, + "step": 13338 + }, + { + "epoch": 3.607084910762574, + "grad_norm": 0.9360297322273254, + "learning_rate": 5.422816983153107e-06, + "loss": 1.8367, + "mean_token_accuracy": 0.5764542818069458, + "num_tokens": 6819295234.0, + "step": 13339 + }, + { + "epoch": 3.607355327203894, + "grad_norm": 1.089547872543335, + "learning_rate": 5.421579762678777e-06, + "loss": 1.8135, + "mean_token_accuracy": 0.583892822265625, + "num_tokens": 6819819465.0, + "step": 13340 + }, + { + "epoch": 3.6076257436452135, + "grad_norm": 0.4112412929534912, + "learning_rate": 5.420342713366786e-06, + "loss": 1.0831, + "mean_token_accuracy": 0.7167161703109741, + "num_tokens": 6820283308.0, + "step": 13341 + }, + { + "epoch": 3.607896160086533, + "grad_norm": 1.553661823272705, + "learning_rate": 5.41910583525509e-06, + "loss": 1.8717, + "mean_token_accuracy": 0.5713175535202026, + "num_tokens": 6820807482.0, + "step": 13342 + }, + { + "epoch": 3.6081665765278528, + "grad_norm": 1.6011908054351807, + "learning_rate": 5.417869128381637e-06, + "loss": 1.8788, + "mean_token_accuracy": 0.5596402287483215, + "num_tokens": 6821331657.0, + "step": 13343 + }, + { + "epoch": 3.6084369929691724, + "grad_norm": 1.1929378509521484, + "learning_rate": 5.416632592784374e-06, + "loss": 1.7739, + "mean_token_accuracy": 0.5469793081283569, + "num_tokens": 6821855847.0, + "step": 13344 + }, + { + "epoch": 3.608707409410492, + "grad_norm": 1.0967692136764526, + "learning_rate": 5.415396228501247e-06, + "loss": 1.8519, + "mean_token_accuracy": 0.5693233013153076, + "num_tokens": 6822379957.0, + "step": 13345 + }, + { + "epoch": 3.6089778258518117, + "grad_norm": 1.3679066896438599, + "learning_rate": 5.414160035570184e-06, + "loss": 1.8183, + "mean_token_accuracy": 0.5752179622650146, + "num_tokens": 6822904216.0, + "step": 13346 + }, + { + "epoch": 3.6092482422931313, + "grad_norm": 1.2695940732955933, + "learning_rate": 5.4129240140291205e-06, + "loss": 1.7456, + "mean_token_accuracy": 0.5913256406784058, + "num_tokens": 6823394743.0, + "step": 13347 + }, + { + "epoch": 3.609518658734451, + "grad_norm": 1.352735996246338, + "learning_rate": 5.411688163915978e-06, + "loss": 1.8863, + "mean_token_accuracy": 0.5626405477523804, + "num_tokens": 6823919008.0, + "step": 13348 + }, + { + "epoch": 3.6097890751757706, + "grad_norm": 1.153761863708496, + "learning_rate": 5.410452485268676e-06, + "loss": 1.9321, + "mean_token_accuracy": 0.5707176923751831, + "num_tokens": 6824379614.0, + "step": 13349 + }, + { + "epoch": 3.6100594916170903, + "grad_norm": 1.0998610258102417, + "learning_rate": 5.409216978125129e-06, + "loss": 1.8046, + "mean_token_accuracy": 0.5860095024108887, + "num_tokens": 6824856032.0, + "step": 13350 + }, + { + "epoch": 3.61032990805841, + "grad_norm": 1.2521562576293945, + "learning_rate": 5.407981642523246e-06, + "loss": 1.8941, + "mean_token_accuracy": 0.5758323073387146, + "num_tokens": 6825380291.0, + "step": 13351 + }, + { + "epoch": 3.6106003244997296, + "grad_norm": 1.477755069732666, + "learning_rate": 5.406746478500935e-06, + "loss": 1.8239, + "mean_token_accuracy": 0.5830324292182922, + "num_tokens": 6825806177.0, + "step": 13352 + }, + { + "epoch": 3.610870740941049, + "grad_norm": 1.0682493448257446, + "learning_rate": 5.405511486096089e-06, + "loss": 1.8975, + "mean_token_accuracy": 0.568472683429718, + "num_tokens": 6826307356.0, + "step": 13353 + }, + { + "epoch": 3.611141157382369, + "grad_norm": 0.8936750888824463, + "learning_rate": 5.4042766653465996e-06, + "loss": 1.7326, + "mean_token_accuracy": 0.5909478664398193, + "num_tokens": 6826831493.0, + "step": 13354 + }, + { + "epoch": 3.6114115738236885, + "grad_norm": 1.185529112815857, + "learning_rate": 5.403042016290363e-06, + "loss": 1.9613, + "mean_token_accuracy": 0.55875563621521, + "num_tokens": 6827355734.0, + "step": 13355 + }, + { + "epoch": 3.611681990265008, + "grad_norm": 1.1197731494903564, + "learning_rate": 5.401807538965256e-06, + "loss": 1.8584, + "mean_token_accuracy": 0.5589956045150757, + "num_tokens": 6827837495.0, + "step": 13356 + }, + { + "epoch": 3.611952406706328, + "grad_norm": 0.9146426916122437, + "learning_rate": 5.400573233409152e-06, + "loss": 1.8013, + "mean_token_accuracy": 0.5881190896034241, + "num_tokens": 6828361671.0, + "step": 13357 + }, + { + "epoch": 3.6122228231476474, + "grad_norm": 0.9198224544525146, + "learning_rate": 5.399339099659931e-06, + "loss": 1.816, + "mean_token_accuracy": 0.5758103132247925, + "num_tokens": 6828885923.0, + "step": 13358 + }, + { + "epoch": 3.612493239588967, + "grad_norm": 0.7815648913383484, + "learning_rate": 5.398105137755458e-06, + "loss": 1.8672, + "mean_token_accuracy": 0.5694288015365601, + "num_tokens": 6829410152.0, + "step": 13359 + }, + { + "epoch": 3.6127636560302867, + "grad_norm": 1.0649908781051636, + "learning_rate": 5.396871347733592e-06, + "loss": 1.8648, + "mean_token_accuracy": 0.567508339881897, + "num_tokens": 6829934401.0, + "step": 13360 + }, + { + "epoch": 3.6130340724716064, + "grad_norm": 0.3472743332386017, + "learning_rate": 5.395637729632193e-06, + "loss": 1.1166, + "mean_token_accuracy": 0.7064567804336548, + "num_tokens": 6830458606.0, + "step": 13361 + }, + { + "epoch": 3.613304488912926, + "grad_norm": 1.4067848920822144, + "learning_rate": 5.39440428348911e-06, + "loss": 1.841, + "mean_token_accuracy": 0.5703423023223877, + "num_tokens": 6830982634.0, + "step": 13362 + }, + { + "epoch": 3.6135749053542456, + "grad_norm": 1.3653302192687988, + "learning_rate": 5.393171009342184e-06, + "loss": 1.9158, + "mean_token_accuracy": 0.5604316592216492, + "num_tokens": 6831478121.0, + "step": 13363 + }, + { + "epoch": 3.6138453217955653, + "grad_norm": 0.9885250926017761, + "learning_rate": 5.391937907229265e-06, + "loss": 1.8063, + "mean_token_accuracy": 0.5736300945281982, + "num_tokens": 6832002281.0, + "step": 13364 + }, + { + "epoch": 3.614115738236885, + "grad_norm": 0.9527252912521362, + "learning_rate": 5.3907049771881816e-06, + "loss": 1.8344, + "mean_token_accuracy": 0.5695374011993408, + "num_tokens": 6832526460.0, + "step": 13365 + }, + { + "epoch": 3.6143861546782046, + "grad_norm": 1.0813132524490356, + "learning_rate": 5.389472219256764e-06, + "loss": 1.8302, + "mean_token_accuracy": 0.5813479423522949, + "num_tokens": 6833050730.0, + "step": 13366 + }, + { + "epoch": 3.614656571119524, + "grad_norm": 1.1698609590530396, + "learning_rate": 5.3882396334728405e-06, + "loss": 1.7198, + "mean_token_accuracy": 0.5821999311447144, + "num_tokens": 6833521113.0, + "step": 13367 + }, + { + "epoch": 3.614926987560844, + "grad_norm": 1.2071233987808228, + "learning_rate": 5.38700721987423e-06, + "loss": 1.8227, + "mean_token_accuracy": 0.5808014869689941, + "num_tokens": 6834007465.0, + "step": 13368 + }, + { + "epoch": 3.6151974040021635, + "grad_norm": 1.0443878173828125, + "learning_rate": 5.385774978498739e-06, + "loss": 1.82, + "mean_token_accuracy": 0.5815993547439575, + "num_tokens": 6834531634.0, + "step": 13369 + }, + { + "epoch": 3.6154678204434827, + "grad_norm": 1.1734189987182617, + "learning_rate": 5.384542909384185e-06, + "loss": 1.8803, + "mean_token_accuracy": 0.5835186839103699, + "num_tokens": 6835024774.0, + "step": 13370 + }, + { + "epoch": 3.615738236884803, + "grad_norm": 0.9798425436019897, + "learning_rate": 5.383311012568366e-06, + "loss": 1.8346, + "mean_token_accuracy": 0.5831801295280457, + "num_tokens": 6835485013.0, + "step": 13371 + }, + { + "epoch": 3.616008653326122, + "grad_norm": 1.0941208600997925, + "learning_rate": 5.3820792880890875e-06, + "loss": 1.8147, + "mean_token_accuracy": 0.5856879353523254, + "num_tokens": 6835964680.0, + "step": 13372 + }, + { + "epoch": 3.616279069767442, + "grad_norm": 1.2496979236602783, + "learning_rate": 5.3808477359841314e-06, + "loss": 1.7477, + "mean_token_accuracy": 0.5829026699066162, + "num_tokens": 6836488860.0, + "step": 13373 + }, + { + "epoch": 3.6165494862087613, + "grad_norm": 1.0302393436431885, + "learning_rate": 5.379616356291295e-06, + "loss": 1.8277, + "mean_token_accuracy": 0.5885320901870728, + "num_tokens": 6837013111.0, + "step": 13374 + }, + { + "epoch": 3.6168199026500814, + "grad_norm": 0.927828311920166, + "learning_rate": 5.378385149048358e-06, + "loss": 1.8535, + "mean_token_accuracy": 0.5712205171585083, + "num_tokens": 6837537280.0, + "step": 13375 + }, + { + "epoch": 3.6170903190914006, + "grad_norm": 0.8660772442817688, + "learning_rate": 5.377154114293097e-06, + "loss": 1.6772, + "mean_token_accuracy": 0.6037778854370117, + "num_tokens": 6838022997.0, + "step": 13376 + }, + { + "epoch": 3.6173607355327206, + "grad_norm": 0.9174701571464539, + "learning_rate": 5.375923252063282e-06, + "loss": 1.8579, + "mean_token_accuracy": 0.5774272680282593, + "num_tokens": 6838500373.0, + "step": 13377 + }, + { + "epoch": 3.61763115197404, + "grad_norm": 1.0800237655639648, + "learning_rate": 5.374692562396684e-06, + "loss": 1.9618, + "mean_token_accuracy": 0.5430862903594971, + "num_tokens": 6839024589.0, + "step": 13378 + }, + { + "epoch": 3.61790156841536, + "grad_norm": 1.0075193643569946, + "learning_rate": 5.373462045331059e-06, + "loss": 1.8516, + "mean_token_accuracy": 0.5846672058105469, + "num_tokens": 6839548835.0, + "step": 13379 + }, + { + "epoch": 3.618171984856679, + "grad_norm": 1.0537604093551636, + "learning_rate": 5.372231700904165e-06, + "loss": 1.8277, + "mean_token_accuracy": 0.5890908241271973, + "num_tokens": 6839968158.0, + "step": 13380 + }, + { + "epoch": 3.6184424012979988, + "grad_norm": 0.38158470392227173, + "learning_rate": 5.371001529153752e-06, + "loss": 1.0819, + "mean_token_accuracy": 0.7075660824775696, + "num_tokens": 6840492336.0, + "step": 13381 + }, + { + "epoch": 3.6187128177393184, + "grad_norm": 1.2101221084594727, + "learning_rate": 5.369771530117568e-06, + "loss": 1.9328, + "mean_token_accuracy": 0.5679565668106079, + "num_tokens": 6840950115.0, + "step": 13382 + }, + { + "epoch": 3.618983234180638, + "grad_norm": 1.1468311548233032, + "learning_rate": 5.368541703833348e-06, + "loss": 1.8147, + "mean_token_accuracy": 0.5892121195793152, + "num_tokens": 6841474163.0, + "step": 13383 + }, + { + "epoch": 3.6192536506219577, + "grad_norm": 0.9259488582611084, + "learning_rate": 5.367312050338832e-06, + "loss": 1.7742, + "mean_token_accuracy": 0.5790358781814575, + "num_tokens": 6841998434.0, + "step": 13384 + }, + { + "epoch": 3.6195240670632773, + "grad_norm": 1.1328363418579102, + "learning_rate": 5.366082569671744e-06, + "loss": 1.9486, + "mean_token_accuracy": 0.5517743825912476, + "num_tokens": 6842522705.0, + "step": 13385 + }, + { + "epoch": 3.619794483504597, + "grad_norm": 0.8626633882522583, + "learning_rate": 5.364853261869815e-06, + "loss": 1.6574, + "mean_token_accuracy": 0.6201081871986389, + "num_tokens": 6843046968.0, + "step": 13386 + }, + { + "epoch": 3.6200648999459166, + "grad_norm": 0.9846973419189453, + "learning_rate": 5.363624126970754e-06, + "loss": 1.773, + "mean_token_accuracy": 0.5815744400024414, + "num_tokens": 6843562859.0, + "step": 13387 + }, + { + "epoch": 3.6203353163872363, + "grad_norm": 0.9741714000701904, + "learning_rate": 5.362395165012286e-06, + "loss": 1.8239, + "mean_token_accuracy": 0.5712223649024963, + "num_tokens": 6844087050.0, + "step": 13388 + }, + { + "epoch": 3.620605732828556, + "grad_norm": 1.0823321342468262, + "learning_rate": 5.36116637603211e-06, + "loss": 1.7551, + "mean_token_accuracy": 0.6377348899841309, + "num_tokens": 6844546022.0, + "step": 13389 + }, + { + "epoch": 3.6208761492698756, + "grad_norm": 1.1309254169464111, + "learning_rate": 5.3599377600679345e-06, + "loss": 1.8384, + "mean_token_accuracy": 0.5812217593193054, + "num_tokens": 6845041512.0, + "step": 13390 + }, + { + "epoch": 3.621146565711195, + "grad_norm": 0.8526059985160828, + "learning_rate": 5.358709317157456e-06, + "loss": 1.7938, + "mean_token_accuracy": 0.5814916491508484, + "num_tokens": 6845565679.0, + "step": 13391 + }, + { + "epoch": 3.621416982152515, + "grad_norm": 0.8867337107658386, + "learning_rate": 5.357481047338362e-06, + "loss": 1.5896, + "mean_token_accuracy": 0.6461546421051025, + "num_tokens": 6846037413.0, + "step": 13392 + }, + { + "epoch": 3.6216873985938345, + "grad_norm": 1.1326321363449097, + "learning_rate": 5.356252950648345e-06, + "loss": 1.8427, + "mean_token_accuracy": 0.5600663423538208, + "num_tokens": 6846561668.0, + "step": 13393 + }, + { + "epoch": 3.621957815035154, + "grad_norm": 1.1644225120544434, + "learning_rate": 5.355025027125086e-06, + "loss": 1.859, + "mean_token_accuracy": 0.5613071918487549, + "num_tokens": 6847040108.0, + "step": 13394 + }, + { + "epoch": 3.6222282314764738, + "grad_norm": 0.8792673945426941, + "learning_rate": 5.353797276806258e-06, + "loss": 1.7551, + "mean_token_accuracy": 0.5888279676437378, + "num_tokens": 6847564342.0, + "step": 13395 + }, + { + "epoch": 3.6224986479177934, + "grad_norm": 0.8799439072608948, + "learning_rate": 5.352569699729534e-06, + "loss": 1.7099, + "mean_token_accuracy": 0.5745131373405457, + "num_tokens": 6848088511.0, + "step": 13396 + }, + { + "epoch": 3.622769064359113, + "grad_norm": 1.0088738203048706, + "learning_rate": 5.351342295932578e-06, + "loss": 1.8294, + "mean_token_accuracy": 0.5735387802124023, + "num_tokens": 6848612699.0, + "step": 13397 + }, + { + "epoch": 3.6230394808004327, + "grad_norm": 1.0671846866607666, + "learning_rate": 5.350115065453056e-06, + "loss": 1.7782, + "mean_token_accuracy": 0.59259033203125, + "num_tokens": 6849136966.0, + "step": 13398 + }, + { + "epoch": 3.6233098972417523, + "grad_norm": 1.0471934080123901, + "learning_rate": 5.348888008328618e-06, + "loss": 1.7878, + "mean_token_accuracy": 0.5838740468025208, + "num_tokens": 6849661212.0, + "step": 13399 + }, + { + "epoch": 3.623580313683072, + "grad_norm": 1.0367391109466553, + "learning_rate": 5.347661124596912e-06, + "loss": 1.7354, + "mean_token_accuracy": 0.601874828338623, + "num_tokens": 6850185395.0, + "step": 13400 + }, + { + "epoch": 3.6238507301243916, + "grad_norm": 0.39551493525505066, + "learning_rate": 5.346434414295585e-06, + "loss": 1.1356, + "mean_token_accuracy": 0.6927049160003662, + "num_tokens": 6850709602.0, + "step": 13401 + }, + { + "epoch": 3.6241211465657113, + "grad_norm": 1.2397866249084473, + "learning_rate": 5.34520787746228e-06, + "loss": 1.8817, + "mean_token_accuracy": 0.584734320640564, + "num_tokens": 6851226441.0, + "step": 13402 + }, + { + "epoch": 3.624391563007031, + "grad_norm": 1.1462148427963257, + "learning_rate": 5.343981514134622e-06, + "loss": 1.8276, + "mean_token_accuracy": 0.555213451385498, + "num_tokens": 6851750714.0, + "step": 13403 + }, + { + "epoch": 3.6246619794483506, + "grad_norm": 1.0972480773925781, + "learning_rate": 5.342755324350248e-06, + "loss": 1.8327, + "mean_token_accuracy": 0.5857488512992859, + "num_tokens": 6852274919.0, + "step": 13404 + }, + { + "epoch": 3.62493239588967, + "grad_norm": 0.8597122430801392, + "learning_rate": 5.341529308146778e-06, + "loss": 1.8118, + "mean_token_accuracy": 0.5768548250198364, + "num_tokens": 6852799200.0, + "step": 13405 + }, + { + "epoch": 3.62520281233099, + "grad_norm": 0.9993745684623718, + "learning_rate": 5.3403034655618265e-06, + "loss": 1.8293, + "mean_token_accuracy": 0.5709953904151917, + "num_tokens": 6853323480.0, + "step": 13406 + }, + { + "epoch": 3.6254732287723095, + "grad_norm": 1.0637649297714233, + "learning_rate": 5.339077796633012e-06, + "loss": 1.8853, + "mean_token_accuracy": 0.5615168213844299, + "num_tokens": 6853847757.0, + "step": 13407 + }, + { + "epoch": 3.625743645213629, + "grad_norm": 1.0813336372375488, + "learning_rate": 5.3378523013979385e-06, + "loss": 1.8158, + "mean_token_accuracy": 0.5708238482475281, + "num_tokens": 6854371983.0, + "step": 13408 + }, + { + "epoch": 3.6260140616549488, + "grad_norm": 0.8356741666793823, + "learning_rate": 5.336626979894204e-06, + "loss": 1.8638, + "mean_token_accuracy": 0.565868616104126, + "num_tokens": 6854896205.0, + "step": 13409 + }, + { + "epoch": 3.6262844780962684, + "grad_norm": 1.127947449684143, + "learning_rate": 5.335401832159413e-06, + "loss": 1.8816, + "mean_token_accuracy": 0.5703054666519165, + "num_tokens": 6855369191.0, + "step": 13410 + }, + { + "epoch": 3.6265548945375876, + "grad_norm": 0.9423738718032837, + "learning_rate": 5.3341768582311504e-06, + "loss": 1.7997, + "mean_token_accuracy": 0.5763438940048218, + "num_tokens": 6855893415.0, + "step": 13411 + }, + { + "epoch": 3.6268253109789077, + "grad_norm": 0.8492210507392883, + "learning_rate": 5.332952058147002e-06, + "loss": 1.8195, + "mean_token_accuracy": 0.5941001176834106, + "num_tokens": 6856360083.0, + "step": 13412 + }, + { + "epoch": 3.627095727420227, + "grad_norm": 0.8598038554191589, + "learning_rate": 5.331727431944553e-06, + "loss": 1.6924, + "mean_token_accuracy": 0.6041879057884216, + "num_tokens": 6856862487.0, + "step": 13413 + }, + { + "epoch": 3.627366143861547, + "grad_norm": 0.8633260726928711, + "learning_rate": 5.330502979661373e-06, + "loss": 1.7687, + "mean_token_accuracy": 0.5967975854873657, + "num_tokens": 6857386738.0, + "step": 13414 + }, + { + "epoch": 3.627636560302866, + "grad_norm": 0.8627641797065735, + "learning_rate": 5.329278701335037e-06, + "loss": 1.9412, + "mean_token_accuracy": 0.5424844622612, + "num_tokens": 6857910975.0, + "step": 13415 + }, + { + "epoch": 3.6279069767441863, + "grad_norm": 0.8600972890853882, + "learning_rate": 5.328054597003104e-06, + "loss": 1.7467, + "mean_token_accuracy": 0.5724496841430664, + "num_tokens": 6858435224.0, + "step": 13416 + }, + { + "epoch": 3.6281773931855055, + "grad_norm": 0.8490625023841858, + "learning_rate": 5.326830666703138e-06, + "loss": 1.7106, + "mean_token_accuracy": 0.6148123145103455, + "num_tokens": 6858959418.0, + "step": 13417 + }, + { + "epoch": 3.6284478096268256, + "grad_norm": 0.8721454739570618, + "learning_rate": 5.3256069104726895e-06, + "loss": 1.8012, + "mean_token_accuracy": 0.5846972465515137, + "num_tokens": 6859483615.0, + "step": 13418 + }, + { + "epoch": 3.6287182260681448, + "grad_norm": 0.8501722812652588, + "learning_rate": 5.32438332834931e-06, + "loss": 1.7938, + "mean_token_accuracy": 0.5768566727638245, + "num_tokens": 6859982932.0, + "step": 13419 + }, + { + "epoch": 3.628988642509465, + "grad_norm": 1.0274642705917358, + "learning_rate": 5.32315992037054e-06, + "loss": 1.8164, + "mean_token_accuracy": 0.5757900476455688, + "num_tokens": 6860446781.0, + "step": 13420 + }, + { + "epoch": 3.629259058950784, + "grad_norm": 0.39245742559432983, + "learning_rate": 5.321936686573918e-06, + "loss": 1.0814, + "mean_token_accuracy": 0.6930921077728271, + "num_tokens": 6860970978.0, + "step": 13421 + }, + { + "epoch": 3.6295294753921037, + "grad_norm": 1.077040195465088, + "learning_rate": 5.32071362699698e-06, + "loss": 1.8899, + "mean_token_accuracy": 0.5699238777160645, + "num_tokens": 6861495261.0, + "step": 13422 + }, + { + "epoch": 3.6297998918334233, + "grad_norm": 0.9029653668403625, + "learning_rate": 5.319490741677244e-06, + "loss": 1.8095, + "mean_token_accuracy": 0.5882824659347534, + "num_tokens": 6862007206.0, + "step": 13423 + }, + { + "epoch": 3.630070308274743, + "grad_norm": 0.893893301486969, + "learning_rate": 5.3182680306522415e-06, + "loss": 1.8582, + "mean_token_accuracy": 0.5648230314254761, + "num_tokens": 6862479741.0, + "step": 13424 + }, + { + "epoch": 3.6303407247160626, + "grad_norm": 0.8427790403366089, + "learning_rate": 5.317045493959485e-06, + "loss": 1.8748, + "mean_token_accuracy": 0.5672602653503418, + "num_tokens": 6863003727.0, + "step": 13425 + }, + { + "epoch": 3.6306111411573823, + "grad_norm": 0.9342799186706543, + "learning_rate": 5.315823131636482e-06, + "loss": 1.7695, + "mean_token_accuracy": 0.5779165029525757, + "num_tokens": 6863527887.0, + "step": 13426 + }, + { + "epoch": 3.630881557598702, + "grad_norm": 1.076107382774353, + "learning_rate": 5.314600943720745e-06, + "loss": 1.8801, + "mean_token_accuracy": 0.5606274008750916, + "num_tokens": 6864052001.0, + "step": 13427 + }, + { + "epoch": 3.6311519740400215, + "grad_norm": 1.0048093795776367, + "learning_rate": 5.313378930249765e-06, + "loss": 1.8663, + "mean_token_accuracy": 0.5871667265892029, + "num_tokens": 6864576244.0, + "step": 13428 + }, + { + "epoch": 3.631422390481341, + "grad_norm": 1.0856744050979614, + "learning_rate": 5.312157091261048e-06, + "loss": 1.783, + "mean_token_accuracy": 0.5793580412864685, + "num_tokens": 6865100443.0, + "step": 13429 + }, + { + "epoch": 3.631692806922661, + "grad_norm": 0.9810135960578918, + "learning_rate": 5.310935426792075e-06, + "loss": 1.8498, + "mean_token_accuracy": 0.5802204608917236, + "num_tokens": 6865566971.0, + "step": 13430 + }, + { + "epoch": 3.6319632233639805, + "grad_norm": 1.0500123500823975, + "learning_rate": 5.3097139368803365e-06, + "loss": 1.8273, + "mean_token_accuracy": 0.5872266888618469, + "num_tokens": 6866091189.0, + "step": 13431 + }, + { + "epoch": 3.6322336398053, + "grad_norm": 1.0293326377868652, + "learning_rate": 5.308492621563305e-06, + "loss": 1.941, + "mean_token_accuracy": 0.5520932674407959, + "num_tokens": 6866615351.0, + "step": 13432 + }, + { + "epoch": 3.6325040562466198, + "grad_norm": 0.8357024788856506, + "learning_rate": 5.3072714808784594e-06, + "loss": 1.8255, + "mean_token_accuracy": 0.5846604108810425, + "num_tokens": 6867118816.0, + "step": 13433 + }, + { + "epoch": 3.6327744726879394, + "grad_norm": 0.9723379015922546, + "learning_rate": 5.306050514863268e-06, + "loss": 1.8044, + "mean_token_accuracy": 0.5903934240341187, + "num_tokens": 6867643034.0, + "step": 13434 + }, + { + "epoch": 3.633044889129259, + "grad_norm": 0.8777309656143188, + "learning_rate": 5.304829723555189e-06, + "loss": 1.912, + "mean_token_accuracy": 0.563048779964447, + "num_tokens": 6868159360.0, + "step": 13435 + }, + { + "epoch": 3.6333153055705787, + "grad_norm": 1.002540111541748, + "learning_rate": 5.303609106991685e-06, + "loss": 1.7871, + "mean_token_accuracy": 0.5730693340301514, + "num_tokens": 6868683583.0, + "step": 13436 + }, + { + "epoch": 3.6335857220118983, + "grad_norm": 1.712937831878662, + "learning_rate": 5.302388665210204e-06, + "loss": 1.5939, + "mean_token_accuracy": 0.617557168006897, + "num_tokens": 6869207852.0, + "step": 13437 + }, + { + "epoch": 3.633856138453218, + "grad_norm": 1.2377982139587402, + "learning_rate": 5.3011683982481935e-06, + "loss": 1.9782, + "mean_token_accuracy": 0.5288645625114441, + "num_tokens": 6869732031.0, + "step": 13438 + }, + { + "epoch": 3.6341265548945376, + "grad_norm": 1.1758280992507935, + "learning_rate": 5.2999483061430964e-06, + "loss": 1.9155, + "mean_token_accuracy": 0.5598530769348145, + "num_tokens": 6870249653.0, + "step": 13439 + }, + { + "epoch": 3.6343969713358573, + "grad_norm": 1.2492144107818604, + "learning_rate": 5.29872838893235e-06, + "loss": 1.7914, + "mean_token_accuracy": 0.5841875076293945, + "num_tokens": 6870773809.0, + "step": 13440 + }, + { + "epoch": 3.634667387777177, + "grad_norm": 0.4125535786151886, + "learning_rate": 5.297508646653379e-06, + "loss": 1.1404, + "mean_token_accuracy": 0.6975576877593994, + "num_tokens": 6871246754.0, + "step": 13441 + }, + { + "epoch": 3.6349378042184965, + "grad_norm": 1.5208584070205688, + "learning_rate": 5.296289079343617e-06, + "loss": 1.7427, + "mean_token_accuracy": 0.6042621731758118, + "num_tokens": 6871770972.0, + "step": 13442 + }, + { + "epoch": 3.635208220659816, + "grad_norm": 1.561604619026184, + "learning_rate": 5.295069687040475e-06, + "loss": 1.7456, + "mean_token_accuracy": 0.5879135727882385, + "num_tokens": 6872295125.0, + "step": 13443 + }, + { + "epoch": 3.635478637101136, + "grad_norm": 1.5548095703125, + "learning_rate": 5.293850469781372e-06, + "loss": 1.8224, + "mean_token_accuracy": 0.581656277179718, + "num_tokens": 6872819230.0, + "step": 13444 + }, + { + "epoch": 3.6357490535424555, + "grad_norm": 0.963938295841217, + "learning_rate": 5.292631427603721e-06, + "loss": 1.754, + "mean_token_accuracy": 0.5838332176208496, + "num_tokens": 6873343485.0, + "step": 13445 + }, + { + "epoch": 3.636019469983775, + "grad_norm": 0.9260680675506592, + "learning_rate": 5.291412560544919e-06, + "loss": 1.8048, + "mean_token_accuracy": 0.5756895542144775, + "num_tokens": 6873867687.0, + "step": 13446 + }, + { + "epoch": 3.6362898864250948, + "grad_norm": 1.018213152885437, + "learning_rate": 5.29019386864237e-06, + "loss": 1.8052, + "mean_token_accuracy": 0.5773355960845947, + "num_tokens": 6874391944.0, + "step": 13447 + }, + { + "epoch": 3.6365603028664144, + "grad_norm": 1.1363693475723267, + "learning_rate": 5.288975351933466e-06, + "loss": 1.9269, + "mean_token_accuracy": 0.5568823218345642, + "num_tokens": 6874916118.0, + "step": 13448 + }, + { + "epoch": 3.636830719307734, + "grad_norm": 1.034252643585205, + "learning_rate": 5.287757010455589e-06, + "loss": 1.848, + "mean_token_accuracy": 0.5748409628868103, + "num_tokens": 6875424173.0, + "step": 13449 + }, + { + "epoch": 3.6371011357490537, + "grad_norm": 1.024014949798584, + "learning_rate": 5.2865388442461274e-06, + "loss": 1.8463, + "mean_token_accuracy": 0.5863207578659058, + "num_tokens": 6875948328.0, + "step": 13450 + }, + { + "epoch": 3.6373715521903733, + "grad_norm": 1.1114466190338135, + "learning_rate": 5.285320853342458e-06, + "loss": 1.8231, + "mean_token_accuracy": 0.6131421327590942, + "num_tokens": 6876441322.0, + "step": 13451 + }, + { + "epoch": 3.6376419686316925, + "grad_norm": 1.0680447816848755, + "learning_rate": 5.284103037781947e-06, + "loss": 1.776, + "mean_token_accuracy": 0.5851554870605469, + "num_tokens": 6876965520.0, + "step": 13452 + }, + { + "epoch": 3.6379123850730126, + "grad_norm": 0.8743323087692261, + "learning_rate": 5.282885397601968e-06, + "loss": 1.7325, + "mean_token_accuracy": 0.6044597625732422, + "num_tokens": 6877461221.0, + "step": 13453 + }, + { + "epoch": 3.638182801514332, + "grad_norm": 1.2956202030181885, + "learning_rate": 5.281667932839878e-06, + "loss": 1.9131, + "mean_token_accuracy": 0.5848840475082397, + "num_tokens": 6877930872.0, + "step": 13454 + }, + { + "epoch": 3.638453217955652, + "grad_norm": 1.1643635034561157, + "learning_rate": 5.280450643533028e-06, + "loss": 1.8501, + "mean_token_accuracy": 0.5806218385696411, + "num_tokens": 6878411025.0, + "step": 13455 + }, + { + "epoch": 3.638723634396971, + "grad_norm": 0.9743698239326477, + "learning_rate": 5.279233529718775e-06, + "loss": 1.7478, + "mean_token_accuracy": 0.611568033695221, + "num_tokens": 6878885911.0, + "step": 13456 + }, + { + "epoch": 3.638994050838291, + "grad_norm": 0.9673731923103333, + "learning_rate": 5.278016591434461e-06, + "loss": 1.7222, + "mean_token_accuracy": 0.5959219336509705, + "num_tokens": 6879410167.0, + "step": 13457 + }, + { + "epoch": 3.6392644672796104, + "grad_norm": 1.0693780183792114, + "learning_rate": 5.2767998287174215e-06, + "loss": 1.8463, + "mean_token_accuracy": 0.5552804470062256, + "num_tokens": 6879934235.0, + "step": 13458 + }, + { + "epoch": 3.6395348837209305, + "grad_norm": 0.9993634223937988, + "learning_rate": 5.275583241604999e-06, + "loss": 1.6514, + "mean_token_accuracy": 0.6155583262443542, + "num_tokens": 6880442617.0, + "step": 13459 + }, + { + "epoch": 3.6398053001622497, + "grad_norm": 0.8715377449989319, + "learning_rate": 5.2743668301345165e-06, + "loss": 1.9577, + "mean_token_accuracy": 0.5570218563079834, + "num_tokens": 6880966745.0, + "step": 13460 + }, + { + "epoch": 3.6400757166035698, + "grad_norm": 0.3688109517097473, + "learning_rate": 5.273150594343295e-06, + "loss": 1.0884, + "mean_token_accuracy": 0.7100551128387451, + "num_tokens": 6881447441.0, + "step": 13461 + }, + { + "epoch": 3.640346133044889, + "grad_norm": 1.2435575723648071, + "learning_rate": 5.271934534268659e-06, + "loss": 1.9179, + "mean_token_accuracy": 0.5470222234725952, + "num_tokens": 6881971620.0, + "step": 13462 + }, + { + "epoch": 3.6406165494862086, + "grad_norm": 1.280287265777588, + "learning_rate": 5.270718649947915e-06, + "loss": 1.958, + "mean_token_accuracy": 0.5538182854652405, + "num_tokens": 6882475819.0, + "step": 13463 + }, + { + "epoch": 3.6408869659275283, + "grad_norm": 0.9572860598564148, + "learning_rate": 5.2695029414183696e-06, + "loss": 1.7991, + "mean_token_accuracy": 0.5620269775390625, + "num_tokens": 6883000049.0, + "step": 13464 + }, + { + "epoch": 3.641157382368848, + "grad_norm": 0.9771444201469421, + "learning_rate": 5.2682874087173286e-06, + "loss": 1.9079, + "mean_token_accuracy": 0.5706878900527954, + "num_tokens": 6883524296.0, + "step": 13465 + }, + { + "epoch": 3.6414277988101675, + "grad_norm": 0.9571672677993774, + "learning_rate": 5.267072051882083e-06, + "loss": 1.8448, + "mean_token_accuracy": 0.5656680464744568, + "num_tokens": 6884048442.0, + "step": 13466 + }, + { + "epoch": 3.641698215251487, + "grad_norm": 1.1358485221862793, + "learning_rate": 5.265856870949928e-06, + "loss": 1.8396, + "mean_token_accuracy": 0.5834144353866577, + "num_tokens": 6884519618.0, + "step": 13467 + }, + { + "epoch": 3.641968631692807, + "grad_norm": 0.954364538192749, + "learning_rate": 5.2646418659581485e-06, + "loss": 1.815, + "mean_token_accuracy": 0.5776054859161377, + "num_tokens": 6885043756.0, + "step": 13468 + }, + { + "epoch": 3.6422390481341265, + "grad_norm": 1.0652148723602295, + "learning_rate": 5.26342703694402e-06, + "loss": 1.8142, + "mean_token_accuracy": 0.5700907707214355, + "num_tokens": 6885568035.0, + "step": 13469 + }, + { + "epoch": 3.642509464575446, + "grad_norm": 0.9894809722900391, + "learning_rate": 5.262212383944824e-06, + "loss": 1.8684, + "mean_token_accuracy": 0.5773613452911377, + "num_tokens": 6886080179.0, + "step": 13470 + }, + { + "epoch": 3.6427798810167658, + "grad_norm": 0.9127960801124573, + "learning_rate": 5.2609979069978225e-06, + "loss": 1.966, + "mean_token_accuracy": 0.5539003610610962, + "num_tokens": 6886604446.0, + "step": 13471 + }, + { + "epoch": 3.6430502974580854, + "grad_norm": 0.8749920725822449, + "learning_rate": 5.259783606140283e-06, + "loss": 1.8881, + "mean_token_accuracy": 0.5717558264732361, + "num_tokens": 6887128424.0, + "step": 13472 + }, + { + "epoch": 3.643320713899405, + "grad_norm": 1.0119056701660156, + "learning_rate": 5.258569481409464e-06, + "loss": 1.7635, + "mean_token_accuracy": 0.5852727890014648, + "num_tokens": 6887608065.0, + "step": 13473 + }, + { + "epoch": 3.6435911303407247, + "grad_norm": 1.0779510736465454, + "learning_rate": 5.257355532842618e-06, + "loss": 1.7639, + "mean_token_accuracy": 0.5953640937805176, + "num_tokens": 6888123326.0, + "step": 13474 + }, + { + "epoch": 3.6438615467820443, + "grad_norm": 0.8746423721313477, + "learning_rate": 5.256141760476991e-06, + "loss": 1.831, + "mean_token_accuracy": 0.5774208307266235, + "num_tokens": 6888647544.0, + "step": 13475 + }, + { + "epoch": 3.644131963223364, + "grad_norm": 1.1023879051208496, + "learning_rate": 5.254928164349828e-06, + "loss": 1.8114, + "mean_token_accuracy": 0.5863097906112671, + "num_tokens": 6889171649.0, + "step": 13476 + }, + { + "epoch": 3.6444023796646836, + "grad_norm": 1.1041743755340576, + "learning_rate": 5.253714744498364e-06, + "loss": 1.8606, + "mean_token_accuracy": 0.5768749117851257, + "num_tokens": 6889643276.0, + "step": 13477 + }, + { + "epoch": 3.6446727961060033, + "grad_norm": 1.1761606931686401, + "learning_rate": 5.252501500959829e-06, + "loss": 1.8405, + "mean_token_accuracy": 0.5769127607345581, + "num_tokens": 6890167511.0, + "step": 13478 + }, + { + "epoch": 3.644943212547323, + "grad_norm": 0.950020968914032, + "learning_rate": 5.251288433771453e-06, + "loss": 1.8159, + "mean_token_accuracy": 0.5863932967185974, + "num_tokens": 6890691779.0, + "step": 13479 + }, + { + "epoch": 3.6452136289886425, + "grad_norm": 0.9092922806739807, + "learning_rate": 5.250075542970452e-06, + "loss": 1.8588, + "mean_token_accuracy": 0.5714861750602722, + "num_tokens": 6891183372.0, + "step": 13480 + }, + { + "epoch": 3.645484045429962, + "grad_norm": 0.4264103174209595, + "learning_rate": 5.248862828594041e-06, + "loss": 1.0957, + "mean_token_accuracy": 0.7071883082389832, + "num_tokens": 6891686504.0, + "step": 13481 + }, + { + "epoch": 3.645754461871282, + "grad_norm": 1.2549927234649658, + "learning_rate": 5.2476502906794336e-06, + "loss": 1.7696, + "mean_token_accuracy": 0.5978254079818726, + "num_tokens": 6892159721.0, + "step": 13482 + }, + { + "epoch": 3.6460248783126015, + "grad_norm": 1.2697341442108154, + "learning_rate": 5.246437929263833e-06, + "loss": 1.8544, + "mean_token_accuracy": 0.5723851919174194, + "num_tokens": 6892628889.0, + "step": 13483 + }, + { + "epoch": 3.646295294753921, + "grad_norm": 1.002387285232544, + "learning_rate": 5.245225744384435e-06, + "loss": 1.8435, + "mean_token_accuracy": 0.5763797760009766, + "num_tokens": 6893153098.0, + "step": 13484 + }, + { + "epoch": 3.6465657111952408, + "grad_norm": 1.067728042602539, + "learning_rate": 5.244013736078433e-06, + "loss": 1.72, + "mean_token_accuracy": 0.5989631414413452, + "num_tokens": 6893667739.0, + "step": 13485 + }, + { + "epoch": 3.6468361276365604, + "grad_norm": 1.3116166591644287, + "learning_rate": 5.242801904383019e-06, + "loss": 1.8425, + "mean_token_accuracy": 0.5592607259750366, + "num_tokens": 6894192017.0, + "step": 13486 + }, + { + "epoch": 3.64710654407788, + "grad_norm": 1.1362217664718628, + "learning_rate": 5.241590249335371e-06, + "loss": 1.6501, + "mean_token_accuracy": 0.6311326026916504, + "num_tokens": 6894716218.0, + "step": 13487 + }, + { + "epoch": 3.6473769605191997, + "grad_norm": 1.0957046747207642, + "learning_rate": 5.240378770972671e-06, + "loss": 1.8657, + "mean_token_accuracy": 0.57732093334198, + "num_tokens": 6895202898.0, + "step": 13488 + }, + { + "epoch": 3.6476473769605193, + "grad_norm": 0.9435182213783264, + "learning_rate": 5.239167469332087e-06, + "loss": 1.9218, + "mean_token_accuracy": 0.5558091402053833, + "num_tokens": 6895703475.0, + "step": 13489 + }, + { + "epoch": 3.647917793401839, + "grad_norm": 1.0606180429458618, + "learning_rate": 5.23795634445079e-06, + "loss": 1.897, + "mean_token_accuracy": 0.5516510605812073, + "num_tokens": 6896227701.0, + "step": 13490 + }, + { + "epoch": 3.6481882098431586, + "grad_norm": 1.021232008934021, + "learning_rate": 5.236745396365935e-06, + "loss": 1.7524, + "mean_token_accuracy": 0.5897007584571838, + "num_tokens": 6896751816.0, + "step": 13491 + }, + { + "epoch": 3.6484586262844783, + "grad_norm": 1.0587542057037354, + "learning_rate": 5.235534625114678e-06, + "loss": 1.8924, + "mean_token_accuracy": 0.5651592016220093, + "num_tokens": 6897276100.0, + "step": 13492 + }, + { + "epoch": 3.6487290427257975, + "grad_norm": 0.9556188583374023, + "learning_rate": 5.2343240307341744e-06, + "loss": 1.8622, + "mean_token_accuracy": 0.5811073780059814, + "num_tokens": 6897800350.0, + "step": 13493 + }, + { + "epoch": 3.6489994591671175, + "grad_norm": 0.8958392143249512, + "learning_rate": 5.233113613261565e-06, + "loss": 1.6612, + "mean_token_accuracy": 0.6019203662872314, + "num_tokens": 6898324486.0, + "step": 13494 + }, + { + "epoch": 3.6492698756084367, + "grad_norm": 0.9850791096687317, + "learning_rate": 5.231903372733986e-06, + "loss": 1.8083, + "mean_token_accuracy": 0.5762773752212524, + "num_tokens": 6898847688.0, + "step": 13495 + }, + { + "epoch": 3.649540292049757, + "grad_norm": 0.8820663690567017, + "learning_rate": 5.2306933091885766e-06, + "loss": 1.8537, + "mean_token_accuracy": 0.5760172009468079, + "num_tokens": 6899371972.0, + "step": 13496 + }, + { + "epoch": 3.649810708491076, + "grad_norm": 0.8466354608535767, + "learning_rate": 5.229483422662464e-06, + "loss": 1.7569, + "mean_token_accuracy": 0.5824559926986694, + "num_tokens": 6899896151.0, + "step": 13497 + }, + { + "epoch": 3.650081124932396, + "grad_norm": 0.9647166132926941, + "learning_rate": 5.228273713192769e-06, + "loss": 1.9023, + "mean_token_accuracy": 0.5636168718338013, + "num_tokens": 6900414174.0, + "step": 13498 + }, + { + "epoch": 3.6503515413737153, + "grad_norm": 0.9944054484367371, + "learning_rate": 5.2270641808166105e-06, + "loss": 1.7815, + "mean_token_accuracy": 0.5965771675109863, + "num_tokens": 6900938370.0, + "step": 13499 + }, + { + "epoch": 3.6506219578150354, + "grad_norm": 0.9196586608886719, + "learning_rate": 5.225854825571097e-06, + "loss": 1.8437, + "mean_token_accuracy": 0.5564929246902466, + "num_tokens": 6901462597.0, + "step": 13500 + }, + { + "epoch": 3.6508923742563546, + "grad_norm": 0.3937259912490845, + "learning_rate": 5.224645647493338e-06, + "loss": 1.1415, + "mean_token_accuracy": 0.6933056116104126, + "num_tokens": 6901943482.0, + "step": 13501 + }, + { + "epoch": 3.6511627906976747, + "grad_norm": 1.1277494430541992, + "learning_rate": 5.223436646620438e-06, + "loss": 1.8194, + "mean_token_accuracy": 0.5807307958602905, + "num_tokens": 6902423343.0, + "step": 13502 + }, + { + "epoch": 3.651433207138994, + "grad_norm": 1.0271164178848267, + "learning_rate": 5.2222278229894896e-06, + "loss": 1.7519, + "mean_token_accuracy": 0.6032202243804932, + "num_tokens": 6902947612.0, + "step": 13503 + }, + { + "epoch": 3.6517036235803135, + "grad_norm": 0.994816243648529, + "learning_rate": 5.221019176637581e-06, + "loss": 1.7905, + "mean_token_accuracy": 0.590272068977356, + "num_tokens": 6903471895.0, + "step": 13504 + }, + { + "epoch": 3.651974040021633, + "grad_norm": 0.9369277954101562, + "learning_rate": 5.2198107076018e-06, + "loss": 1.7453, + "mean_token_accuracy": 0.5912014245986938, + "num_tokens": 6903960329.0, + "step": 13505 + }, + { + "epoch": 3.652244456462953, + "grad_norm": 1.0880192518234253, + "learning_rate": 5.218602415919225e-06, + "loss": 1.6712, + "mean_token_accuracy": 0.5840046405792236, + "num_tokens": 6904484437.0, + "step": 13506 + }, + { + "epoch": 3.6525148729042725, + "grad_norm": 0.9671480655670166, + "learning_rate": 5.217394301626928e-06, + "loss": 1.894, + "mean_token_accuracy": 0.5712128281593323, + "num_tokens": 6905008660.0, + "step": 13507 + }, + { + "epoch": 3.652785289345592, + "grad_norm": 0.9126370549201965, + "learning_rate": 5.216186364761982e-06, + "loss": 1.877, + "mean_token_accuracy": 0.568589448928833, + "num_tokens": 6905532781.0, + "step": 13508 + }, + { + "epoch": 3.6530557057869117, + "grad_norm": 0.8408381342887878, + "learning_rate": 5.214978605361447e-06, + "loss": 1.7514, + "mean_token_accuracy": 0.5921134352684021, + "num_tokens": 6906056958.0, + "step": 13509 + }, + { + "epoch": 3.6533261222282314, + "grad_norm": 0.8921473026275635, + "learning_rate": 5.213771023462379e-06, + "loss": 1.8901, + "mean_token_accuracy": 0.5701276659965515, + "num_tokens": 6906581080.0, + "step": 13510 + }, + { + "epoch": 3.653596538669551, + "grad_norm": 0.8927169442176819, + "learning_rate": 5.212563619101834e-06, + "loss": 1.7874, + "mean_token_accuracy": 0.5906028747558594, + "num_tokens": 6907047759.0, + "step": 13511 + }, + { + "epoch": 3.6538669551108707, + "grad_norm": 0.933320164680481, + "learning_rate": 5.2113563923168555e-06, + "loss": 1.9451, + "mean_token_accuracy": 0.5626516342163086, + "num_tokens": 6907557123.0, + "step": 13512 + }, + { + "epoch": 3.6541373715521903, + "grad_norm": 0.9713997840881348, + "learning_rate": 5.210149343144488e-06, + "loss": 1.8311, + "mean_token_accuracy": 0.5733429193496704, + "num_tokens": 6908081323.0, + "step": 13513 + }, + { + "epoch": 3.65440778799351, + "grad_norm": 0.8968984484672546, + "learning_rate": 5.208942471621763e-06, + "loss": 1.6192, + "mean_token_accuracy": 0.6268575191497803, + "num_tokens": 6908605558.0, + "step": 13514 + }, + { + "epoch": 3.6546782044348296, + "grad_norm": 0.9809255003929138, + "learning_rate": 5.207735777785714e-06, + "loss": 1.952, + "mean_token_accuracy": 0.5662626028060913, + "num_tokens": 6909129806.0, + "step": 13515 + }, + { + "epoch": 3.6549486208761492, + "grad_norm": 1.1080440282821655, + "learning_rate": 5.206529261673365e-06, + "loss": 1.8397, + "mean_token_accuracy": 0.5738874077796936, + "num_tokens": 6909565446.0, + "step": 13516 + }, + { + "epoch": 3.655219037317469, + "grad_norm": 0.8927583694458008, + "learning_rate": 5.205322923321738e-06, + "loss": 1.8052, + "mean_token_accuracy": 0.5780351161956787, + "num_tokens": 6910089725.0, + "step": 13517 + }, + { + "epoch": 3.6554894537587885, + "grad_norm": 1.2484421730041504, + "learning_rate": 5.204116762767842e-06, + "loss": 1.863, + "mean_token_accuracy": 0.5700837969779968, + "num_tokens": 6910613989.0, + "step": 13518 + }, + { + "epoch": 3.655759870200108, + "grad_norm": 0.9985647201538086, + "learning_rate": 5.202910780048691e-06, + "loss": 1.8047, + "mean_token_accuracy": 0.5788818001747131, + "num_tokens": 6911138231.0, + "step": 13519 + }, + { + "epoch": 3.656030286641428, + "grad_norm": 0.885099470615387, + "learning_rate": 5.201704975201287e-06, + "loss": 1.8738, + "mean_token_accuracy": 0.5656269788742065, + "num_tokens": 6911662398.0, + "step": 13520 + }, + { + "epoch": 3.6563007030827475, + "grad_norm": 0.3740660548210144, + "learning_rate": 5.20049934826262e-06, + "loss": 1.1018, + "mean_token_accuracy": 0.696925163269043, + "num_tokens": 6912174917.0, + "step": 13521 + }, + { + "epoch": 3.656571119524067, + "grad_norm": 1.1221017837524414, + "learning_rate": 5.199293899269694e-06, + "loss": 1.7032, + "mean_token_accuracy": 0.5816643238067627, + "num_tokens": 6912699087.0, + "step": 13522 + }, + { + "epoch": 3.6568415359653867, + "grad_norm": 1.097314715385437, + "learning_rate": 5.198088628259488e-06, + "loss": 1.827, + "mean_token_accuracy": 0.5821185111999512, + "num_tokens": 6913223213.0, + "step": 13523 + }, + { + "epoch": 3.6571119524067064, + "grad_norm": 1.071823239326477, + "learning_rate": 5.196883535268984e-06, + "loss": 1.8858, + "mean_token_accuracy": 0.5664528012275696, + "num_tokens": 6913747488.0, + "step": 13524 + }, + { + "epoch": 3.657382368848026, + "grad_norm": 1.0281808376312256, + "learning_rate": 5.195678620335159e-06, + "loss": 1.8861, + "mean_token_accuracy": 0.5709874629974365, + "num_tokens": 6914219848.0, + "step": 13525 + }, + { + "epoch": 3.6576527852893457, + "grad_norm": 1.072992205619812, + "learning_rate": 5.194473883494986e-06, + "loss": 1.8767, + "mean_token_accuracy": 0.5460449457168579, + "num_tokens": 6914706347.0, + "step": 13526 + }, + { + "epoch": 3.6579232017306653, + "grad_norm": 1.2180101871490479, + "learning_rate": 5.193269324785424e-06, + "loss": 1.9303, + "mean_token_accuracy": 0.573192298412323, + "num_tokens": 6915173253.0, + "step": 13527 + }, + { + "epoch": 3.658193618171985, + "grad_norm": 1.1628427505493164, + "learning_rate": 5.192064944243434e-06, + "loss": 1.9362, + "mean_token_accuracy": 0.5650902986526489, + "num_tokens": 6915697376.0, + "step": 13528 + }, + { + "epoch": 3.6584640346133046, + "grad_norm": 1.1004090309143066, + "learning_rate": 5.190860741905977e-06, + "loss": 1.875, + "mean_token_accuracy": 0.5631067752838135, + "num_tokens": 6916221513.0, + "step": 13529 + }, + { + "epoch": 3.6587344510546242, + "grad_norm": 1.069008708000183, + "learning_rate": 5.18965671780999e-06, + "loss": 1.7827, + "mean_token_accuracy": 0.5768405199050903, + "num_tokens": 6916745783.0, + "step": 13530 + }, + { + "epoch": 3.659004867495944, + "grad_norm": 1.032088041305542, + "learning_rate": 5.188452871992428e-06, + "loss": 1.8781, + "mean_token_accuracy": 0.5592154264450073, + "num_tokens": 6917269972.0, + "step": 13531 + }, + { + "epoch": 3.6592752839372635, + "grad_norm": 0.9926677346229553, + "learning_rate": 5.187249204490219e-06, + "loss": 1.7784, + "mean_token_accuracy": 0.5999516248703003, + "num_tokens": 6917736284.0, + "step": 13532 + }, + { + "epoch": 3.659545700378583, + "grad_norm": 0.9081203937530518, + "learning_rate": 5.186045715340299e-06, + "loss": 1.7628, + "mean_token_accuracy": 0.5936177372932434, + "num_tokens": 6918260564.0, + "step": 13533 + }, + { + "epoch": 3.6598161168199024, + "grad_norm": 1.0026729106903076, + "learning_rate": 5.184842404579595e-06, + "loss": 1.7806, + "mean_token_accuracy": 0.5869280099868774, + "num_tokens": 6918784823.0, + "step": 13534 + }, + { + "epoch": 3.6600865332612225, + "grad_norm": 1.1219779253005981, + "learning_rate": 5.183639272245025e-06, + "loss": 1.8057, + "mean_token_accuracy": 0.5791971683502197, + "num_tokens": 6919309034.0, + "step": 13535 + }, + { + "epoch": 3.6603569497025417, + "grad_norm": 1.205230712890625, + "learning_rate": 5.1824363183735106e-06, + "loss": 1.8512, + "mean_token_accuracy": 0.5741891860961914, + "num_tokens": 6919833253.0, + "step": 13536 + }, + { + "epoch": 3.6606273661438617, + "grad_norm": 1.246375560760498, + "learning_rate": 5.181233543001958e-06, + "loss": 1.7808, + "mean_token_accuracy": 0.5978439450263977, + "num_tokens": 6920357420.0, + "step": 13537 + }, + { + "epoch": 3.660897782585181, + "grad_norm": 0.9730942845344543, + "learning_rate": 5.180030946167269e-06, + "loss": 1.9148, + "mean_token_accuracy": 0.5608002543449402, + "num_tokens": 6920881668.0, + "step": 13538 + }, + { + "epoch": 3.661168199026501, + "grad_norm": 1.198219895362854, + "learning_rate": 5.178828527906347e-06, + "loss": 1.8703, + "mean_token_accuracy": 0.5870610475540161, + "num_tokens": 6921294287.0, + "step": 13539 + }, + { + "epoch": 3.6614386154678202, + "grad_norm": 1.415705919265747, + "learning_rate": 5.1776262882560876e-06, + "loss": 1.9529, + "mean_token_accuracy": 0.5628190636634827, + "num_tokens": 6921772631.0, + "step": 13540 + }, + { + "epoch": 3.6617090319091403, + "grad_norm": 0.3859040439128876, + "learning_rate": 5.176424227253374e-06, + "loss": 1.0128, + "mean_token_accuracy": 0.7267776727676392, + "num_tokens": 6922296904.0, + "step": 13541 + }, + { + "epoch": 3.6619794483504595, + "grad_norm": 1.3049858808517456, + "learning_rate": 5.175222344935091e-06, + "loss": 1.8734, + "mean_token_accuracy": 0.5719399452209473, + "num_tokens": 6922784695.0, + "step": 13542 + }, + { + "epoch": 3.6622498647917796, + "grad_norm": 1.1874165534973145, + "learning_rate": 5.174020641338114e-06, + "loss": 1.8, + "mean_token_accuracy": 0.5804314613342285, + "num_tokens": 6923308896.0, + "step": 13543 + }, + { + "epoch": 3.662520281233099, + "grad_norm": 0.8761705160140991, + "learning_rate": 5.172819116499319e-06, + "loss": 1.9559, + "mean_token_accuracy": 0.5475867986679077, + "num_tokens": 6923830183.0, + "step": 13544 + }, + { + "epoch": 3.6627906976744184, + "grad_norm": 0.8204560875892639, + "learning_rate": 5.17161777045557e-06, + "loss": 1.7875, + "mean_token_accuracy": 0.5820941925048828, + "num_tokens": 6924354442.0, + "step": 13545 + }, + { + "epoch": 3.663061114115738, + "grad_norm": 1.269394040107727, + "learning_rate": 5.1704166032437285e-06, + "loss": 1.7437, + "mean_token_accuracy": 0.5867626070976257, + "num_tokens": 6924878692.0, + "step": 13546 + }, + { + "epoch": 3.6633315305570577, + "grad_norm": 0.8861913084983826, + "learning_rate": 5.169215614900648e-06, + "loss": 1.6411, + "mean_token_accuracy": 0.6263667345046997, + "num_tokens": 6925402822.0, + "step": 13547 + }, + { + "epoch": 3.6636019469983774, + "grad_norm": 0.9488544464111328, + "learning_rate": 5.168014805463182e-06, + "loss": 1.8053, + "mean_token_accuracy": 0.5761873722076416, + "num_tokens": 6925870187.0, + "step": 13548 + }, + { + "epoch": 3.663872363439697, + "grad_norm": 0.8767250180244446, + "learning_rate": 5.166814174968171e-06, + "loss": 1.8915, + "mean_token_accuracy": 0.5785545110702515, + "num_tokens": 6926370420.0, + "step": 13549 + }, + { + "epoch": 3.6641427798810167, + "grad_norm": 1.0213884115219116, + "learning_rate": 5.165613723452454e-06, + "loss": 1.8326, + "mean_token_accuracy": 0.5889238119125366, + "num_tokens": 6926827856.0, + "step": 13550 + }, + { + "epoch": 3.6644131963223363, + "grad_norm": 1.0232049226760864, + "learning_rate": 5.16441345095287e-06, + "loss": 1.7354, + "mean_token_accuracy": 0.5932769179344177, + "num_tokens": 6927352089.0, + "step": 13551 + }, + { + "epoch": 3.664683612763656, + "grad_norm": 0.8946492075920105, + "learning_rate": 5.16321335750624e-06, + "loss": 1.7656, + "mean_token_accuracy": 0.5928372740745544, + "num_tokens": 6927876062.0, + "step": 13552 + }, + { + "epoch": 3.6649540292049756, + "grad_norm": 0.9616405367851257, + "learning_rate": 5.1620134431493874e-06, + "loss": 1.8904, + "mean_token_accuracy": 0.5640662908554077, + "num_tokens": 6928400228.0, + "step": 13553 + }, + { + "epoch": 3.6652244456462952, + "grad_norm": 1.0718306303024292, + "learning_rate": 5.160813707919132e-06, + "loss": 1.7392, + "mean_token_accuracy": 0.5653359889984131, + "num_tokens": 6928883064.0, + "step": 13554 + }, + { + "epoch": 3.665494862087615, + "grad_norm": 0.930067777633667, + "learning_rate": 5.159614151852284e-06, + "loss": 1.6951, + "mean_token_accuracy": 0.611255943775177, + "num_tokens": 6929407219.0, + "step": 13555 + }, + { + "epoch": 3.6657652785289345, + "grad_norm": 1.0931870937347412, + "learning_rate": 5.158414774985651e-06, + "loss": 1.8491, + "mean_token_accuracy": 0.5718845725059509, + "num_tokens": 6929931475.0, + "step": 13556 + }, + { + "epoch": 3.666035694970254, + "grad_norm": 0.9704825282096863, + "learning_rate": 5.157215577356027e-06, + "loss": 1.8649, + "mean_token_accuracy": 0.5772839784622192, + "num_tokens": 6930406282.0, + "step": 13557 + }, + { + "epoch": 3.666306111411574, + "grad_norm": 0.881348729133606, + "learning_rate": 5.156016559000214e-06, + "loss": 1.6762, + "mean_token_accuracy": 0.6022195816040039, + "num_tokens": 6930889737.0, + "step": 13558 + }, + { + "epoch": 3.6665765278528935, + "grad_norm": 0.9579763412475586, + "learning_rate": 5.154817719955001e-06, + "loss": 1.7793, + "mean_token_accuracy": 0.5727307796478271, + "num_tokens": 6931376533.0, + "step": 13559 + }, + { + "epoch": 3.666846944294213, + "grad_norm": 1.1757755279541016, + "learning_rate": 5.153619060257169e-06, + "loss": 1.9144, + "mean_token_accuracy": 0.5609602332115173, + "num_tokens": 6931900579.0, + "step": 13560 + }, + { + "epoch": 3.6671173607355327, + "grad_norm": 0.38682129979133606, + "learning_rate": 5.152420579943496e-06, + "loss": 1.1736, + "mean_token_accuracy": 0.680793285369873, + "num_tokens": 6932424787.0, + "step": 13561 + }, + { + "epoch": 3.6673877771768524, + "grad_norm": 1.1759151220321655, + "learning_rate": 5.151222279050758e-06, + "loss": 1.9793, + "mean_token_accuracy": 0.5434462428092957, + "num_tokens": 6932915820.0, + "step": 13562 + }, + { + "epoch": 3.667658193618172, + "grad_norm": 0.9875051379203796, + "learning_rate": 5.15002415761572e-06, + "loss": 1.8568, + "mean_token_accuracy": 0.5737266540527344, + "num_tokens": 6933440061.0, + "step": 13563 + }, + { + "epoch": 3.6679286100594917, + "grad_norm": 0.9380910396575928, + "learning_rate": 5.148826215675144e-06, + "loss": 1.8423, + "mean_token_accuracy": 0.5500203371047974, + "num_tokens": 6933964135.0, + "step": 13564 + }, + { + "epoch": 3.6681990265008113, + "grad_norm": 0.8632131218910217, + "learning_rate": 5.147628453265788e-06, + "loss": 1.8949, + "mean_token_accuracy": 0.5751442909240723, + "num_tokens": 6934432540.0, + "step": 13565 + }, + { + "epoch": 3.668469442942131, + "grad_norm": 0.9724780321121216, + "learning_rate": 5.146430870424402e-06, + "loss": 1.5694, + "mean_token_accuracy": 0.6086299419403076, + "num_tokens": 6934956617.0, + "step": 13566 + }, + { + "epoch": 3.6687398593834506, + "grad_norm": 0.9943839907646179, + "learning_rate": 5.1452334671877294e-06, + "loss": 1.8874, + "mean_token_accuracy": 0.5767539739608765, + "num_tokens": 6935480767.0, + "step": 13567 + }, + { + "epoch": 3.6690102758247702, + "grad_norm": 0.9569502472877502, + "learning_rate": 5.144036243592514e-06, + "loss": 1.8346, + "mean_token_accuracy": 0.5658608675003052, + "num_tokens": 6936005033.0, + "step": 13568 + }, + { + "epoch": 3.66928069226609, + "grad_norm": 0.8583930134773254, + "learning_rate": 5.1428391996754865e-06, + "loss": 1.8018, + "mean_token_accuracy": 0.5606353878974915, + "num_tokens": 6936529283.0, + "step": 13569 + }, + { + "epoch": 3.6695511087074095, + "grad_norm": 1.1501024961471558, + "learning_rate": 5.141642335473375e-06, + "loss": 1.9001, + "mean_token_accuracy": 0.5651754140853882, + "num_tokens": 6937051515.0, + "step": 13570 + }, + { + "epoch": 3.669821525148729, + "grad_norm": 0.9798660278320312, + "learning_rate": 5.140445651022906e-06, + "loss": 1.9164, + "mean_token_accuracy": 0.5698705911636353, + "num_tokens": 6937575798.0, + "step": 13571 + }, + { + "epoch": 3.670091941590049, + "grad_norm": 1.050820231437683, + "learning_rate": 5.139249146360799e-06, + "loss": 1.6396, + "mean_token_accuracy": 0.6261559724807739, + "num_tokens": 6938099995.0, + "step": 13572 + }, + { + "epoch": 3.6703623580313685, + "grad_norm": 0.9004116654396057, + "learning_rate": 5.13805282152376e-06, + "loss": 1.8476, + "mean_token_accuracy": 0.5799514055252075, + "num_tokens": 6938624267.0, + "step": 13573 + }, + { + "epoch": 3.670632774472688, + "grad_norm": 0.9974396824836731, + "learning_rate": 5.136856676548503e-06, + "loss": 1.8579, + "mean_token_accuracy": 0.5699845552444458, + "num_tokens": 6939148495.0, + "step": 13574 + }, + { + "epoch": 3.6709031909140073, + "grad_norm": 0.8965423107147217, + "learning_rate": 5.135660711471724e-06, + "loss": 1.7959, + "mean_token_accuracy": 0.58894282579422, + "num_tokens": 6939672680.0, + "step": 13575 + }, + { + "epoch": 3.6711736073553274, + "grad_norm": 0.8547603487968445, + "learning_rate": 5.134464926330118e-06, + "loss": 1.8646, + "mean_token_accuracy": 0.5516177415847778, + "num_tokens": 6940196894.0, + "step": 13576 + }, + { + "epoch": 3.6714440237966466, + "grad_norm": 0.9305587410926819, + "learning_rate": 5.13326932116038e-06, + "loss": 1.8551, + "mean_token_accuracy": 0.5654675960540771, + "num_tokens": 6940721167.0, + "step": 13577 + }, + { + "epoch": 3.6717144402379667, + "grad_norm": 0.8470267653465271, + "learning_rate": 5.132073895999193e-06, + "loss": 1.8795, + "mean_token_accuracy": 0.5632864236831665, + "num_tokens": 6941245434.0, + "step": 13578 + }, + { + "epoch": 3.671984856679286, + "grad_norm": 0.9406408667564392, + "learning_rate": 5.130878650883233e-06, + "loss": 1.6148, + "mean_token_accuracy": 0.6287785172462463, + "num_tokens": 6941769583.0, + "step": 13579 + }, + { + "epoch": 3.672255273120606, + "grad_norm": 1.08939790725708, + "learning_rate": 5.129683585849178e-06, + "loss": 1.8385, + "mean_token_accuracy": 0.594210684299469, + "num_tokens": 6942293760.0, + "step": 13580 + }, + { + "epoch": 3.672525689561925, + "grad_norm": 0.3667464852333069, + "learning_rate": 5.128488700933691e-06, + "loss": 1.0266, + "mean_token_accuracy": 0.7225257754325867, + "num_tokens": 6942817951.0, + "step": 13581 + }, + { + "epoch": 3.6727961060032452, + "grad_norm": 0.9649707674980164, + "learning_rate": 5.127293996173442e-06, + "loss": 1.7578, + "mean_token_accuracy": 0.5872824192047119, + "num_tokens": 6943342003.0, + "step": 13582 + }, + { + "epoch": 3.6730665224445644, + "grad_norm": 1.0058263540267944, + "learning_rate": 5.126099471605082e-06, + "loss": 1.8174, + "mean_token_accuracy": 0.5895134210586548, + "num_tokens": 6943857640.0, + "step": 13583 + }, + { + "epoch": 3.6733369388858845, + "grad_norm": 0.9833253622055054, + "learning_rate": 5.1249051272652606e-06, + "loss": 1.8364, + "mean_token_accuracy": 0.5886572599411011, + "num_tokens": 6944357870.0, + "step": 13584 + }, + { + "epoch": 3.6736073553272037, + "grad_norm": 0.9135982394218445, + "learning_rate": 5.123710963190631e-06, + "loss": 1.7925, + "mean_token_accuracy": 0.5757362842559814, + "num_tokens": 6944882147.0, + "step": 13585 + }, + { + "epoch": 3.6738777717685234, + "grad_norm": 0.9847008585929871, + "learning_rate": 5.122516979417829e-06, + "loss": 1.7834, + "mean_token_accuracy": 0.5856714248657227, + "num_tokens": 6945387669.0, + "step": 13586 + }, + { + "epoch": 3.674148188209843, + "grad_norm": 1.0108145475387573, + "learning_rate": 5.121323175983488e-06, + "loss": 1.9589, + "mean_token_accuracy": 0.5531339645385742, + "num_tokens": 6945911828.0, + "step": 13587 + }, + { + "epoch": 3.6744186046511627, + "grad_norm": 1.004315972328186, + "learning_rate": 5.120129552924244e-06, + "loss": 1.7978, + "mean_token_accuracy": 0.5819277167320251, + "num_tokens": 6946436073.0, + "step": 13588 + }, + { + "epoch": 3.6746890210924823, + "grad_norm": 0.9167796969413757, + "learning_rate": 5.118936110276717e-06, + "loss": 1.8649, + "mean_token_accuracy": 0.572604775428772, + "num_tokens": 6946960354.0, + "step": 13589 + }, + { + "epoch": 3.674959437533802, + "grad_norm": 1.094356656074524, + "learning_rate": 5.117742848077523e-06, + "loss": 1.8032, + "mean_token_accuracy": 0.5849728584289551, + "num_tokens": 6947484548.0, + "step": 13590 + }, + { + "epoch": 3.6752298539751216, + "grad_norm": 0.9678269028663635, + "learning_rate": 5.116549766363279e-06, + "loss": 1.7368, + "mean_token_accuracy": 0.6185859441757202, + "num_tokens": 6947937827.0, + "step": 13591 + }, + { + "epoch": 3.6755002704164412, + "grad_norm": 0.9367457628250122, + "learning_rate": 5.115356865170591e-06, + "loss": 1.919, + "mean_token_accuracy": 0.5572034120559692, + "num_tokens": 6948462079.0, + "step": 13592 + }, + { + "epoch": 3.675770686857761, + "grad_norm": 1.0373235940933228, + "learning_rate": 5.1141641445360554e-06, + "loss": 1.7785, + "mean_token_accuracy": 0.5993632078170776, + "num_tokens": 6948952222.0, + "step": 13593 + }, + { + "epoch": 3.6760411032990805, + "grad_norm": 1.0252467393875122, + "learning_rate": 5.112971604496278e-06, + "loss": 1.9374, + "mean_token_accuracy": 0.5531822443008423, + "num_tokens": 6949476499.0, + "step": 13594 + }, + { + "epoch": 3.6763115197404, + "grad_norm": 0.867193341255188, + "learning_rate": 5.111779245087845e-06, + "loss": 1.7428, + "mean_token_accuracy": 0.5830078721046448, + "num_tokens": 6950000745.0, + "step": 13595 + }, + { + "epoch": 3.67658193618172, + "grad_norm": 0.8940495252609253, + "learning_rate": 5.1105870663473345e-06, + "loss": 1.9116, + "mean_token_accuracy": 0.5780777931213379, + "num_tokens": 6950468787.0, + "step": 13596 + }, + { + "epoch": 3.6768523526230394, + "grad_norm": 1.2137972116470337, + "learning_rate": 5.109395068311338e-06, + "loss": 1.7989, + "mean_token_accuracy": 0.5818538665771484, + "num_tokens": 6950896860.0, + "step": 13597 + }, + { + "epoch": 3.677122769064359, + "grad_norm": 1.163974642753601, + "learning_rate": 5.10820325101642e-06, + "loss": 1.8718, + "mean_token_accuracy": 0.5611329078674316, + "num_tokens": 6951384517.0, + "step": 13598 + }, + { + "epoch": 3.6773931855056787, + "grad_norm": 0.9247601628303528, + "learning_rate": 5.107011614499158e-06, + "loss": 1.7495, + "mean_token_accuracy": 0.591759443283081, + "num_tokens": 6951883597.0, + "step": 13599 + }, + { + "epoch": 3.6776636019469984, + "grad_norm": 0.9894992709159851, + "learning_rate": 5.105820158796103e-06, + "loss": 1.8334, + "mean_token_accuracy": 0.5723408460617065, + "num_tokens": 6952407861.0, + "step": 13600 + }, + { + "epoch": 3.677934018388318, + "grad_norm": 0.3315412402153015, + "learning_rate": 5.104628883943825e-06, + "loss": 1.0552, + "mean_token_accuracy": 0.7114719152450562, + "num_tokens": 6952932008.0, + "step": 13601 + }, + { + "epoch": 3.6782044348296377, + "grad_norm": 0.9920164346694946, + "learning_rate": 5.103437789978867e-06, + "loss": 1.7341, + "mean_token_accuracy": 0.576834499835968, + "num_tokens": 6953456291.0, + "step": 13602 + }, + { + "epoch": 3.6784748512709573, + "grad_norm": 1.005969762802124, + "learning_rate": 5.10224687693778e-06, + "loss": 1.7816, + "mean_token_accuracy": 0.5891036987304688, + "num_tokens": 6953980570.0, + "step": 13603 + }, + { + "epoch": 3.678745267712277, + "grad_norm": 0.9249866008758545, + "learning_rate": 5.101056144857099e-06, + "loss": 1.8699, + "mean_token_accuracy": 0.579817533493042, + "num_tokens": 6954504710.0, + "step": 13604 + }, + { + "epoch": 3.6790156841535966, + "grad_norm": 0.7922179102897644, + "learning_rate": 5.0998655937733674e-06, + "loss": 1.8856, + "mean_token_accuracy": 0.5573389530181885, + "num_tokens": 6955028786.0, + "step": 13605 + }, + { + "epoch": 3.6792861005949162, + "grad_norm": 1.056899070739746, + "learning_rate": 5.098675223723112e-06, + "loss": 1.8778, + "mean_token_accuracy": 0.5629950761795044, + "num_tokens": 6955552946.0, + "step": 13606 + }, + { + "epoch": 3.679556517036236, + "grad_norm": 0.9822866916656494, + "learning_rate": 5.097485034742851e-06, + "loss": 1.8076, + "mean_token_accuracy": 0.5726259350776672, + "num_tokens": 6956077102.0, + "step": 13607 + }, + { + "epoch": 3.6798269334775555, + "grad_norm": 0.980965793132782, + "learning_rate": 5.09629502686911e-06, + "loss": 1.7512, + "mean_token_accuracy": 0.6042584180831909, + "num_tokens": 6956601184.0, + "step": 13608 + }, + { + "epoch": 3.680097349918875, + "grad_norm": 0.8179013133049011, + "learning_rate": 5.095105200138401e-06, + "loss": 1.877, + "mean_token_accuracy": 0.5877642631530762, + "num_tokens": 6957062790.0, + "step": 13609 + }, + { + "epoch": 3.680367766360195, + "grad_norm": 0.904940664768219, + "learning_rate": 5.093915554587225e-06, + "loss": 1.8125, + "mean_token_accuracy": 0.5858374238014221, + "num_tokens": 6957524102.0, + "step": 13610 + }, + { + "epoch": 3.6806381828015144, + "grad_norm": 0.8587949275970459, + "learning_rate": 5.092726090252091e-06, + "loss": 1.7579, + "mean_token_accuracy": 0.5816433429718018, + "num_tokens": 6958048368.0, + "step": 13611 + }, + { + "epoch": 3.680908599242834, + "grad_norm": 0.9852052927017212, + "learning_rate": 5.091536807169495e-06, + "loss": 1.864, + "mean_token_accuracy": 0.5690164566040039, + "num_tokens": 6958572554.0, + "step": 13612 + }, + { + "epoch": 3.6811790156841537, + "grad_norm": 0.8779730200767517, + "learning_rate": 5.09034770537592e-06, + "loss": 1.8825, + "mean_token_accuracy": 0.5750341415405273, + "num_tokens": 6959096741.0, + "step": 13613 + }, + { + "epoch": 3.6814494321254734, + "grad_norm": 1.039705753326416, + "learning_rate": 5.0891587849078574e-06, + "loss": 1.7407, + "mean_token_accuracy": 0.5867793560028076, + "num_tokens": 6959540300.0, + "step": 13614 + }, + { + "epoch": 3.681719848566793, + "grad_norm": 1.0140347480773926, + "learning_rate": 5.087970045801789e-06, + "loss": 1.933, + "mean_token_accuracy": 0.562792956829071, + "num_tokens": 6959995953.0, + "step": 13615 + }, + { + "epoch": 3.681990265008112, + "grad_norm": 1.0344059467315674, + "learning_rate": 5.086781488094183e-06, + "loss": 1.9258, + "mean_token_accuracy": 0.5614710450172424, + "num_tokens": 6960520137.0, + "step": 13616 + }, + { + "epoch": 3.6822606814494323, + "grad_norm": 0.9178361892700195, + "learning_rate": 5.085593111821512e-06, + "loss": 1.7863, + "mean_token_accuracy": 0.5922863483428955, + "num_tokens": 6960980804.0, + "step": 13617 + }, + { + "epoch": 3.6825310978907515, + "grad_norm": 1.0090702772140503, + "learning_rate": 5.084404917020239e-06, + "loss": 1.8764, + "mean_token_accuracy": 0.5832434892654419, + "num_tokens": 6961451433.0, + "step": 13618 + }, + { + "epoch": 3.6828015143320716, + "grad_norm": 0.8917234539985657, + "learning_rate": 5.083216903726818e-06, + "loss": 1.9382, + "mean_token_accuracy": 0.5642541646957397, + "num_tokens": 6961975653.0, + "step": 13619 + }, + { + "epoch": 3.683071930773391, + "grad_norm": 1.0614371299743652, + "learning_rate": 5.082029071977703e-06, + "loss": 1.7925, + "mean_token_accuracy": 0.5715300440788269, + "num_tokens": 6962499907.0, + "step": 13620 + }, + { + "epoch": 3.683342347214711, + "grad_norm": 0.36145079135894775, + "learning_rate": 5.080841421809342e-06, + "loss": 1.0823, + "mean_token_accuracy": 0.7143365144729614, + "num_tokens": 6962967614.0, + "step": 13621 + }, + { + "epoch": 3.68361276365603, + "grad_norm": 1.1782853603363037, + "learning_rate": 5.079653953258168e-06, + "loss": 1.9695, + "mean_token_accuracy": 0.5593626499176025, + "num_tokens": 6963491883.0, + "step": 13622 + }, + { + "epoch": 3.68388318009735, + "grad_norm": 1.1605216264724731, + "learning_rate": 5.078466666360624e-06, + "loss": 1.5608, + "mean_token_accuracy": 0.6450514793395996, + "num_tokens": 6964015973.0, + "step": 13623 + }, + { + "epoch": 3.6841535965386694, + "grad_norm": 0.9111449718475342, + "learning_rate": 5.07727956115314e-06, + "loss": 2.0404, + "mean_token_accuracy": 0.5374506115913391, + "num_tokens": 6964540179.0, + "step": 13624 + }, + { + "epoch": 3.6844240129799894, + "grad_norm": 0.911896288394928, + "learning_rate": 5.076092637672132e-06, + "loss": 1.887, + "mean_token_accuracy": 0.5703607797622681, + "num_tokens": 6965064402.0, + "step": 13625 + }, + { + "epoch": 3.6846944294213086, + "grad_norm": 1.1596119403839111, + "learning_rate": 5.074905895954025e-06, + "loss": 1.9246, + "mean_token_accuracy": 0.5493299961090088, + "num_tokens": 6965588590.0, + "step": 13626 + }, + { + "epoch": 3.6849648458626283, + "grad_norm": 1.0958645343780518, + "learning_rate": 5.073719336035227e-06, + "loss": 1.8317, + "mean_token_accuracy": 0.5730074644088745, + "num_tokens": 6966112835.0, + "step": 13627 + }, + { + "epoch": 3.685235262303948, + "grad_norm": 1.0147651433944702, + "learning_rate": 5.072532957952149e-06, + "loss": 1.8859, + "mean_token_accuracy": 0.5765625238418579, + "num_tokens": 6966637011.0, + "step": 13628 + }, + { + "epoch": 3.6855056787452676, + "grad_norm": 0.9008265733718872, + "learning_rate": 5.071346761741193e-06, + "loss": 1.9545, + "mean_token_accuracy": 0.543372631072998, + "num_tokens": 6967161231.0, + "step": 13629 + }, + { + "epoch": 3.685776095186587, + "grad_norm": 1.036881923675537, + "learning_rate": 5.070160747438752e-06, + "loss": 1.8402, + "mean_token_accuracy": 0.5791040658950806, + "num_tokens": 6967685458.0, + "step": 13630 + }, + { + "epoch": 3.686046511627907, + "grad_norm": 0.964052140712738, + "learning_rate": 5.068974915081219e-06, + "loss": 1.7261, + "mean_token_accuracy": 0.593734860420227, + "num_tokens": 6968188760.0, + "step": 13631 + }, + { + "epoch": 3.6863169280692265, + "grad_norm": 1.0025323629379272, + "learning_rate": 5.067789264704979e-06, + "loss": 1.8511, + "mean_token_accuracy": 0.5530598163604736, + "num_tokens": 6968713019.0, + "step": 13632 + }, + { + "epoch": 3.686587344510546, + "grad_norm": 0.8611606955528259, + "learning_rate": 5.066603796346406e-06, + "loss": 1.8425, + "mean_token_accuracy": 0.586344301700592, + "num_tokens": 6969237182.0, + "step": 13633 + }, + { + "epoch": 3.686857760951866, + "grad_norm": 0.9108320474624634, + "learning_rate": 5.065418510041882e-06, + "loss": 1.9044, + "mean_token_accuracy": 0.5725767612457275, + "num_tokens": 6969678389.0, + "step": 13634 + }, + { + "epoch": 3.6871281773931854, + "grad_norm": 0.7691665887832642, + "learning_rate": 5.064233405827768e-06, + "loss": 1.745, + "mean_token_accuracy": 0.6044671535491943, + "num_tokens": 6970189536.0, + "step": 13635 + }, + { + "epoch": 3.687398593834505, + "grad_norm": 0.8553891181945801, + "learning_rate": 5.063048483740427e-06, + "loss": 1.7648, + "mean_token_accuracy": 0.5925455689430237, + "num_tokens": 6970713757.0, + "step": 13636 + }, + { + "epoch": 3.6876690102758247, + "grad_norm": 0.9042280316352844, + "learning_rate": 5.061863743816222e-06, + "loss": 1.8063, + "mean_token_accuracy": 0.5808174014091492, + "num_tokens": 6971237933.0, + "step": 13637 + }, + { + "epoch": 3.6879394267171444, + "grad_norm": 0.7476990818977356, + "learning_rate": 5.060679186091499e-06, + "loss": 1.8606, + "mean_token_accuracy": 0.5608634352684021, + "num_tokens": 6971762202.0, + "step": 13638 + }, + { + "epoch": 3.688209843158464, + "grad_norm": 0.7992724776268005, + "learning_rate": 5.059494810602603e-06, + "loss": 1.813, + "mean_token_accuracy": 0.5657238364219666, + "num_tokens": 6972256559.0, + "step": 13639 + }, + { + "epoch": 3.6884802595997837, + "grad_norm": 0.9266980886459351, + "learning_rate": 5.058310617385879e-06, + "loss": 1.8682, + "mean_token_accuracy": 0.5622889995574951, + "num_tokens": 6972780665.0, + "step": 13640 + }, + { + "epoch": 3.6887506760411033, + "grad_norm": 0.36296185851097107, + "learning_rate": 5.057126606477656e-06, + "loss": 1.0901, + "mean_token_accuracy": 0.6977581977844238, + "num_tokens": 6973304950.0, + "step": 13641 + }, + { + "epoch": 3.689021092482423, + "grad_norm": 1.2442735433578491, + "learning_rate": 5.055942777914267e-06, + "loss": 1.9007, + "mean_token_accuracy": 0.5548762679100037, + "num_tokens": 6973829169.0, + "step": 13642 + }, + { + "epoch": 3.6892915089237426, + "grad_norm": 1.1184698343276978, + "learning_rate": 5.0547591317320325e-06, + "loss": 1.8515, + "mean_token_accuracy": 0.5732537508010864, + "num_tokens": 6974325252.0, + "step": 13643 + }, + { + "epoch": 3.6895619253650622, + "grad_norm": 0.9659637808799744, + "learning_rate": 5.0535756679672745e-06, + "loss": 1.7748, + "mean_token_accuracy": 0.5789352655410767, + "num_tokens": 6974849529.0, + "step": 13644 + }, + { + "epoch": 3.689832341806382, + "grad_norm": 0.797543466091156, + "learning_rate": 5.0523923866562975e-06, + "loss": 1.7221, + "mean_token_accuracy": 0.5907959938049316, + "num_tokens": 6975373712.0, + "step": 13645 + }, + { + "epoch": 3.6901027582477015, + "grad_norm": 0.9201709032058716, + "learning_rate": 5.051209287835417e-06, + "loss": 1.838, + "mean_token_accuracy": 0.5804543495178223, + "num_tokens": 6975882400.0, + "step": 13646 + }, + { + "epoch": 3.690373174689021, + "grad_norm": 0.9450700283050537, + "learning_rate": 5.0500263715409305e-06, + "loss": 1.8257, + "mean_token_accuracy": 0.589029848575592, + "num_tokens": 6976345687.0, + "step": 13647 + }, + { + "epoch": 3.690643591130341, + "grad_norm": 0.9001927971839905, + "learning_rate": 5.048843637809129e-06, + "loss": 1.8195, + "mean_token_accuracy": 0.5802105665206909, + "num_tokens": 6976869909.0, + "step": 13648 + }, + { + "epoch": 3.6909140075716604, + "grad_norm": 0.8507794141769409, + "learning_rate": 5.04766108667631e-06, + "loss": 1.8659, + "mean_token_accuracy": 0.5760903358459473, + "num_tokens": 6977360009.0, + "step": 13649 + }, + { + "epoch": 3.69118442401298, + "grad_norm": 0.841728687286377, + "learning_rate": 5.0464787181787485e-06, + "loss": 1.7329, + "mean_token_accuracy": 0.5892208218574524, + "num_tokens": 6977884128.0, + "step": 13650 + }, + { + "epoch": 3.6914548404542997, + "grad_norm": 1.00802743434906, + "learning_rate": 5.045296532352731e-06, + "loss": 1.8017, + "mean_token_accuracy": 0.5738056898117065, + "num_tokens": 6978408287.0, + "step": 13651 + }, + { + "epoch": 3.6917252568956194, + "grad_norm": 0.8343343138694763, + "learning_rate": 5.044114529234529e-06, + "loss": 1.8415, + "mean_token_accuracy": 0.5702703595161438, + "num_tokens": 6978932420.0, + "step": 13652 + }, + { + "epoch": 3.691995673336939, + "grad_norm": 1.0094358921051025, + "learning_rate": 5.042932708860405e-06, + "loss": 1.8076, + "mean_token_accuracy": 0.5898647308349609, + "num_tokens": 6979450386.0, + "step": 13653 + }, + { + "epoch": 3.6922660897782587, + "grad_norm": 0.9561346173286438, + "learning_rate": 5.041751071266627e-06, + "loss": 1.7889, + "mean_token_accuracy": 0.5933374166488647, + "num_tokens": 6979974603.0, + "step": 13654 + }, + { + "epoch": 3.6925365062195783, + "grad_norm": 1.0977811813354492, + "learning_rate": 5.04056961648945e-06, + "loss": 1.8324, + "mean_token_accuracy": 0.5799273252487183, + "num_tokens": 6980468823.0, + "step": 13655 + }, + { + "epoch": 3.692806922660898, + "grad_norm": 1.3713606595993042, + "learning_rate": 5.039388344565117e-06, + "loss": 1.7483, + "mean_token_accuracy": 0.568564772605896, + "num_tokens": 6980985150.0, + "step": 13656 + }, + { + "epoch": 3.693077339102217, + "grad_norm": 1.1939078569412231, + "learning_rate": 5.038207255529881e-06, + "loss": 1.9003, + "mean_token_accuracy": 0.5635782480239868, + "num_tokens": 6981463766.0, + "step": 13657 + }, + { + "epoch": 3.6933477555435372, + "grad_norm": 1.151882529258728, + "learning_rate": 5.03702634941998e-06, + "loss": 1.8256, + "mean_token_accuracy": 0.5841594338417053, + "num_tokens": 6981988039.0, + "step": 13658 + }, + { + "epoch": 3.6936181719848564, + "grad_norm": 1.119285225868225, + "learning_rate": 5.035845626271644e-06, + "loss": 1.7904, + "mean_token_accuracy": 0.5822559595108032, + "num_tokens": 6982512170.0, + "step": 13659 + }, + { + "epoch": 3.6938885884261765, + "grad_norm": 0.9394549131393433, + "learning_rate": 5.034665086121106e-06, + "loss": 1.829, + "mean_token_accuracy": 0.5793902277946472, + "num_tokens": 6983036318.0, + "step": 13660 + }, + { + "epoch": 3.6941590048674957, + "grad_norm": 0.35456517338752747, + "learning_rate": 5.033484729004588e-06, + "loss": 1.0717, + "mean_token_accuracy": 0.7149621248245239, + "num_tokens": 6983560514.0, + "step": 13661 + }, + { + "epoch": 3.694429421308816, + "grad_norm": 1.6255682706832886, + "learning_rate": 5.0323045549582995e-06, + "loss": 1.8552, + "mean_token_accuracy": 0.5784706473350525, + "num_tokens": 6984049426.0, + "step": 13662 + }, + { + "epoch": 3.694699837750135, + "grad_norm": 1.233172059059143, + "learning_rate": 5.031124564018462e-06, + "loss": 1.8407, + "mean_token_accuracy": 0.5827912092208862, + "num_tokens": 6984573597.0, + "step": 13663 + }, + { + "epoch": 3.694970254191455, + "grad_norm": 0.8981504440307617, + "learning_rate": 5.029944756221275e-06, + "loss": 1.7583, + "mean_token_accuracy": 0.5869849920272827, + "num_tokens": 6985097773.0, + "step": 13664 + }, + { + "epoch": 3.6952406706327743, + "grad_norm": 0.8009594678878784, + "learning_rate": 5.028765131602937e-06, + "loss": 1.89, + "mean_token_accuracy": 0.5762279629707336, + "num_tokens": 6985592019.0, + "step": 13665 + }, + { + "epoch": 3.6955110870740944, + "grad_norm": 1.18830144405365, + "learning_rate": 5.027585690199648e-06, + "loss": 1.8804, + "mean_token_accuracy": 0.5863892436027527, + "num_tokens": 6986082243.0, + "step": 13666 + }, + { + "epoch": 3.6957815035154136, + "grad_norm": 1.2440185546875, + "learning_rate": 5.026406432047593e-06, + "loss": 1.8214, + "mean_token_accuracy": 0.582201361656189, + "num_tokens": 6986596642.0, + "step": 13667 + }, + { + "epoch": 3.696051919956733, + "grad_norm": 1.0059210062026978, + "learning_rate": 5.0252273571829555e-06, + "loss": 1.7673, + "mean_token_accuracy": 0.5921694040298462, + "num_tokens": 6987103982.0, + "step": 13668 + }, + { + "epoch": 3.696322336398053, + "grad_norm": 0.946006178855896, + "learning_rate": 5.024048465641914e-06, + "loss": 1.7824, + "mean_token_accuracy": 0.5695121884346008, + "num_tokens": 6987628128.0, + "step": 13669 + }, + { + "epoch": 3.6965927528393725, + "grad_norm": 0.9643545746803284, + "learning_rate": 5.022869757460638e-06, + "loss": 1.8013, + "mean_token_accuracy": 0.5812686085700989, + "num_tokens": 6988152272.0, + "step": 13670 + }, + { + "epoch": 3.696863169280692, + "grad_norm": 0.8894075751304626, + "learning_rate": 5.0216912326752946e-06, + "loss": 1.7157, + "mean_token_accuracy": 0.6024537682533264, + "num_tokens": 6988618410.0, + "step": 13671 + }, + { + "epoch": 3.697133585722012, + "grad_norm": 1.0027192831039429, + "learning_rate": 5.020512891322047e-06, + "loss": 1.8398, + "mean_token_accuracy": 0.5676417946815491, + "num_tokens": 6989142661.0, + "step": 13672 + }, + { + "epoch": 3.6974040021633314, + "grad_norm": 0.873727560043335, + "learning_rate": 5.019334733437047e-06, + "loss": 1.7833, + "mean_token_accuracy": 0.5786263942718506, + "num_tokens": 6989666748.0, + "step": 13673 + }, + { + "epoch": 3.697674418604651, + "grad_norm": 1.0502327680587769, + "learning_rate": 5.018156759056448e-06, + "loss": 1.8957, + "mean_token_accuracy": 0.5572811961174011, + "num_tokens": 6990190845.0, + "step": 13674 + }, + { + "epoch": 3.6979448350459707, + "grad_norm": 1.173176646232605, + "learning_rate": 5.016978968216392e-06, + "loss": 1.9706, + "mean_token_accuracy": 0.540542483329773, + "num_tokens": 6990715126.0, + "step": 13675 + }, + { + "epoch": 3.6982152514872904, + "grad_norm": 1.0348825454711914, + "learning_rate": 5.015801360953012e-06, + "loss": 1.8105, + "mean_token_accuracy": 0.5832090973854065, + "num_tokens": 6991239405.0, + "step": 13676 + }, + { + "epoch": 3.69848566792861, + "grad_norm": 0.9000723958015442, + "learning_rate": 5.014623937302448e-06, + "loss": 1.9098, + "mean_token_accuracy": 0.5519853830337524, + "num_tokens": 6991763634.0, + "step": 13677 + }, + { + "epoch": 3.6987560843699296, + "grad_norm": 0.8594477772712708, + "learning_rate": 5.013446697300823e-06, + "loss": 1.749, + "mean_token_accuracy": 0.602413535118103, + "num_tokens": 6992287817.0, + "step": 13678 + }, + { + "epoch": 3.6990265008112493, + "grad_norm": 1.134766697883606, + "learning_rate": 5.012269640984258e-06, + "loss": 1.8949, + "mean_token_accuracy": 0.5602502822875977, + "num_tokens": 6992812039.0, + "step": 13679 + }, + { + "epoch": 3.699296917252569, + "grad_norm": 1.1231025457382202, + "learning_rate": 5.011092768388869e-06, + "loss": 1.8365, + "mean_token_accuracy": 0.5879095792770386, + "num_tokens": 6993336263.0, + "step": 13680 + }, + { + "epoch": 3.6995673336938886, + "grad_norm": 0.3815806806087494, + "learning_rate": 5.009916079550768e-06, + "loss": 1.1221, + "mean_token_accuracy": 0.7068638801574707, + "num_tokens": 6993860438.0, + "step": 13681 + }, + { + "epoch": 3.699837750135208, + "grad_norm": 0.9988996982574463, + "learning_rate": 5.008739574506053e-06, + "loss": 1.8815, + "mean_token_accuracy": 0.577289342880249, + "num_tokens": 6994384636.0, + "step": 13682 + }, + { + "epoch": 3.700108166576528, + "grad_norm": 0.916825532913208, + "learning_rate": 5.007563253290831e-06, + "loss": 1.8144, + "mean_token_accuracy": 0.5480862259864807, + "num_tokens": 6994908911.0, + "step": 13683 + }, + { + "epoch": 3.7003785830178475, + "grad_norm": 0.9101539850234985, + "learning_rate": 5.006387115941188e-06, + "loss": 1.9155, + "mean_token_accuracy": 0.5589759349822998, + "num_tokens": 6995421939.0, + "step": 13684 + }, + { + "epoch": 3.700648999459167, + "grad_norm": 0.9657363891601562, + "learning_rate": 5.0052111624932175e-06, + "loss": 1.8353, + "mean_token_accuracy": 0.5756689310073853, + "num_tokens": 6995946113.0, + "step": 13685 + }, + { + "epoch": 3.700919415900487, + "grad_norm": 1.0109665393829346, + "learning_rate": 5.004035392982994e-06, + "loss": 1.9579, + "mean_token_accuracy": 0.5480287671089172, + "num_tokens": 6996470300.0, + "step": 13686 + }, + { + "epoch": 3.7011898323418064, + "grad_norm": 0.9839174151420593, + "learning_rate": 5.002859807446602e-06, + "loss": 1.8119, + "mean_token_accuracy": 0.598873496055603, + "num_tokens": 6996917275.0, + "step": 13687 + }, + { + "epoch": 3.701460248783126, + "grad_norm": 1.0553940534591675, + "learning_rate": 5.001684405920104e-06, + "loss": 1.8914, + "mean_token_accuracy": 0.5583367347717285, + "num_tokens": 6997413600.0, + "step": 13688 + }, + { + "epoch": 3.7017306652244457, + "grad_norm": 1.1813396215438843, + "learning_rate": 5.000509188439571e-06, + "loss": 1.8366, + "mean_token_accuracy": 0.5800303220748901, + "num_tokens": 6997937823.0, + "step": 13689 + }, + { + "epoch": 3.7020010816657654, + "grad_norm": 0.9530778527259827, + "learning_rate": 4.99933415504106e-06, + "loss": 1.9201, + "mean_token_accuracy": 0.5693080425262451, + "num_tokens": 6998461862.0, + "step": 13690 + }, + { + "epoch": 3.702271498107085, + "grad_norm": 1.091770052909851, + "learning_rate": 4.9981593057606194e-06, + "loss": 1.7783, + "mean_token_accuracy": 0.5879835486412048, + "num_tokens": 6998986091.0, + "step": 13691 + }, + { + "epoch": 3.7025419145484046, + "grad_norm": 1.1736633777618408, + "learning_rate": 4.996984640634306e-06, + "loss": 1.8118, + "mean_token_accuracy": 0.5847417116165161, + "num_tokens": 6999510288.0, + "step": 13692 + }, + { + "epoch": 3.7028123309897243, + "grad_norm": 0.9862314462661743, + "learning_rate": 4.995810159698157e-06, + "loss": 1.6595, + "mean_token_accuracy": 0.6186585426330566, + "num_tokens": 7000034552.0, + "step": 13693 + }, + { + "epoch": 3.703082747431044, + "grad_norm": 0.9896219372749329, + "learning_rate": 4.994635862988207e-06, + "loss": 1.7854, + "mean_token_accuracy": 0.5764821767807007, + "num_tokens": 7000558688.0, + "step": 13694 + }, + { + "epoch": 3.7033531638723636, + "grad_norm": 0.9220229387283325, + "learning_rate": 4.993461750540493e-06, + "loss": 1.8162, + "mean_token_accuracy": 0.5880948901176453, + "num_tokens": 7001082875.0, + "step": 13695 + }, + { + "epoch": 3.703623580313683, + "grad_norm": 1.1412019729614258, + "learning_rate": 4.9922878223910325e-06, + "loss": 1.7149, + "mean_token_accuracy": 0.5996349453926086, + "num_tokens": 7001607072.0, + "step": 13696 + }, + { + "epoch": 3.703893996755003, + "grad_norm": 0.993007481098175, + "learning_rate": 4.991114078575852e-06, + "loss": 1.9323, + "mean_token_accuracy": 0.5729915499687195, + "num_tokens": 7002127569.0, + "step": 13697 + }, + { + "epoch": 3.7041644131963225, + "grad_norm": 0.9354172348976135, + "learning_rate": 4.98994051913096e-06, + "loss": 1.8671, + "mean_token_accuracy": 0.5869411826133728, + "num_tokens": 7002611143.0, + "step": 13698 + }, + { + "epoch": 3.704434829637642, + "grad_norm": 0.8967642784118652, + "learning_rate": 4.9887671440923705e-06, + "loss": 1.8574, + "mean_token_accuracy": 0.5731035470962524, + "num_tokens": 7003135426.0, + "step": 13699 + }, + { + "epoch": 3.7047052460789613, + "grad_norm": 1.0558717250823975, + "learning_rate": 4.987593953496079e-06, + "loss": 1.891, + "mean_token_accuracy": 0.5700706243515015, + "num_tokens": 7003659701.0, + "step": 13700 + }, + { + "epoch": 3.7049756625202814, + "grad_norm": 0.40866559743881226, + "learning_rate": 4.98642094737809e-06, + "loss": 1.148, + "mean_token_accuracy": 0.6864534616470337, + "num_tokens": 7004183820.0, + "step": 13701 + }, + { + "epoch": 3.7052460789616006, + "grad_norm": 1.1230276823043823, + "learning_rate": 4.985248125774387e-06, + "loss": 1.8588, + "mean_token_accuracy": 0.5782259702682495, + "num_tokens": 7004665642.0, + "step": 13702 + }, + { + "epoch": 3.7055164954029207, + "grad_norm": 0.904096782207489, + "learning_rate": 4.984075488720964e-06, + "loss": 1.847, + "mean_token_accuracy": 0.5792801976203918, + "num_tokens": 7005189918.0, + "step": 13703 + }, + { + "epoch": 3.70578691184424, + "grad_norm": 0.9086605310440063, + "learning_rate": 4.982903036253797e-06, + "loss": 1.8041, + "mean_token_accuracy": 0.5858637094497681, + "num_tokens": 7005690861.0, + "step": 13704 + }, + { + "epoch": 3.70605732828556, + "grad_norm": 0.9673711061477661, + "learning_rate": 4.981730768408857e-06, + "loss": 1.8277, + "mean_token_accuracy": 0.5942520499229431, + "num_tokens": 7006188414.0, + "step": 13705 + }, + { + "epoch": 3.706327744726879, + "grad_norm": 0.9461871981620789, + "learning_rate": 4.980558685222118e-06, + "loss": 1.8477, + "mean_token_accuracy": 0.578052282333374, + "num_tokens": 7006712499.0, + "step": 13706 + }, + { + "epoch": 3.7065981611681993, + "grad_norm": 1.0923787355422974, + "learning_rate": 4.9793867867295404e-06, + "loss": 1.8457, + "mean_token_accuracy": 0.5523144006729126, + "num_tokens": 7007236709.0, + "step": 13707 + }, + { + "epoch": 3.7068685776095185, + "grad_norm": 0.9526399970054626, + "learning_rate": 4.978215072967078e-06, + "loss": 1.8107, + "mean_token_accuracy": 0.5860082507133484, + "num_tokens": 7007760920.0, + "step": 13708 + }, + { + "epoch": 3.707138994050838, + "grad_norm": 0.9403485655784607, + "learning_rate": 4.9770435439706905e-06, + "loss": 1.8131, + "mean_token_accuracy": 0.5836929082870483, + "num_tokens": 7008271299.0, + "step": 13709 + }, + { + "epoch": 3.7074094104921578, + "grad_norm": 1.0013116598129272, + "learning_rate": 4.975872199776319e-06, + "loss": 1.8555, + "mean_token_accuracy": 0.5922985076904297, + "num_tokens": 7008688764.0, + "step": 13710 + }, + { + "epoch": 3.7076798269334774, + "grad_norm": 0.9336028099060059, + "learning_rate": 4.974701040419902e-06, + "loss": 1.8989, + "mean_token_accuracy": 0.5615508556365967, + "num_tokens": 7009212804.0, + "step": 13711 + }, + { + "epoch": 3.707950243374797, + "grad_norm": 1.0424108505249023, + "learning_rate": 4.973530065937377e-06, + "loss": 1.6485, + "mean_token_accuracy": 0.6244207620620728, + "num_tokens": 7009672835.0, + "step": 13712 + }, + { + "epoch": 3.7082206598161167, + "grad_norm": 0.8925424814224243, + "learning_rate": 4.972359276364671e-06, + "loss": 1.8223, + "mean_token_accuracy": 0.5846470594406128, + "num_tokens": 7010142274.0, + "step": 13713 + }, + { + "epoch": 3.7084910762574363, + "grad_norm": 1.366498351097107, + "learning_rate": 4.971188671737708e-06, + "loss": 1.7727, + "mean_token_accuracy": 0.57879638671875, + "num_tokens": 7010666521.0, + "step": 13714 + }, + { + "epoch": 3.708761492698756, + "grad_norm": 1.0924173593521118, + "learning_rate": 4.970018252092409e-06, + "loss": 1.7748, + "mean_token_accuracy": 0.5904451608657837, + "num_tokens": 7011190677.0, + "step": 13715 + }, + { + "epoch": 3.7090319091400756, + "grad_norm": 1.2595844268798828, + "learning_rate": 4.968848017464684e-06, + "loss": 1.9185, + "mean_token_accuracy": 0.5801892280578613, + "num_tokens": 7011653272.0, + "step": 13716 + }, + { + "epoch": 3.7093023255813953, + "grad_norm": 1.016823410987854, + "learning_rate": 4.967677967890435e-06, + "loss": 1.7498, + "mean_token_accuracy": 0.5960004925727844, + "num_tokens": 7012142924.0, + "step": 13717 + }, + { + "epoch": 3.709572742022715, + "grad_norm": 0.9355729222297668, + "learning_rate": 4.966508103405568e-06, + "loss": 1.8612, + "mean_token_accuracy": 0.5716671347618103, + "num_tokens": 7012667071.0, + "step": 13718 + }, + { + "epoch": 3.7098431584640346, + "grad_norm": 0.8077888488769531, + "learning_rate": 4.965338424045974e-06, + "loss": 1.7411, + "mean_token_accuracy": 0.5900735259056091, + "num_tokens": 7013191335.0, + "step": 13719 + }, + { + "epoch": 3.710113574905354, + "grad_norm": 0.9447668194770813, + "learning_rate": 4.9641689298475455e-06, + "loss": 1.7417, + "mean_token_accuracy": 0.5928274989128113, + "num_tokens": 7013715574.0, + "step": 13720 + }, + { + "epoch": 3.710383991346674, + "grad_norm": 0.34451499581336975, + "learning_rate": 4.962999620846165e-06, + "loss": 1.1142, + "mean_token_accuracy": 0.7079804539680481, + "num_tokens": 7014239821.0, + "step": 13721 + }, + { + "epoch": 3.7106544077879935, + "grad_norm": 1.288529396057129, + "learning_rate": 4.9618304970777064e-06, + "loss": 1.8557, + "mean_token_accuracy": 0.5634313225746155, + "num_tokens": 7014713612.0, + "step": 13722 + }, + { + "epoch": 3.710924824229313, + "grad_norm": 1.1953034400939941, + "learning_rate": 4.9606615585780495e-06, + "loss": 1.8046, + "mean_token_accuracy": 0.5775086879730225, + "num_tokens": 7015237668.0, + "step": 13723 + }, + { + "epoch": 3.7111952406706328, + "grad_norm": 0.9278761744499207, + "learning_rate": 4.959492805383056e-06, + "loss": 1.8419, + "mean_token_accuracy": 0.5749200582504272, + "num_tokens": 7015761921.0, + "step": 13724 + }, + { + "epoch": 3.7114656571119524, + "grad_norm": 0.9193712472915649, + "learning_rate": 4.958324237528584e-06, + "loss": 1.8492, + "mean_token_accuracy": 0.5724509358406067, + "num_tokens": 7016285984.0, + "step": 13725 + }, + { + "epoch": 3.711736073553272, + "grad_norm": 1.0651969909667969, + "learning_rate": 4.957155855050494e-06, + "loss": 1.7377, + "mean_token_accuracy": 0.5968495607376099, + "num_tokens": 7016810151.0, + "step": 13726 + }, + { + "epoch": 3.7120064899945917, + "grad_norm": 0.9987075328826904, + "learning_rate": 4.955987657984632e-06, + "loss": 1.91, + "mean_token_accuracy": 0.5674829483032227, + "num_tokens": 7017334377.0, + "step": 13727 + }, + { + "epoch": 3.7122769064359114, + "grad_norm": 0.9344524145126343, + "learning_rate": 4.954819646366842e-06, + "loss": 1.8557, + "mean_token_accuracy": 0.5624335408210754, + "num_tokens": 7017858565.0, + "step": 13728 + }, + { + "epoch": 3.712547322877231, + "grad_norm": 0.9609349966049194, + "learning_rate": 4.953651820232965e-06, + "loss": 1.7314, + "mean_token_accuracy": 0.5909098386764526, + "num_tokens": 7018372035.0, + "step": 13729 + }, + { + "epoch": 3.7128177393185506, + "grad_norm": 0.9056394696235657, + "learning_rate": 4.952484179618832e-06, + "loss": 1.7077, + "mean_token_accuracy": 0.5767110586166382, + "num_tokens": 7018896173.0, + "step": 13730 + }, + { + "epoch": 3.7130881557598703, + "grad_norm": 1.0647997856140137, + "learning_rate": 4.951316724560267e-06, + "loss": 1.8107, + "mean_token_accuracy": 0.5593236684799194, + "num_tokens": 7019420330.0, + "step": 13731 + }, + { + "epoch": 3.71335857220119, + "grad_norm": 0.9800715446472168, + "learning_rate": 4.9501494550930955e-06, + "loss": 1.7104, + "mean_token_accuracy": 0.5956348180770874, + "num_tokens": 7019936479.0, + "step": 13732 + }, + { + "epoch": 3.7136289886425096, + "grad_norm": 0.8980493545532227, + "learning_rate": 4.948982371253129e-06, + "loss": 1.7531, + "mean_token_accuracy": 0.5964902639389038, + "num_tokens": 7020439110.0, + "step": 13733 + }, + { + "epoch": 3.713899405083829, + "grad_norm": 0.8822407126426697, + "learning_rate": 4.947815473076176e-06, + "loss": 1.8309, + "mean_token_accuracy": 0.5714002847671509, + "num_tokens": 7020934636.0, + "step": 13734 + }, + { + "epoch": 3.714169821525149, + "grad_norm": 0.9218325018882751, + "learning_rate": 4.946648760598046e-06, + "loss": 1.7644, + "mean_token_accuracy": 0.5880842208862305, + "num_tokens": 7021458867.0, + "step": 13735 + }, + { + "epoch": 3.7144402379664685, + "grad_norm": 1.2020455598831177, + "learning_rate": 4.945482233854533e-06, + "loss": 1.8096, + "mean_token_accuracy": 0.5946060419082642, + "num_tokens": 7021937327.0, + "step": 13736 + }, + { + "epoch": 3.714710654407788, + "grad_norm": 0.9679661393165588, + "learning_rate": 4.9443158928814274e-06, + "loss": 1.8184, + "mean_token_accuracy": 0.5840022563934326, + "num_tokens": 7022461551.0, + "step": 13737 + }, + { + "epoch": 3.714981070849108, + "grad_norm": 0.9790723323822021, + "learning_rate": 4.943149737714523e-06, + "loss": 1.8448, + "mean_token_accuracy": 0.5874118804931641, + "num_tokens": 7022880501.0, + "step": 13738 + }, + { + "epoch": 3.7152514872904274, + "grad_norm": 1.0328830480575562, + "learning_rate": 4.941983768389595e-06, + "loss": 1.9153, + "mean_token_accuracy": 0.5603481531143188, + "num_tokens": 7023349089.0, + "step": 13739 + }, + { + "epoch": 3.715521903731747, + "grad_norm": 0.9648366570472717, + "learning_rate": 4.940817984942418e-06, + "loss": 1.8556, + "mean_token_accuracy": 0.5805891156196594, + "num_tokens": 7023868059.0, + "step": 13740 + }, + { + "epoch": 3.7157923201730663, + "grad_norm": 0.34694886207580566, + "learning_rate": 4.939652387408765e-06, + "loss": 1.1407, + "mean_token_accuracy": 0.6945446729660034, + "num_tokens": 7024392209.0, + "step": 13741 + }, + { + "epoch": 3.7160627366143864, + "grad_norm": 0.9024336934089661, + "learning_rate": 4.938486975824401e-06, + "loss": 1.716, + "mean_token_accuracy": 0.6017841100692749, + "num_tokens": 7024916402.0, + "step": 13742 + }, + { + "epoch": 3.7163331530557056, + "grad_norm": 1.017695426940918, + "learning_rate": 4.93732175022508e-06, + "loss": 1.8812, + "mean_token_accuracy": 0.5598872900009155, + "num_tokens": 7025440684.0, + "step": 13743 + }, + { + "epoch": 3.7166035694970256, + "grad_norm": 1.0262031555175781, + "learning_rate": 4.936156710646558e-06, + "loss": 1.7713, + "mean_token_accuracy": 0.5815160870552063, + "num_tokens": 7025964859.0, + "step": 13744 + }, + { + "epoch": 3.716873985938345, + "grad_norm": 0.8356749415397644, + "learning_rate": 4.9349918571245805e-06, + "loss": 1.9037, + "mean_token_accuracy": 0.558671236038208, + "num_tokens": 7026489137.0, + "step": 13745 + }, + { + "epoch": 3.717144402379665, + "grad_norm": 1.0872693061828613, + "learning_rate": 4.93382718969489e-06, + "loss": 1.7889, + "mean_token_accuracy": 0.5877393484115601, + "num_tokens": 7027013170.0, + "step": 13746 + }, + { + "epoch": 3.717414818820984, + "grad_norm": 0.9718643426895142, + "learning_rate": 4.932662708393222e-06, + "loss": 1.8994, + "mean_token_accuracy": 0.5652240514755249, + "num_tokens": 7027537321.0, + "step": 13747 + }, + { + "epoch": 3.717685235262304, + "grad_norm": 1.0710628032684326, + "learning_rate": 4.931498413255301e-06, + "loss": 1.9123, + "mean_token_accuracy": 0.56547611951828, + "num_tokens": 7028053183.0, + "step": 13748 + }, + { + "epoch": 3.7179556517036234, + "grad_norm": 2.187169075012207, + "learning_rate": 4.930334304316859e-06, + "loss": 1.6772, + "mean_token_accuracy": 0.6240634918212891, + "num_tokens": 7028577460.0, + "step": 13749 + }, + { + "epoch": 3.718226068144943, + "grad_norm": 1.1083961725234985, + "learning_rate": 4.9291703816136095e-06, + "loss": 1.6979, + "mean_token_accuracy": 0.6100989580154419, + "num_tokens": 7029101622.0, + "step": 13750 + }, + { + "epoch": 3.7184964845862627, + "grad_norm": 1.052690863609314, + "learning_rate": 4.928006645181263e-06, + "loss": 1.7626, + "mean_token_accuracy": 0.5597257614135742, + "num_tokens": 7029625842.0, + "step": 13751 + }, + { + "epoch": 3.7187669010275823, + "grad_norm": 0.999033510684967, + "learning_rate": 4.9268430950555325e-06, + "loss": 1.8618, + "mean_token_accuracy": 0.583189070224762, + "num_tokens": 7030150127.0, + "step": 13752 + }, + { + "epoch": 3.719037317468902, + "grad_norm": 0.8790379166603088, + "learning_rate": 4.925679731272115e-06, + "loss": 1.7453, + "mean_token_accuracy": 0.595814049243927, + "num_tokens": 7030674139.0, + "step": 13753 + }, + { + "epoch": 3.7193077339102216, + "grad_norm": 1.0156748294830322, + "learning_rate": 4.924516553866705e-06, + "loss": 1.8541, + "mean_token_accuracy": 0.5471197366714478, + "num_tokens": 7031175701.0, + "step": 13754 + }, + { + "epoch": 3.7195781503515413, + "grad_norm": 0.935508668422699, + "learning_rate": 4.923353562874995e-06, + "loss": 1.7519, + "mean_token_accuracy": 0.5961707830429077, + "num_tokens": 7031699937.0, + "step": 13755 + }, + { + "epoch": 3.719848566792861, + "grad_norm": 0.8456510305404663, + "learning_rate": 4.922190758332667e-06, + "loss": 1.9431, + "mean_token_accuracy": 0.5615063309669495, + "num_tokens": 7032224218.0, + "step": 13756 + }, + { + "epoch": 3.7201189832341806, + "grad_norm": 0.9800190925598145, + "learning_rate": 4.921028140275399e-06, + "loss": 1.8619, + "mean_token_accuracy": 0.5533725619316101, + "num_tokens": 7032748449.0, + "step": 13757 + }, + { + "epoch": 3.7203893996755, + "grad_norm": 0.9210811853408813, + "learning_rate": 4.919865708738868e-06, + "loss": 1.8263, + "mean_token_accuracy": 0.5812298059463501, + "num_tokens": 7033272632.0, + "step": 13758 + }, + { + "epoch": 3.72065981611682, + "grad_norm": 0.9827813506126404, + "learning_rate": 4.918703463758738e-06, + "loss": 1.8283, + "mean_token_accuracy": 0.5920089483261108, + "num_tokens": 7033796829.0, + "step": 13759 + }, + { + "epoch": 3.7209302325581395, + "grad_norm": 0.849932849407196, + "learning_rate": 4.917541405370666e-06, + "loss": 1.8385, + "mean_token_accuracy": 0.5807818174362183, + "num_tokens": 7034320996.0, + "step": 13760 + }, + { + "epoch": 3.721200648999459, + "grad_norm": 0.3909761905670166, + "learning_rate": 4.916379533610315e-06, + "loss": 1.0977, + "mean_token_accuracy": 0.7122905254364014, + "num_tokens": 7034753196.0, + "step": 13761 + }, + { + "epoch": 3.7214710654407788, + "grad_norm": 0.9515851736068726, + "learning_rate": 4.915217848513329e-06, + "loss": 1.8921, + "mean_token_accuracy": 0.5629100799560547, + "num_tokens": 7035277366.0, + "step": 13762 + }, + { + "epoch": 3.7217414818820984, + "grad_norm": 0.9649173021316528, + "learning_rate": 4.914056350115352e-06, + "loss": 1.824, + "mean_token_accuracy": 0.585737407207489, + "num_tokens": 7035801546.0, + "step": 13763 + }, + { + "epoch": 3.722011898323418, + "grad_norm": 0.8427545428276062, + "learning_rate": 4.912895038452026e-06, + "loss": 1.9302, + "mean_token_accuracy": 0.5661036968231201, + "num_tokens": 7036325808.0, + "step": 13764 + }, + { + "epoch": 3.7222823147647377, + "grad_norm": 0.7889971733093262, + "learning_rate": 4.9117339135589795e-06, + "loss": 1.8005, + "mean_token_accuracy": 0.5792432427406311, + "num_tokens": 7036849989.0, + "step": 13765 + }, + { + "epoch": 3.7225527312060573, + "grad_norm": 1.153139591217041, + "learning_rate": 4.910572975471842e-06, + "loss": 1.8089, + "mean_token_accuracy": 0.600898802280426, + "num_tokens": 7037303551.0, + "step": 13766 + }, + { + "epoch": 3.722823147647377, + "grad_norm": 0.9190277457237244, + "learning_rate": 4.909412224226235e-06, + "loss": 1.9347, + "mean_token_accuracy": 0.5742157697677612, + "num_tokens": 7037827774.0, + "step": 13767 + }, + { + "epoch": 3.7230935640886966, + "grad_norm": 1.0482845306396484, + "learning_rate": 4.908251659857769e-06, + "loss": 1.8488, + "mean_token_accuracy": 0.5607557892799377, + "num_tokens": 7038351894.0, + "step": 13768 + }, + { + "epoch": 3.7233639805300163, + "grad_norm": 0.9766481518745422, + "learning_rate": 4.90709128240206e-06, + "loss": 1.7693, + "mean_token_accuracy": 0.5869982242584229, + "num_tokens": 7038876143.0, + "step": 13769 + }, + { + "epoch": 3.723634396971336, + "grad_norm": 0.9425657391548157, + "learning_rate": 4.905931091894705e-06, + "loss": 1.7708, + "mean_token_accuracy": 0.575542688369751, + "num_tokens": 7039400369.0, + "step": 13770 + }, + { + "epoch": 3.7239048134126556, + "grad_norm": 0.917873203754425, + "learning_rate": 4.9047710883713055e-06, + "loss": 1.9925, + "mean_token_accuracy": 0.5573249459266663, + "num_tokens": 7039924436.0, + "step": 13771 + }, + { + "epoch": 3.724175229853975, + "grad_norm": 0.9048293828964233, + "learning_rate": 4.9036112718674565e-06, + "loss": 1.6828, + "mean_token_accuracy": 0.5845690965652466, + "num_tokens": 7040448587.0, + "step": 13772 + }, + { + "epoch": 3.724445646295295, + "grad_norm": 1.2901089191436768, + "learning_rate": 4.902451642418742e-06, + "loss": 1.9739, + "mean_token_accuracy": 0.564156174659729, + "num_tokens": 7040916490.0, + "step": 13773 + }, + { + "epoch": 3.7247160627366145, + "grad_norm": 0.9347721338272095, + "learning_rate": 4.901292200060741e-06, + "loss": 1.8221, + "mean_token_accuracy": 0.5824079513549805, + "num_tokens": 7041440646.0, + "step": 13774 + }, + { + "epoch": 3.724986479177934, + "grad_norm": 0.9598848819732666, + "learning_rate": 4.900132944829033e-06, + "loss": 1.8054, + "mean_token_accuracy": 0.5883381366729736, + "num_tokens": 7041964865.0, + "step": 13775 + }, + { + "epoch": 3.7252568956192538, + "grad_norm": 0.9723331928253174, + "learning_rate": 4.898973876759184e-06, + "loss": 1.8045, + "mean_token_accuracy": 0.5668041110038757, + "num_tokens": 7042489126.0, + "step": 13776 + }, + { + "epoch": 3.7255273120605734, + "grad_norm": 1.027707576751709, + "learning_rate": 4.897814995886756e-06, + "loss": 1.966, + "mean_token_accuracy": 0.5465936660766602, + "num_tokens": 7043013388.0, + "step": 13777 + }, + { + "epoch": 3.725797728501893, + "grad_norm": 0.908513069152832, + "learning_rate": 4.896656302247313e-06, + "loss": 1.8462, + "mean_token_accuracy": 0.5718172788619995, + "num_tokens": 7043537667.0, + "step": 13778 + }, + { + "epoch": 3.7260681449432127, + "grad_norm": 0.9580246210098267, + "learning_rate": 4.895497795876402e-06, + "loss": 1.7763, + "mean_token_accuracy": 0.5726180672645569, + "num_tokens": 7044061823.0, + "step": 13779 + }, + { + "epoch": 3.7263385613845323, + "grad_norm": 1.120772361755371, + "learning_rate": 4.894339476809569e-06, + "loss": 1.8754, + "mean_token_accuracy": 0.5722943544387817, + "num_tokens": 7044578023.0, + "step": 13780 + }, + { + "epoch": 3.726608977825852, + "grad_norm": 0.36245325207710266, + "learning_rate": 4.893181345082359e-06, + "loss": 1.1256, + "mean_token_accuracy": 0.6839898824691772, + "num_tokens": 7045102209.0, + "step": 13781 + }, + { + "epoch": 3.726879394267171, + "grad_norm": 1.0441131591796875, + "learning_rate": 4.892023400730304e-06, + "loss": 1.9044, + "mean_token_accuracy": 0.5808253288269043, + "num_tokens": 7045564700.0, + "step": 13782 + }, + { + "epoch": 3.7271498107084913, + "grad_norm": 0.9396569132804871, + "learning_rate": 4.890865643788931e-06, + "loss": 1.8361, + "mean_token_accuracy": 0.5795730352401733, + "num_tokens": 7046088949.0, + "step": 13783 + }, + { + "epoch": 3.7274202271498105, + "grad_norm": 0.9139387011528015, + "learning_rate": 4.889708074293766e-06, + "loss": 1.795, + "mean_token_accuracy": 0.5856704711914062, + "num_tokens": 7046613191.0, + "step": 13784 + }, + { + "epoch": 3.7276906435911306, + "grad_norm": 0.8291611671447754, + "learning_rate": 4.8885506922803285e-06, + "loss": 1.8329, + "mean_token_accuracy": 0.573789119720459, + "num_tokens": 7047137444.0, + "step": 13785 + }, + { + "epoch": 3.7279610600324498, + "grad_norm": 0.8250753283500671, + "learning_rate": 4.887393497784126e-06, + "loss": 1.8006, + "mean_token_accuracy": 0.5698676109313965, + "num_tokens": 7047661675.0, + "step": 13786 + }, + { + "epoch": 3.72823147647377, + "grad_norm": 0.8861955404281616, + "learning_rate": 4.886236490840669e-06, + "loss": 1.8536, + "mean_token_accuracy": 0.5856450796127319, + "num_tokens": 7048185707.0, + "step": 13787 + }, + { + "epoch": 3.728501892915089, + "grad_norm": 1.0120036602020264, + "learning_rate": 4.885079671485454e-06, + "loss": 1.9526, + "mean_token_accuracy": 0.5659805536270142, + "num_tokens": 7048709879.0, + "step": 13788 + }, + { + "epoch": 3.728772309356409, + "grad_norm": 0.8622460961341858, + "learning_rate": 4.883923039753981e-06, + "loss": 1.8443, + "mean_token_accuracy": 0.5878932476043701, + "num_tokens": 7049234159.0, + "step": 13789 + }, + { + "epoch": 3.7290427257977283, + "grad_norm": 0.9312993884086609, + "learning_rate": 4.882766595681734e-06, + "loss": 1.7405, + "mean_token_accuracy": 0.5844829082489014, + "num_tokens": 7049758334.0, + "step": 13790 + }, + { + "epoch": 3.729313142239048, + "grad_norm": 1.1265093088150024, + "learning_rate": 4.881610339304194e-06, + "loss": 2.0174, + "mean_token_accuracy": 0.5526016354560852, + "num_tokens": 7050193762.0, + "step": 13791 + }, + { + "epoch": 3.7295835586803676, + "grad_norm": 0.97264164686203, + "learning_rate": 4.8804542706568466e-06, + "loss": 1.7132, + "mean_token_accuracy": 0.6110628843307495, + "num_tokens": 7050681316.0, + "step": 13792 + }, + { + "epoch": 3.7298539751216873, + "grad_norm": 1.0031874179840088, + "learning_rate": 4.879298389775158e-06, + "loss": 1.8231, + "mean_token_accuracy": 0.5998033285140991, + "num_tokens": 7051205602.0, + "step": 13793 + }, + { + "epoch": 3.730124391563007, + "grad_norm": 0.8454312682151794, + "learning_rate": 4.878142696694591e-06, + "loss": 1.8038, + "mean_token_accuracy": 0.5736554861068726, + "num_tokens": 7051696506.0, + "step": 13794 + }, + { + "epoch": 3.7303948080043265, + "grad_norm": 0.9600716829299927, + "learning_rate": 4.876987191450612e-06, + "loss": 1.7863, + "mean_token_accuracy": 0.581914484500885, + "num_tokens": 7052220729.0, + "step": 13795 + }, + { + "epoch": 3.730665224445646, + "grad_norm": 0.9884223937988281, + "learning_rate": 4.875831874078674e-06, + "loss": 1.7735, + "mean_token_accuracy": 0.5741754174232483, + "num_tokens": 7052744939.0, + "step": 13796 + }, + { + "epoch": 3.730935640886966, + "grad_norm": 0.8843846321105957, + "learning_rate": 4.8746767446142205e-06, + "loss": 1.944, + "mean_token_accuracy": 0.5612632632255554, + "num_tokens": 7053269131.0, + "step": 13797 + }, + { + "epoch": 3.7312060573282855, + "grad_norm": 0.8651273250579834, + "learning_rate": 4.873521803092698e-06, + "loss": 1.8382, + "mean_token_accuracy": 0.567064642906189, + "num_tokens": 7053793348.0, + "step": 13798 + }, + { + "epoch": 3.731476473769605, + "grad_norm": 0.9495331048965454, + "learning_rate": 4.872367049549546e-06, + "loss": 1.7897, + "mean_token_accuracy": 0.6046413779258728, + "num_tokens": 7054317536.0, + "step": 13799 + }, + { + "epoch": 3.7317468902109248, + "grad_norm": 0.9476959109306335, + "learning_rate": 4.87121248402019e-06, + "loss": 1.7249, + "mean_token_accuracy": 0.590660810470581, + "num_tokens": 7054808643.0, + "step": 13800 + }, + { + "epoch": 3.7320173066522444, + "grad_norm": 0.4628053307533264, + "learning_rate": 4.870058106540062e-06, + "loss": 0.9671, + "mean_token_accuracy": 0.7345471978187561, + "num_tokens": 7055182042.0, + "step": 13801 + }, + { + "epoch": 3.732287723093564, + "grad_norm": 0.9798858761787415, + "learning_rate": 4.868903917144578e-06, + "loss": 1.7457, + "mean_token_accuracy": 0.602832555770874, + "num_tokens": 7055658562.0, + "step": 13802 + }, + { + "epoch": 3.7325581395348837, + "grad_norm": 1.0045993328094482, + "learning_rate": 4.86774991586915e-06, + "loss": 1.8236, + "mean_token_accuracy": 0.5875763893127441, + "num_tokens": 7056182779.0, + "step": 13803 + }, + { + "epoch": 3.7328285559762033, + "grad_norm": 0.9353926777839661, + "learning_rate": 4.86659610274919e-06, + "loss": 1.8058, + "mean_token_accuracy": 0.5844299793243408, + "num_tokens": 7056689380.0, + "step": 13804 + }, + { + "epoch": 3.733098972417523, + "grad_norm": 0.7840754985809326, + "learning_rate": 4.865442477820099e-06, + "loss": 1.9153, + "mean_token_accuracy": 0.5589599609375, + "num_tokens": 7057213657.0, + "step": 13805 + }, + { + "epoch": 3.7333693888588426, + "grad_norm": 0.9269254803657532, + "learning_rate": 4.864289041117271e-06, + "loss": 1.7424, + "mean_token_accuracy": 0.6191166043281555, + "num_tokens": 7057737906.0, + "step": 13806 + }, + { + "epoch": 3.7336398053001623, + "grad_norm": 0.924794614315033, + "learning_rate": 4.863135792676103e-06, + "loss": 1.8708, + "mean_token_accuracy": 0.581108570098877, + "num_tokens": 7058262077.0, + "step": 13807 + }, + { + "epoch": 3.733910221741482, + "grad_norm": 1.1727843284606934, + "learning_rate": 4.861982732531974e-06, + "loss": 1.77, + "mean_token_accuracy": 0.5793393850326538, + "num_tokens": 7058786109.0, + "step": 13808 + }, + { + "epoch": 3.7341806381828015, + "grad_norm": 0.9442147612571716, + "learning_rate": 4.860829860720264e-06, + "loss": 1.9074, + "mean_token_accuracy": 0.5619298219680786, + "num_tokens": 7059310376.0, + "step": 13809 + }, + { + "epoch": 3.734451054624121, + "grad_norm": 0.8289070725440979, + "learning_rate": 4.85967717727635e-06, + "loss": 1.9378, + "mean_token_accuracy": 0.5525306463241577, + "num_tokens": 7059834565.0, + "step": 13810 + }, + { + "epoch": 3.734721471065441, + "grad_norm": 0.7798708081245422, + "learning_rate": 4.858524682235596e-06, + "loss": 1.754, + "mean_token_accuracy": 0.5612471103668213, + "num_tokens": 7060358839.0, + "step": 13811 + }, + { + "epoch": 3.7349918875067605, + "grad_norm": 0.9568336009979248, + "learning_rate": 4.857372375633368e-06, + "loss": 1.8768, + "mean_token_accuracy": 0.5604462623596191, + "num_tokens": 7060883010.0, + "step": 13812 + }, + { + "epoch": 3.73526230394808, + "grad_norm": 1.0438729524612427, + "learning_rate": 4.856220257505017e-06, + "loss": 1.8568, + "mean_token_accuracy": 0.578913152217865, + "num_tokens": 7061364592.0, + "step": 13813 + }, + { + "epoch": 3.7355327203893998, + "grad_norm": 0.918742835521698, + "learning_rate": 4.855068327885896e-06, + "loss": 1.9282, + "mean_token_accuracy": 0.5699552297592163, + "num_tokens": 7061888858.0, + "step": 13814 + }, + { + "epoch": 3.7358031368307194, + "grad_norm": 0.880983293056488, + "learning_rate": 4.853916586811353e-06, + "loss": 1.897, + "mean_token_accuracy": 0.5617570877075195, + "num_tokens": 7062413117.0, + "step": 13815 + }, + { + "epoch": 3.736073553272039, + "grad_norm": 0.7821206450462341, + "learning_rate": 4.852765034316722e-06, + "loss": 1.664, + "mean_token_accuracy": 0.6124030351638794, + "num_tokens": 7062937351.0, + "step": 13816 + }, + { + "epoch": 3.7363439697133587, + "grad_norm": 1.0183744430541992, + "learning_rate": 4.851613670437336e-06, + "loss": 1.8592, + "mean_token_accuracy": 0.5819422006607056, + "num_tokens": 7063461626.0, + "step": 13817 + }, + { + "epoch": 3.7366143861546783, + "grad_norm": 1.0008972883224487, + "learning_rate": 4.850462495208527e-06, + "loss": 1.8562, + "mean_token_accuracy": 0.5758832693099976, + "num_tokens": 7063949205.0, + "step": 13818 + }, + { + "epoch": 3.736884802595998, + "grad_norm": 0.8796659708023071, + "learning_rate": 4.849311508665612e-06, + "loss": 1.7566, + "mean_token_accuracy": 0.5881403684616089, + "num_tokens": 7064473372.0, + "step": 13819 + }, + { + "epoch": 3.7371552190373176, + "grad_norm": 0.902911365032196, + "learning_rate": 4.848160710843907e-06, + "loss": 1.8316, + "mean_token_accuracy": 0.5751552581787109, + "num_tokens": 7064997571.0, + "step": 13820 + }, + { + "epoch": 3.7374256354786373, + "grad_norm": 0.41044819355010986, + "learning_rate": 4.8470101017787244e-06, + "loss": 1.1062, + "mean_token_accuracy": 0.6966570615768433, + "num_tokens": 7065521829.0, + "step": 13821 + }, + { + "epoch": 3.737696051919957, + "grad_norm": 1.2313261032104492, + "learning_rate": 4.845859681505366e-06, + "loss": 1.862, + "mean_token_accuracy": 0.5711150169372559, + "num_tokens": 7066045980.0, + "step": 13822 + }, + { + "epoch": 3.737966468361276, + "grad_norm": 1.0311847925186157, + "learning_rate": 4.844709450059129e-06, + "loss": 1.6942, + "mean_token_accuracy": 0.6100770831108093, + "num_tokens": 7066511156.0, + "step": 13823 + }, + { + "epoch": 3.738236884802596, + "grad_norm": 1.033471941947937, + "learning_rate": 4.843559407475311e-06, + "loss": 1.8769, + "mean_token_accuracy": 0.5683344602584839, + "num_tokens": 7067035321.0, + "step": 13824 + }, + { + "epoch": 3.7385073012439154, + "grad_norm": 0.8152682185173035, + "learning_rate": 4.842409553789195e-06, + "loss": 1.7798, + "mean_token_accuracy": 0.5798007845878601, + "num_tokens": 7067559467.0, + "step": 13825 + }, + { + "epoch": 3.7387777176852355, + "grad_norm": 1.0599613189697266, + "learning_rate": 4.8412598890360576e-06, + "loss": 1.8063, + "mean_token_accuracy": 0.5894793272018433, + "num_tokens": 7068083666.0, + "step": 13826 + }, + { + "epoch": 3.7390481341265547, + "grad_norm": 1.262458324432373, + "learning_rate": 4.84011041325118e-06, + "loss": 1.8179, + "mean_token_accuracy": 0.5976039171218872, + "num_tokens": 7068545660.0, + "step": 13827 + }, + { + "epoch": 3.7393185505678748, + "grad_norm": 1.068101167678833, + "learning_rate": 4.838961126469833e-06, + "loss": 1.8375, + "mean_token_accuracy": 0.585559606552124, + "num_tokens": 7069061988.0, + "step": 13828 + }, + { + "epoch": 3.739588967009194, + "grad_norm": 0.8892855644226074, + "learning_rate": 4.837812028727273e-06, + "loss": 1.7996, + "mean_token_accuracy": 0.579815685749054, + "num_tokens": 7069586268.0, + "step": 13829 + }, + { + "epoch": 3.739859383450514, + "grad_norm": 1.0448683500289917, + "learning_rate": 4.836663120058765e-06, + "loss": 1.9203, + "mean_token_accuracy": 0.5691378116607666, + "num_tokens": 7070089550.0, + "step": 13830 + }, + { + "epoch": 3.7401297998918333, + "grad_norm": 0.9676066040992737, + "learning_rate": 4.835514400499558e-06, + "loss": 1.7482, + "mean_token_accuracy": 0.598684549331665, + "num_tokens": 7070613728.0, + "step": 13831 + }, + { + "epoch": 3.740400216333153, + "grad_norm": 0.8420841097831726, + "learning_rate": 4.834365870084895e-06, + "loss": 1.8332, + "mean_token_accuracy": 0.5689607858657837, + "num_tokens": 7071072759.0, + "step": 13832 + }, + { + "epoch": 3.7406706327744725, + "grad_norm": 0.8858576416969299, + "learning_rate": 4.833217528850022e-06, + "loss": 1.9311, + "mean_token_accuracy": 0.545315146446228, + "num_tokens": 7071596956.0, + "step": 13833 + }, + { + "epoch": 3.740941049215792, + "grad_norm": 1.1040685176849365, + "learning_rate": 4.832069376830167e-06, + "loss": 1.8847, + "mean_token_accuracy": 0.5707091093063354, + "num_tokens": 7072065527.0, + "step": 13834 + }, + { + "epoch": 3.741211465657112, + "grad_norm": 0.8774160742759705, + "learning_rate": 4.830921414060566e-06, + "loss": 1.6792, + "mean_token_accuracy": 0.6232014894485474, + "num_tokens": 7072589628.0, + "step": 13835 + }, + { + "epoch": 3.7414818820984315, + "grad_norm": 0.9690852165222168, + "learning_rate": 4.829773640576437e-06, + "loss": 1.8536, + "mean_token_accuracy": 0.5745164155960083, + "num_tokens": 7073113820.0, + "step": 13836 + }, + { + "epoch": 3.741752298539751, + "grad_norm": 3.6328999996185303, + "learning_rate": 4.828626056412997e-06, + "loss": 1.6989, + "mean_token_accuracy": 0.6040513515472412, + "num_tokens": 7073638030.0, + "step": 13837 + }, + { + "epoch": 3.7420227149810708, + "grad_norm": 1.0173957347869873, + "learning_rate": 4.8274786616054595e-06, + "loss": 1.7967, + "mean_token_accuracy": 0.579587459564209, + "num_tokens": 7074114144.0, + "step": 13838 + }, + { + "epoch": 3.7422931314223904, + "grad_norm": 1.236854910850525, + "learning_rate": 4.826331456189029e-06, + "loss": 1.7661, + "mean_token_accuracy": 0.5921294689178467, + "num_tokens": 7074638317.0, + "step": 13839 + }, + { + "epoch": 3.74256354786371, + "grad_norm": 1.0328028202056885, + "learning_rate": 4.825184440198903e-06, + "loss": 1.8485, + "mean_token_accuracy": 0.5673177242279053, + "num_tokens": 7075162569.0, + "step": 13840 + }, + { + "epoch": 3.7428339643050297, + "grad_norm": 0.39484190940856934, + "learning_rate": 4.824037613670277e-06, + "loss": 1.0897, + "mean_token_accuracy": 0.7132102251052856, + "num_tokens": 7075642927.0, + "step": 13841 + }, + { + "epoch": 3.7431043807463493, + "grad_norm": 1.0001850128173828, + "learning_rate": 4.822890976638343e-06, + "loss": 1.9042, + "mean_token_accuracy": 0.566430926322937, + "num_tokens": 7076160668.0, + "step": 13842 + }, + { + "epoch": 3.743374797187669, + "grad_norm": 1.0247585773468018, + "learning_rate": 4.821744529138276e-06, + "loss": 1.9057, + "mean_token_accuracy": 0.5757445693016052, + "num_tokens": 7076658477.0, + "step": 13843 + }, + { + "epoch": 3.7436452136289886, + "grad_norm": 0.9521510601043701, + "learning_rate": 4.820598271205258e-06, + "loss": 1.755, + "mean_token_accuracy": 0.601361870765686, + "num_tokens": 7077117848.0, + "step": 13844 + }, + { + "epoch": 3.7439156300703083, + "grad_norm": 1.1978352069854736, + "learning_rate": 4.819452202874459e-06, + "loss": 1.8681, + "mean_token_accuracy": 0.5565047264099121, + "num_tokens": 7077622189.0, + "step": 13845 + }, + { + "epoch": 3.744186046511628, + "grad_norm": 1.1223511695861816, + "learning_rate": 4.818306324181039e-06, + "loss": 1.9103, + "mean_token_accuracy": 0.5816965699195862, + "num_tokens": 7078045549.0, + "step": 13846 + }, + { + "epoch": 3.7444564629529475, + "grad_norm": 0.9991121292114258, + "learning_rate": 4.817160635160162e-06, + "loss": 1.8117, + "mean_token_accuracy": 0.5834674835205078, + "num_tokens": 7078524353.0, + "step": 13847 + }, + { + "epoch": 3.744726879394267, + "grad_norm": 1.067034125328064, + "learning_rate": 4.816015135846979e-06, + "loss": 1.856, + "mean_token_accuracy": 0.5852349400520325, + "num_tokens": 7079048580.0, + "step": 13848 + }, + { + "epoch": 3.744997295835587, + "grad_norm": 1.0123449563980103, + "learning_rate": 4.814869826276637e-06, + "loss": 1.6987, + "mean_token_accuracy": 0.60310959815979, + "num_tokens": 7079547419.0, + "step": 13849 + }, + { + "epoch": 3.7452677122769065, + "grad_norm": 0.8887708783149719, + "learning_rate": 4.813724706484279e-06, + "loss": 1.8981, + "mean_token_accuracy": 0.5718458890914917, + "num_tokens": 7080071595.0, + "step": 13850 + }, + { + "epoch": 3.745538128718226, + "grad_norm": 0.9200968742370605, + "learning_rate": 4.81257977650504e-06, + "loss": 1.8301, + "mean_token_accuracy": 0.5830953121185303, + "num_tokens": 7080536622.0, + "step": 13851 + }, + { + "epoch": 3.7458085451595458, + "grad_norm": 0.9050000309944153, + "learning_rate": 4.811435036374046e-06, + "loss": 1.8595, + "mean_token_accuracy": 0.5717387795448303, + "num_tokens": 7081060724.0, + "step": 13852 + }, + { + "epoch": 3.7460789616008654, + "grad_norm": 1.0006412267684937, + "learning_rate": 4.810290486126428e-06, + "loss": 1.9868, + "mean_token_accuracy": 0.5411096811294556, + "num_tokens": 7081584872.0, + "step": 13853 + }, + { + "epoch": 3.746349378042185, + "grad_norm": 0.903861403465271, + "learning_rate": 4.809146125797298e-06, + "loss": 1.7548, + "mean_token_accuracy": 0.5896320939064026, + "num_tokens": 7082109139.0, + "step": 13854 + }, + { + "epoch": 3.7466197944835047, + "grad_norm": 1.1326959133148193, + "learning_rate": 4.808001955421774e-06, + "loss": 1.8115, + "mean_token_accuracy": 0.6036312580108643, + "num_tokens": 7082593280.0, + "step": 13855 + }, + { + "epoch": 3.7468902109248243, + "grad_norm": 1.0104472637176514, + "learning_rate": 4.8068579750349556e-06, + "loss": 1.7484, + "mean_token_accuracy": 0.5964901447296143, + "num_tokens": 7083117550.0, + "step": 13856 + }, + { + "epoch": 3.747160627366144, + "grad_norm": 0.9342523217201233, + "learning_rate": 4.8057141846719514e-06, + "loss": 1.8605, + "mean_token_accuracy": 0.5639299154281616, + "num_tokens": 7083601438.0, + "step": 13857 + }, + { + "epoch": 3.7474310438074636, + "grad_norm": 0.8744915127754211, + "learning_rate": 4.804570584367848e-06, + "loss": 1.908, + "mean_token_accuracy": 0.557593822479248, + "num_tokens": 7084125549.0, + "step": 13858 + }, + { + "epoch": 3.7477014602487833, + "grad_norm": 0.9662179350852966, + "learning_rate": 4.803427174157743e-06, + "loss": 1.559, + "mean_token_accuracy": 0.64430832862854, + "num_tokens": 7084649832.0, + "step": 13859 + }, + { + "epoch": 3.747971876690103, + "grad_norm": 0.86613929271698, + "learning_rate": 4.802283954076712e-06, + "loss": 1.8052, + "mean_token_accuracy": 0.6009345054626465, + "num_tokens": 7085117930.0, + "step": 13860 + }, + { + "epoch": 3.7482422931314225, + "grad_norm": 0.3847516179084778, + "learning_rate": 4.801140924159838e-06, + "loss": 1.0888, + "mean_token_accuracy": 0.6907578706741333, + "num_tokens": 7085642035.0, + "step": 13861 + }, + { + "epoch": 3.748512709572742, + "grad_norm": 1.085314154624939, + "learning_rate": 4.799998084442189e-06, + "loss": 1.8335, + "mean_token_accuracy": 0.5784586668014526, + "num_tokens": 7086142681.0, + "step": 13862 + }, + { + "epoch": 3.748783126014062, + "grad_norm": 1.042245864868164, + "learning_rate": 4.798855434958831e-06, + "loss": 1.7871, + "mean_token_accuracy": 0.5880953669548035, + "num_tokens": 7086646916.0, + "step": 13863 + }, + { + "epoch": 3.749053542455381, + "grad_norm": 1.0769890546798706, + "learning_rate": 4.797712975744826e-06, + "loss": 1.8448, + "mean_token_accuracy": 0.5731620788574219, + "num_tokens": 7087171147.0, + "step": 13864 + }, + { + "epoch": 3.749323958896701, + "grad_norm": 0.8290109634399414, + "learning_rate": 4.796570706835227e-06, + "loss": 1.8223, + "mean_token_accuracy": 0.5630619525909424, + "num_tokens": 7087695333.0, + "step": 13865 + }, + { + "epoch": 3.7495943753380203, + "grad_norm": 1.0193639993667603, + "learning_rate": 4.795428628265081e-06, + "loss": 1.8769, + "mean_token_accuracy": 0.5695331692695618, + "num_tokens": 7088180473.0, + "step": 13866 + }, + { + "epoch": 3.7498647917793404, + "grad_norm": 1.0197231769561768, + "learning_rate": 4.794286740069432e-06, + "loss": 1.8367, + "mean_token_accuracy": 0.5802243947982788, + "num_tokens": 7088704678.0, + "step": 13867 + }, + { + "epoch": 3.7501352082206596, + "grad_norm": 1.1328847408294678, + "learning_rate": 4.793145042283313e-06, + "loss": 1.7751, + "mean_token_accuracy": 0.58632493019104, + "num_tokens": 7089182224.0, + "step": 13868 + }, + { + "epoch": 3.7504056246619797, + "grad_norm": 0.8122513890266418, + "learning_rate": 4.7920035349417595e-06, + "loss": 1.7595, + "mean_token_accuracy": 0.5748342871665955, + "num_tokens": 7089706407.0, + "step": 13869 + }, + { + "epoch": 3.750676041103299, + "grad_norm": 0.8463119268417358, + "learning_rate": 4.7908622180797915e-06, + "loss": 1.7749, + "mean_token_accuracy": 0.5847408771514893, + "num_tokens": 7090230521.0, + "step": 13870 + }, + { + "epoch": 3.750946457544619, + "grad_norm": 1.1089097261428833, + "learning_rate": 4.789721091732434e-06, + "loss": 1.9208, + "mean_token_accuracy": 0.5197499990463257, + "num_tokens": 7090754747.0, + "step": 13871 + }, + { + "epoch": 3.751216873985938, + "grad_norm": 1.0483548641204834, + "learning_rate": 4.7885801559346935e-06, + "loss": 1.845, + "mean_token_accuracy": 0.5719553232192993, + "num_tokens": 7091278837.0, + "step": 13872 + }, + { + "epoch": 3.751487290427258, + "grad_norm": 0.8757784962654114, + "learning_rate": 4.787439410721583e-06, + "loss": 1.7365, + "mean_token_accuracy": 0.5943965911865234, + "num_tokens": 7091779249.0, + "step": 13873 + }, + { + "epoch": 3.7517577068685775, + "grad_norm": 0.8380997180938721, + "learning_rate": 4.786298856128101e-06, + "loss": 1.9373, + "mean_token_accuracy": 0.5467151403427124, + "num_tokens": 7092303518.0, + "step": 13874 + }, + { + "epoch": 3.752028123309897, + "grad_norm": 1.0106936693191528, + "learning_rate": 4.785158492189242e-06, + "loss": 1.9406, + "mean_token_accuracy": 0.5532708168029785, + "num_tokens": 7092794038.0, + "step": 13875 + }, + { + "epoch": 3.7522985397512167, + "grad_norm": 1.0804648399353027, + "learning_rate": 4.78401831894e-06, + "loss": 1.8849, + "mean_token_accuracy": 0.5877000093460083, + "num_tokens": 7093305376.0, + "step": 13876 + }, + { + "epoch": 3.7525689561925364, + "grad_norm": 1.172896385192871, + "learning_rate": 4.782878336415354e-06, + "loss": 1.6799, + "mean_token_accuracy": 0.6010823249816895, + "num_tokens": 7093829512.0, + "step": 13877 + }, + { + "epoch": 3.752839372633856, + "grad_norm": 0.9762663245201111, + "learning_rate": 4.781738544650284e-06, + "loss": 1.9646, + "mean_token_accuracy": 0.5484116077423096, + "num_tokens": 7094353784.0, + "step": 13878 + }, + { + "epoch": 3.7531097890751757, + "grad_norm": 0.8840844035148621, + "learning_rate": 4.780598943679762e-06, + "loss": 1.9543, + "mean_token_accuracy": 0.5540139079093933, + "num_tokens": 7094846033.0, + "step": 13879 + }, + { + "epoch": 3.7533802055164953, + "grad_norm": 0.9446637630462646, + "learning_rate": 4.779459533538756e-06, + "loss": 1.8547, + "mean_token_accuracy": 0.5818220376968384, + "num_tokens": 7095370173.0, + "step": 13880 + }, + { + "epoch": 3.753650621957815, + "grad_norm": 0.3650868237018585, + "learning_rate": 4.778320314262222e-06, + "loss": 1.0922, + "mean_token_accuracy": 0.7097715139389038, + "num_tokens": 7095894363.0, + "step": 13881 + }, + { + "epoch": 3.7539210383991346, + "grad_norm": 0.9411190152168274, + "learning_rate": 4.77718128588512e-06, + "loss": 1.7335, + "mean_token_accuracy": 0.5728683471679688, + "num_tokens": 7096411382.0, + "step": 13882 + }, + { + "epoch": 3.7541914548404542, + "grad_norm": 0.9643193483352661, + "learning_rate": 4.776042448442394e-06, + "loss": 1.7887, + "mean_token_accuracy": 0.5839323997497559, + "num_tokens": 7096935566.0, + "step": 13883 + }, + { + "epoch": 3.754461871281774, + "grad_norm": 0.9420430660247803, + "learning_rate": 4.774903801968988e-06, + "loss": 1.9036, + "mean_token_accuracy": 0.5789584517478943, + "num_tokens": 7097459819.0, + "step": 13884 + }, + { + "epoch": 3.7547322877230935, + "grad_norm": 0.9655845761299133, + "learning_rate": 4.7737653464998445e-06, + "loss": 1.9219, + "mean_token_accuracy": 0.5537382364273071, + "num_tokens": 7097947281.0, + "step": 13885 + }, + { + "epoch": 3.755002704164413, + "grad_norm": 0.9375752210617065, + "learning_rate": 4.772627082069887e-06, + "loss": 1.8576, + "mean_token_accuracy": 0.5679800510406494, + "num_tokens": 7098459661.0, + "step": 13886 + }, + { + "epoch": 3.755273120605733, + "grad_norm": 1.032711148262024, + "learning_rate": 4.771489008714046e-06, + "loss": 1.8502, + "mean_token_accuracy": 0.5655987858772278, + "num_tokens": 7098983917.0, + "step": 13887 + }, + { + "epoch": 3.7555435370470525, + "grad_norm": 0.9678378105163574, + "learning_rate": 4.7703511264672395e-06, + "loss": 1.9482, + "mean_token_accuracy": 0.5441673994064331, + "num_tokens": 7099508055.0, + "step": 13888 + }, + { + "epoch": 3.755813953488372, + "grad_norm": 1.1891403198242188, + "learning_rate": 4.769213435364379e-06, + "loss": 1.8547, + "mean_token_accuracy": 0.5767937898635864, + "num_tokens": 7100032254.0, + "step": 13889 + }, + { + "epoch": 3.7560843699296917, + "grad_norm": 1.0957846641540527, + "learning_rate": 4.768075935440376e-06, + "loss": 1.9186, + "mean_token_accuracy": 0.5624643564224243, + "num_tokens": 7100527797.0, + "step": 13890 + }, + { + "epoch": 3.7563547863710114, + "grad_norm": 2.7615160942077637, + "learning_rate": 4.76693862673013e-06, + "loss": 1.657, + "mean_token_accuracy": 0.6171057820320129, + "num_tokens": 7101022476.0, + "step": 13891 + }, + { + "epoch": 3.756625202812331, + "grad_norm": 0.9532301425933838, + "learning_rate": 4.765801509268534e-06, + "loss": 1.6447, + "mean_token_accuracy": 0.6029740571975708, + "num_tokens": 7101546520.0, + "step": 13892 + }, + { + "epoch": 3.7568956192536507, + "grad_norm": 0.8682999014854431, + "learning_rate": 4.7646645830904846e-06, + "loss": 1.7583, + "mean_token_accuracy": 0.5796657800674438, + "num_tokens": 7102070799.0, + "step": 13893 + }, + { + "epoch": 3.7571660356949703, + "grad_norm": 0.9888520240783691, + "learning_rate": 4.763527848230863e-06, + "loss": 1.8416, + "mean_token_accuracy": 0.5792833566665649, + "num_tokens": 7102595077.0, + "step": 13894 + }, + { + "epoch": 3.75743645213629, + "grad_norm": 0.978577196598053, + "learning_rate": 4.762391304724544e-06, + "loss": 1.8512, + "mean_token_accuracy": 0.5798940658569336, + "num_tokens": 7103119200.0, + "step": 13895 + }, + { + "epoch": 3.7577068685776096, + "grad_norm": 1.0132406949996948, + "learning_rate": 4.761254952606406e-06, + "loss": 1.8251, + "mean_token_accuracy": 0.5778073668479919, + "num_tokens": 7103643413.0, + "step": 13896 + }, + { + "epoch": 3.7579772850189292, + "grad_norm": 0.9753726720809937, + "learning_rate": 4.7601187919113105e-06, + "loss": 1.8597, + "mean_token_accuracy": 0.5770342350006104, + "num_tokens": 7104108250.0, + "step": 13897 + }, + { + "epoch": 3.758247701460249, + "grad_norm": 0.9547118544578552, + "learning_rate": 4.75898282267412e-06, + "loss": 1.9301, + "mean_token_accuracy": 0.5465519428253174, + "num_tokens": 7104632469.0, + "step": 13898 + }, + { + "epoch": 3.7585181179015685, + "grad_norm": 1.0009701251983643, + "learning_rate": 4.757847044929694e-06, + "loss": 1.869, + "mean_token_accuracy": 0.5739306211471558, + "num_tokens": 7105156728.0, + "step": 13899 + }, + { + "epoch": 3.758788534342888, + "grad_norm": 0.9625346660614014, + "learning_rate": 4.756711458712877e-06, + "loss": 1.8649, + "mean_token_accuracy": 0.5655732154846191, + "num_tokens": 7105680908.0, + "step": 13900 + }, + { + "epoch": 3.759058950784208, + "grad_norm": 0.3270147442817688, + "learning_rate": 4.755576064058511e-06, + "loss": 1.1061, + "mean_token_accuracy": 0.7061275243759155, + "num_tokens": 7106205115.0, + "step": 13901 + }, + { + "epoch": 3.7593293672255275, + "grad_norm": 0.8725432753562927, + "learning_rate": 4.754440861001437e-06, + "loss": 1.7507, + "mean_token_accuracy": 0.5798065066337585, + "num_tokens": 7106729239.0, + "step": 13902 + }, + { + "epoch": 3.759599783666847, + "grad_norm": 0.969020664691925, + "learning_rate": 4.753305849576484e-06, + "loss": 1.558, + "mean_token_accuracy": 0.6469732522964478, + "num_tokens": 7107253441.0, + "step": 13903 + }, + { + "epoch": 3.7598702001081667, + "grad_norm": 0.9950698614120483, + "learning_rate": 4.752171029818475e-06, + "loss": 1.7382, + "mean_token_accuracy": 0.5702519416809082, + "num_tokens": 7107777686.0, + "step": 13904 + }, + { + "epoch": 3.760140616549486, + "grad_norm": 0.967491626739502, + "learning_rate": 4.751036401762237e-06, + "loss": 1.7863, + "mean_token_accuracy": 0.5887682437896729, + "num_tokens": 7108301962.0, + "step": 13905 + }, + { + "epoch": 3.760411032990806, + "grad_norm": 0.8941317200660706, + "learning_rate": 4.749901965442575e-06, + "loss": 1.7943, + "mean_token_accuracy": 0.5965045094490051, + "num_tokens": 7108765026.0, + "step": 13906 + }, + { + "epoch": 3.7606814494321252, + "grad_norm": 0.8531571626663208, + "learning_rate": 4.748767720894303e-06, + "loss": 1.7894, + "mean_token_accuracy": 0.5845934748649597, + "num_tokens": 7109289225.0, + "step": 13907 + }, + { + "epoch": 3.7609518658734453, + "grad_norm": 0.8332147002220154, + "learning_rate": 4.7476336681522235e-06, + "loss": 1.8495, + "mean_token_accuracy": 0.5704419612884521, + "num_tokens": 7109813482.0, + "step": 13908 + }, + { + "epoch": 3.7612222823147645, + "grad_norm": 0.8012495040893555, + "learning_rate": 4.746499807251125e-06, + "loss": 1.9441, + "mean_token_accuracy": 0.5553774237632751, + "num_tokens": 7110337759.0, + "step": 13909 + }, + { + "epoch": 3.7614926987560846, + "grad_norm": 0.8183944225311279, + "learning_rate": 4.7453661382258055e-06, + "loss": 1.7693, + "mean_token_accuracy": 0.5871409177780151, + "num_tokens": 7110862004.0, + "step": 13910 + }, + { + "epoch": 3.761763115197404, + "grad_norm": 0.8754880428314209, + "learning_rate": 4.744232661111046e-06, + "loss": 1.8378, + "mean_token_accuracy": 0.5835953950881958, + "num_tokens": 7111374482.0, + "step": 13911 + }, + { + "epoch": 3.762033531638724, + "grad_norm": 1.0957165956497192, + "learning_rate": 4.7430993759416265e-06, + "loss": 1.891, + "mean_token_accuracy": 0.5351934432983398, + "num_tokens": 7111896516.0, + "step": 13912 + }, + { + "epoch": 3.762303948080043, + "grad_norm": 0.884289562702179, + "learning_rate": 4.741966282752317e-06, + "loss": 1.947, + "mean_token_accuracy": 0.5705875158309937, + "num_tokens": 7112324661.0, + "step": 13913 + }, + { + "epoch": 3.7625743645213627, + "grad_norm": 0.8632816076278687, + "learning_rate": 4.740833381577886e-06, + "loss": 1.7494, + "mean_token_accuracy": 0.5875523090362549, + "num_tokens": 7112848876.0, + "step": 13914 + }, + { + "epoch": 3.7628447809626824, + "grad_norm": 1.0985301733016968, + "learning_rate": 4.739700672453094e-06, + "loss": 1.7856, + "mean_token_accuracy": 0.5823203921318054, + "num_tokens": 7113373043.0, + "step": 13915 + }, + { + "epoch": 3.763115197404002, + "grad_norm": 0.7974504232406616, + "learning_rate": 4.738568155412696e-06, + "loss": 1.8394, + "mean_token_accuracy": 0.5806972980499268, + "num_tokens": 7113897312.0, + "step": 13916 + }, + { + "epoch": 3.7633856138453217, + "grad_norm": 0.9490395188331604, + "learning_rate": 4.737435830491442e-06, + "loss": 1.9141, + "mean_token_accuracy": 0.5563555955886841, + "num_tokens": 7114421484.0, + "step": 13917 + }, + { + "epoch": 3.7636560302866413, + "grad_norm": 0.9889411926269531, + "learning_rate": 4.736303697724071e-06, + "loss": 1.8681, + "mean_token_accuracy": 0.5829753875732422, + "num_tokens": 7114945668.0, + "step": 13918 + }, + { + "epoch": 3.763926446727961, + "grad_norm": 0.8925540447235107, + "learning_rate": 4.7351717571453245e-06, + "loss": 1.9296, + "mean_token_accuracy": 0.5392611026763916, + "num_tokens": 7115469866.0, + "step": 13919 + }, + { + "epoch": 3.7641968631692806, + "grad_norm": 0.9846659302711487, + "learning_rate": 4.734040008789934e-06, + "loss": 1.8633, + "mean_token_accuracy": 0.574409544467926, + "num_tokens": 7115994039.0, + "step": 13920 + }, + { + "epoch": 3.7644672796106002, + "grad_norm": 0.3410714268684387, + "learning_rate": 4.732908452692619e-06, + "loss": 1.0382, + "mean_token_accuracy": 0.718427836894989, + "num_tokens": 7116518188.0, + "step": 13921 + }, + { + "epoch": 3.76473769605192, + "grad_norm": 0.8982222676277161, + "learning_rate": 4.731777088888106e-06, + "loss": 1.6751, + "mean_token_accuracy": 0.59889817237854, + "num_tokens": 7117042431.0, + "step": 13922 + }, + { + "epoch": 3.7650081124932395, + "grad_norm": 0.9283817410469055, + "learning_rate": 4.730645917411105e-06, + "loss": 1.8578, + "mean_token_accuracy": 0.573907196521759, + "num_tokens": 7117566541.0, + "step": 13923 + }, + { + "epoch": 3.765278528934559, + "grad_norm": 0.8440595865249634, + "learning_rate": 4.729514938296323e-06, + "loss": 1.7457, + "mean_token_accuracy": 0.5876030325889587, + "num_tokens": 7118090619.0, + "step": 13924 + }, + { + "epoch": 3.765548945375879, + "grad_norm": 0.8354089260101318, + "learning_rate": 4.728384151578464e-06, + "loss": 1.6165, + "mean_token_accuracy": 0.6163884401321411, + "num_tokens": 7118556697.0, + "step": 13925 + }, + { + "epoch": 3.7658193618171985, + "grad_norm": 0.9724668264389038, + "learning_rate": 4.727253557292221e-06, + "loss": 1.7919, + "mean_token_accuracy": 0.5877420902252197, + "num_tokens": 7119080924.0, + "step": 13926 + }, + { + "epoch": 3.766089778258518, + "grad_norm": 0.8628682494163513, + "learning_rate": 4.726123155472286e-06, + "loss": 1.8129, + "mean_token_accuracy": 0.582215428352356, + "num_tokens": 7119605205.0, + "step": 13927 + }, + { + "epoch": 3.7663601946998377, + "grad_norm": 0.812743067741394, + "learning_rate": 4.724992946153345e-06, + "loss": 1.9299, + "mean_token_accuracy": 0.5634501576423645, + "num_tokens": 7120129406.0, + "step": 13928 + }, + { + "epoch": 3.7666306111411574, + "grad_norm": 0.7911322712898254, + "learning_rate": 4.723862929370071e-06, + "loss": 1.8644, + "mean_token_accuracy": 0.5787380933761597, + "num_tokens": 7120649364.0, + "step": 13929 + }, + { + "epoch": 3.766901027582477, + "grad_norm": 0.9236786365509033, + "learning_rate": 4.722733105157142e-06, + "loss": 1.7875, + "mean_token_accuracy": 0.5779628157615662, + "num_tokens": 7121122896.0, + "step": 13930 + }, + { + "epoch": 3.7671714440237967, + "grad_norm": 0.8618905544281006, + "learning_rate": 4.721603473549223e-06, + "loss": 1.7359, + "mean_token_accuracy": 0.5979932546615601, + "num_tokens": 7121647175.0, + "step": 13931 + }, + { + "epoch": 3.7674418604651163, + "grad_norm": 0.9457484483718872, + "learning_rate": 4.720474034580969e-06, + "loss": 1.7539, + "mean_token_accuracy": 0.6060135960578918, + "num_tokens": 7122171316.0, + "step": 13932 + }, + { + "epoch": 3.767712276906436, + "grad_norm": 0.9182832837104797, + "learning_rate": 4.719344788287042e-06, + "loss": 1.9183, + "mean_token_accuracy": 0.5694165229797363, + "num_tokens": 7122695563.0, + "step": 13933 + }, + { + "epoch": 3.7679826933477556, + "grad_norm": 0.8297290205955505, + "learning_rate": 4.718215734702087e-06, + "loss": 1.7836, + "mean_token_accuracy": 0.5777795910835266, + "num_tokens": 7123219677.0, + "step": 13934 + }, + { + "epoch": 3.7682531097890752, + "grad_norm": 0.9256823062896729, + "learning_rate": 4.717086873860743e-06, + "loss": 1.6403, + "mean_token_accuracy": 0.5852885246276855, + "num_tokens": 7123743892.0, + "step": 13935 + }, + { + "epoch": 3.768523526230395, + "grad_norm": 1.0848335027694702, + "learning_rate": 4.715958205797654e-06, + "loss": 1.8294, + "mean_token_accuracy": 0.5821255445480347, + "num_tokens": 7124268152.0, + "step": 13936 + }, + { + "epoch": 3.7687939426717145, + "grad_norm": 1.115024447441101, + "learning_rate": 4.714829730547448e-06, + "loss": 1.7007, + "mean_token_accuracy": 0.6011166572570801, + "num_tokens": 7124774482.0, + "step": 13937 + }, + { + "epoch": 3.769064359113034, + "grad_norm": 0.8889979124069214, + "learning_rate": 4.713701448144746e-06, + "loss": 1.7075, + "mean_token_accuracy": 0.6044781804084778, + "num_tokens": 7125298704.0, + "step": 13938 + }, + { + "epoch": 3.769334775554354, + "grad_norm": 0.8379408121109009, + "learning_rate": 4.712573358624173e-06, + "loss": 1.6118, + "mean_token_accuracy": 0.6198552846908569, + "num_tokens": 7125822855.0, + "step": 13939 + }, + { + "epoch": 3.7696051919956735, + "grad_norm": 1.0331507921218872, + "learning_rate": 4.711445462020337e-06, + "loss": 1.8472, + "mean_token_accuracy": 0.5723875761032104, + "num_tokens": 7126347082.0, + "step": 13940 + }, + { + "epoch": 3.769875608436993, + "grad_norm": 0.36108362674713135, + "learning_rate": 4.710317758367847e-06, + "loss": 1.0598, + "mean_token_accuracy": 0.7218345403671265, + "num_tokens": 7126854575.0, + "step": 13941 + }, + { + "epoch": 3.7701460248783127, + "grad_norm": 1.1142300367355347, + "learning_rate": 4.709190247701308e-06, + "loss": 1.7023, + "mean_token_accuracy": 0.5960395336151123, + "num_tokens": 7127378841.0, + "step": 13942 + }, + { + "epoch": 3.7704164413196324, + "grad_norm": 1.1197165250778198, + "learning_rate": 4.708062930055313e-06, + "loss": 1.8347, + "mean_token_accuracy": 0.5845121145248413, + "num_tokens": 7127903044.0, + "step": 13943 + }, + { + "epoch": 3.770686857760952, + "grad_norm": 0.9613603353500366, + "learning_rate": 4.706935805464448e-06, + "loss": 1.8992, + "mean_token_accuracy": 0.5656500458717346, + "num_tokens": 7128427320.0, + "step": 13944 + }, + { + "epoch": 3.7709572742022717, + "grad_norm": 0.9197956323623657, + "learning_rate": 4.7058088739633e-06, + "loss": 1.8068, + "mean_token_accuracy": 0.5625325441360474, + "num_tokens": 7128951452.0, + "step": 13945 + }, + { + "epoch": 3.771227690643591, + "grad_norm": 1.0955828428268433, + "learning_rate": 4.704682135586448e-06, + "loss": 1.8322, + "mean_token_accuracy": 0.5619832277297974, + "num_tokens": 7129475637.0, + "step": 13946 + }, + { + "epoch": 3.771498107084911, + "grad_norm": 1.090384840965271, + "learning_rate": 4.703555590368458e-06, + "loss": 1.7637, + "mean_token_accuracy": 0.6026578545570374, + "num_tokens": 7129999738.0, + "step": 13947 + }, + { + "epoch": 3.77176852352623, + "grad_norm": 0.9863815307617188, + "learning_rate": 4.702429238343901e-06, + "loss": 1.7596, + "mean_token_accuracy": 0.5868271589279175, + "num_tokens": 7130480356.0, + "step": 13948 + }, + { + "epoch": 3.7720389399675502, + "grad_norm": 0.9457159042358398, + "learning_rate": 4.701303079547336e-06, + "loss": 1.7267, + "mean_token_accuracy": 0.5849375128746033, + "num_tokens": 7131004450.0, + "step": 13949 + }, + { + "epoch": 3.7723093564088694, + "grad_norm": 1.0647637844085693, + "learning_rate": 4.700177114013312e-06, + "loss": 1.8328, + "mean_token_accuracy": 0.5758025646209717, + "num_tokens": 7131528669.0, + "step": 13950 + }, + { + "epoch": 3.7725797728501895, + "grad_norm": 0.8780775666236877, + "learning_rate": 4.699051341776384e-06, + "loss": 1.7426, + "mean_token_accuracy": 0.587997555732727, + "num_tokens": 7132052874.0, + "step": 13951 + }, + { + "epoch": 3.7728501892915087, + "grad_norm": 0.9408179521560669, + "learning_rate": 4.6979257628710874e-06, + "loss": 1.9062, + "mean_token_accuracy": 0.5662556886672974, + "num_tokens": 7132564259.0, + "step": 13952 + }, + { + "epoch": 3.773120605732829, + "grad_norm": 1.0038976669311523, + "learning_rate": 4.696800377331966e-06, + "loss": 1.8394, + "mean_token_accuracy": 0.5851538777351379, + "num_tokens": 7133088487.0, + "step": 13953 + }, + { + "epoch": 3.773391022174148, + "grad_norm": 0.9083520174026489, + "learning_rate": 4.695675185193541e-06, + "loss": 1.8801, + "mean_token_accuracy": 0.5474730730056763, + "num_tokens": 7133612595.0, + "step": 13954 + }, + { + "epoch": 3.7736614386154677, + "grad_norm": 1.054334282875061, + "learning_rate": 4.694550186490345e-06, + "loss": 1.9197, + "mean_token_accuracy": 0.5675859451293945, + "num_tokens": 7134136835.0, + "step": 13955 + }, + { + "epoch": 3.7739318550567873, + "grad_norm": 1.0366051197052002, + "learning_rate": 4.693425381256889e-06, + "loss": 1.75, + "mean_token_accuracy": 0.583609938621521, + "num_tokens": 7134642750.0, + "step": 13956 + }, + { + "epoch": 3.774202271498107, + "grad_norm": 0.9343360066413879, + "learning_rate": 4.692300769527693e-06, + "loss": 1.8367, + "mean_token_accuracy": 0.5842931270599365, + "num_tokens": 7135134970.0, + "step": 13957 + }, + { + "epoch": 3.7744726879394266, + "grad_norm": 1.132145881652832, + "learning_rate": 4.691176351337254e-06, + "loss": 1.9499, + "mean_token_accuracy": 0.56364905834198, + "num_tokens": 7135659235.0, + "step": 13958 + }, + { + "epoch": 3.7747431043807462, + "grad_norm": 1.0854421854019165, + "learning_rate": 4.690052126720082e-06, + "loss": 1.725, + "mean_token_accuracy": 0.5752631425857544, + "num_tokens": 7136183350.0, + "step": 13959 + }, + { + "epoch": 3.775013520822066, + "grad_norm": 0.897505521774292, + "learning_rate": 4.688928095710667e-06, + "loss": 1.6097, + "mean_token_accuracy": 0.6444881558418274, + "num_tokens": 7136707589.0, + "step": 13960 + }, + { + "epoch": 3.7752839372633855, + "grad_norm": 0.3731604516506195, + "learning_rate": 4.687804258343494e-06, + "loss": 1.0932, + "mean_token_accuracy": 0.7155004739761353, + "num_tokens": 7137214479.0, + "step": 13961 + }, + { + "epoch": 3.775554353704705, + "grad_norm": 1.0255554914474487, + "learning_rate": 4.686680614653052e-06, + "loss": 1.757, + "mean_token_accuracy": 0.5786751508712769, + "num_tokens": 7137738655.0, + "step": 13962 + }, + { + "epoch": 3.775824770146025, + "grad_norm": 1.0391780138015747, + "learning_rate": 4.685557164673816e-06, + "loss": 1.8661, + "mean_token_accuracy": 0.5610274076461792, + "num_tokens": 7138257664.0, + "step": 13963 + }, + { + "epoch": 3.7760951865873444, + "grad_norm": 1.1222902536392212, + "learning_rate": 4.684433908440252e-06, + "loss": 1.9433, + "mean_token_accuracy": 0.5640541315078735, + "num_tokens": 7138750298.0, + "step": 13964 + }, + { + "epoch": 3.776365603028664, + "grad_norm": 0.9696990847587585, + "learning_rate": 4.683310845986832e-06, + "loss": 1.8831, + "mean_token_accuracy": 0.5720404386520386, + "num_tokens": 7139212348.0, + "step": 13965 + }, + { + "epoch": 3.7766360194699837, + "grad_norm": 0.8858650326728821, + "learning_rate": 4.682187977348011e-06, + "loss": 1.8894, + "mean_token_accuracy": 0.5872929692268372, + "num_tokens": 7139700743.0, + "step": 13966 + }, + { + "epoch": 3.7769064359113034, + "grad_norm": 0.9694337248802185, + "learning_rate": 4.68106530255824e-06, + "loss": 1.7606, + "mean_token_accuracy": 0.572507381439209, + "num_tokens": 7140224901.0, + "step": 13967 + }, + { + "epoch": 3.777176852352623, + "grad_norm": 1.1447659730911255, + "learning_rate": 4.679942821651969e-06, + "loss": 1.9566, + "mean_token_accuracy": 0.5502451658248901, + "num_tokens": 7140690271.0, + "step": 13968 + }, + { + "epoch": 3.7774472687939427, + "grad_norm": 1.0176770687103271, + "learning_rate": 4.6788205346636395e-06, + "loss": 1.9051, + "mean_token_accuracy": 0.5607650876045227, + "num_tokens": 7141214516.0, + "step": 13969 + }, + { + "epoch": 3.7777176852352623, + "grad_norm": 0.9322683215141296, + "learning_rate": 4.6776984416276835e-06, + "loss": 1.8309, + "mean_token_accuracy": 0.5711662769317627, + "num_tokens": 7141727044.0, + "step": 13970 + }, + { + "epoch": 3.777988101676582, + "grad_norm": 0.9689330458641052, + "learning_rate": 4.676576542578535e-06, + "loss": 1.8556, + "mean_token_accuracy": 0.5770367383956909, + "num_tokens": 7142251057.0, + "step": 13971 + }, + { + "epoch": 3.7782585181179016, + "grad_norm": 1.0132715702056885, + "learning_rate": 4.675454837550613e-06, + "loss": 1.7662, + "mean_token_accuracy": 0.5857087969779968, + "num_tokens": 7142722354.0, + "step": 13972 + }, + { + "epoch": 3.7785289345592212, + "grad_norm": 1.0232174396514893, + "learning_rate": 4.674333326578333e-06, + "loss": 1.8578, + "mean_token_accuracy": 0.5584629774093628, + "num_tokens": 7143246562.0, + "step": 13973 + }, + { + "epoch": 3.778799351000541, + "grad_norm": 1.1461421251296997, + "learning_rate": 4.673212009696114e-06, + "loss": 1.8137, + "mean_token_accuracy": 0.6221139430999756, + "num_tokens": 7143645941.0, + "step": 13974 + }, + { + "epoch": 3.7790697674418605, + "grad_norm": 1.0079214572906494, + "learning_rate": 4.672090886938352e-06, + "loss": 1.8852, + "mean_token_accuracy": 0.5687541961669922, + "num_tokens": 7144170185.0, + "step": 13975 + }, + { + "epoch": 3.77934018388318, + "grad_norm": 1.38954758644104, + "learning_rate": 4.670969958339454e-06, + "loss": 1.8114, + "mean_token_accuracy": 0.5748742818832397, + "num_tokens": 7144654862.0, + "step": 13976 + }, + { + "epoch": 3.7796106003245, + "grad_norm": 1.0017329454421997, + "learning_rate": 4.66984922393381e-06, + "loss": 1.8141, + "mean_token_accuracy": 0.5604089498519897, + "num_tokens": 7145179112.0, + "step": 13977 + }, + { + "epoch": 3.7798810167658194, + "grad_norm": 0.8995741605758667, + "learning_rate": 4.668728683755804e-06, + "loss": 1.8197, + "mean_token_accuracy": 0.5892243981361389, + "num_tokens": 7145683915.0, + "step": 13978 + }, + { + "epoch": 3.780151433207139, + "grad_norm": 0.8009076714515686, + "learning_rate": 4.667608337839825e-06, + "loss": 1.7615, + "mean_token_accuracy": 0.5950617790222168, + "num_tokens": 7146188789.0, + "step": 13979 + }, + { + "epoch": 3.7804218496484587, + "grad_norm": 0.9254591464996338, + "learning_rate": 4.666488186220244e-06, + "loss": 1.8217, + "mean_token_accuracy": 0.5599396824836731, + "num_tokens": 7146713069.0, + "step": 13980 + }, + { + "epoch": 3.7806922660897784, + "grad_norm": 0.35009151697158813, + "learning_rate": 4.6653682289314285e-06, + "loss": 1.0503, + "mean_token_accuracy": 0.7137564420700073, + "num_tokens": 7147182642.0, + "step": 13981 + }, + { + "epoch": 3.780962682531098, + "grad_norm": 0.9257825016975403, + "learning_rate": 4.664248466007747e-06, + "loss": 1.7933, + "mean_token_accuracy": 0.5764990448951721, + "num_tokens": 7147696278.0, + "step": 13982 + }, + { + "epoch": 3.7812330989724177, + "grad_norm": 0.9592824578285217, + "learning_rate": 4.663128897483553e-06, + "loss": 1.8584, + "mean_token_accuracy": 0.560454785823822, + "num_tokens": 7148220412.0, + "step": 13983 + }, + { + "epoch": 3.7815035154137373, + "grad_norm": 0.9127463698387146, + "learning_rate": 4.662009523393199e-06, + "loss": 1.7748, + "mean_token_accuracy": 0.593677818775177, + "num_tokens": 7148744632.0, + "step": 13984 + }, + { + "epoch": 3.781773931855057, + "grad_norm": 0.8029187321662903, + "learning_rate": 4.660890343771035e-06, + "loss": 1.8886, + "mean_token_accuracy": 0.566369891166687, + "num_tokens": 7149268867.0, + "step": 13985 + }, + { + "epoch": 3.7820443482963766, + "grad_norm": 0.9336532950401306, + "learning_rate": 4.6597713586513975e-06, + "loss": 1.8441, + "mean_token_accuracy": 0.5697646737098694, + "num_tokens": 7149738071.0, + "step": 13986 + }, + { + "epoch": 3.782314764737696, + "grad_norm": 0.9074380993843079, + "learning_rate": 4.658652568068617e-06, + "loss": 1.8753, + "mean_token_accuracy": 0.5714854001998901, + "num_tokens": 7150262335.0, + "step": 13987 + }, + { + "epoch": 3.782585181179016, + "grad_norm": 0.9328793287277222, + "learning_rate": 4.657533972057026e-06, + "loss": 1.8221, + "mean_token_accuracy": 0.6005399227142334, + "num_tokens": 7150752812.0, + "step": 13988 + }, + { + "epoch": 3.782855597620335, + "grad_norm": 0.9552906155586243, + "learning_rate": 4.656415570650947e-06, + "loss": 1.8381, + "mean_token_accuracy": 0.5786675214767456, + "num_tokens": 7151260113.0, + "step": 13989 + }, + { + "epoch": 3.783126014061655, + "grad_norm": 0.8292282223701477, + "learning_rate": 4.6552973638846895e-06, + "loss": 1.7445, + "mean_token_accuracy": 0.5923537611961365, + "num_tokens": 7151784173.0, + "step": 13990 + }, + { + "epoch": 3.7833964305029744, + "grad_norm": 0.99163818359375, + "learning_rate": 4.654179351792571e-06, + "loss": 1.6337, + "mean_token_accuracy": 0.6083065867424011, + "num_tokens": 7152308421.0, + "step": 13991 + }, + { + "epoch": 3.7836668469442944, + "grad_norm": 1.0652375221252441, + "learning_rate": 4.653061534408892e-06, + "loss": 1.8821, + "mean_token_accuracy": 0.5676000118255615, + "num_tokens": 7152832592.0, + "step": 13992 + }, + { + "epoch": 3.7839372633856136, + "grad_norm": 0.8914517760276794, + "learning_rate": 4.6519439117679455e-06, + "loss": 1.8402, + "mean_token_accuracy": 0.5606673359870911, + "num_tokens": 7153356799.0, + "step": 13993 + }, + { + "epoch": 3.7842076798269337, + "grad_norm": 0.8960496187210083, + "learning_rate": 4.650826483904032e-06, + "loss": 1.8004, + "mean_token_accuracy": 0.587409496307373, + "num_tokens": 7153881006.0, + "step": 13994 + }, + { + "epoch": 3.784478096268253, + "grad_norm": 0.8902910947799683, + "learning_rate": 4.649709250851434e-06, + "loss": 1.7411, + "mean_token_accuracy": 0.5828360319137573, + "num_tokens": 7154405290.0, + "step": 13995 + }, + { + "epoch": 3.7847485127095726, + "grad_norm": 0.8614115715026855, + "learning_rate": 4.6485922126444286e-06, + "loss": 1.7828, + "mean_token_accuracy": 0.5696067810058594, + "num_tokens": 7154929485.0, + "step": 13996 + }, + { + "epoch": 3.785018929150892, + "grad_norm": 1.1082521677017212, + "learning_rate": 4.647475369317292e-06, + "loss": 1.7434, + "mean_token_accuracy": 0.5826637744903564, + "num_tokens": 7155453755.0, + "step": 13997 + }, + { + "epoch": 3.785289345592212, + "grad_norm": 1.0331673622131348, + "learning_rate": 4.646358720904295e-06, + "loss": 1.8055, + "mean_token_accuracy": 0.575037956237793, + "num_tokens": 7155977900.0, + "step": 13998 + }, + { + "epoch": 3.7855597620335315, + "grad_norm": 0.8776357173919678, + "learning_rate": 4.645242267439695e-06, + "loss": 1.7631, + "mean_token_accuracy": 0.5779415965080261, + "num_tokens": 7156449586.0, + "step": 13999 + }, + { + "epoch": 3.785830178474851, + "grad_norm": 0.9622188210487366, + "learning_rate": 4.6441260089577525e-06, + "loss": 1.8863, + "mean_token_accuracy": 0.5630784034729004, + "num_tokens": 7156973765.0, + "step": 14000 + }, + { + "epoch": 3.786100594916171, + "grad_norm": 0.36229202151298523, + "learning_rate": 4.643009945492713e-06, + "loss": 1.1036, + "mean_token_accuracy": 0.7013488411903381, + "num_tokens": 7157446362.0, + "step": 14001 + }, + { + "epoch": 3.7863710113574904, + "grad_norm": 0.9052359461784363, + "learning_rate": 4.6418940770788266e-06, + "loss": 1.8414, + "mean_token_accuracy": 0.5798509120941162, + "num_tokens": 7157951157.0, + "step": 14002 + }, + { + "epoch": 3.78664142779881, + "grad_norm": 1.22537362575531, + "learning_rate": 4.640778403750327e-06, + "loss": 1.9267, + "mean_token_accuracy": 0.5603059530258179, + "num_tokens": 7158475390.0, + "step": 14003 + }, + { + "epoch": 3.7869118442401297, + "grad_norm": 0.9677452445030212, + "learning_rate": 4.639662925541445e-06, + "loss": 1.778, + "mean_token_accuracy": 0.5816866755485535, + "num_tokens": 7158999598.0, + "step": 14004 + }, + { + "epoch": 3.7871822606814494, + "grad_norm": 0.8556852340698242, + "learning_rate": 4.638547642486412e-06, + "loss": 1.6628, + "mean_token_accuracy": 0.6025185585021973, + "num_tokens": 7159523818.0, + "step": 14005 + }, + { + "epoch": 3.787452677122769, + "grad_norm": 0.9845592379570007, + "learning_rate": 4.6374325546194445e-06, + "loss": 1.8384, + "mean_token_accuracy": 0.562147319316864, + "num_tokens": 7160048040.0, + "step": 14006 + }, + { + "epoch": 3.7877230935640886, + "grad_norm": 1.1330156326293945, + "learning_rate": 4.636317661974755e-06, + "loss": 1.8772, + "mean_token_accuracy": 0.5745126008987427, + "num_tokens": 7160523733.0, + "step": 14007 + }, + { + "epoch": 3.7879935100054083, + "grad_norm": 0.91964191198349, + "learning_rate": 4.635202964586556e-06, + "loss": 1.7729, + "mean_token_accuracy": 0.5934051275253296, + "num_tokens": 7161047883.0, + "step": 14008 + }, + { + "epoch": 3.788263926446728, + "grad_norm": 1.0059014558792114, + "learning_rate": 4.634088462489048e-06, + "loss": 1.9608, + "mean_token_accuracy": 0.5479806661605835, + "num_tokens": 7161572157.0, + "step": 14009 + }, + { + "epoch": 3.7885343428880476, + "grad_norm": 0.9592495560646057, + "learning_rate": 4.6329741557164254e-06, + "loss": 1.9682, + "mean_token_accuracy": 0.5590238571166992, + "num_tokens": 7162096433.0, + "step": 14010 + }, + { + "epoch": 3.7888047593293672, + "grad_norm": 1.017443060874939, + "learning_rate": 4.631860044302879e-06, + "loss": 1.8167, + "mean_token_accuracy": 0.5710165500640869, + "num_tokens": 7162620689.0, + "step": 14011 + }, + { + "epoch": 3.789075175770687, + "grad_norm": 1.0862301588058472, + "learning_rate": 4.6307461282825965e-06, + "loss": 1.861, + "mean_token_accuracy": 0.5823757648468018, + "num_tokens": 7163115616.0, + "step": 14012 + }, + { + "epoch": 3.7893455922120065, + "grad_norm": 0.9547654390335083, + "learning_rate": 4.62963240768975e-06, + "loss": 1.9662, + "mean_token_accuracy": 0.5649470090866089, + "num_tokens": 7163614772.0, + "step": 14013 + }, + { + "epoch": 3.789616008653326, + "grad_norm": 0.980137288570404, + "learning_rate": 4.6285188825585195e-06, + "loss": 1.8926, + "mean_token_accuracy": 0.562386691570282, + "num_tokens": 7164138913.0, + "step": 14014 + }, + { + "epoch": 3.789886425094646, + "grad_norm": 0.8861856460571289, + "learning_rate": 4.627405552923067e-06, + "loss": 1.7781, + "mean_token_accuracy": 0.5876435041427612, + "num_tokens": 7164663146.0, + "step": 14015 + }, + { + "epoch": 3.7901568415359654, + "grad_norm": 1.0820683240890503, + "learning_rate": 4.626292418817548e-06, + "loss": 1.8161, + "mean_token_accuracy": 0.5781996846199036, + "num_tokens": 7165187284.0, + "step": 14016 + }, + { + "epoch": 3.790427257977285, + "grad_norm": 1.0400136709213257, + "learning_rate": 4.625179480276125e-06, + "loss": 1.7872, + "mean_token_accuracy": 0.5957999229431152, + "num_tokens": 7165618725.0, + "step": 14017 + }, + { + "epoch": 3.7906976744186047, + "grad_norm": 0.9842260479927063, + "learning_rate": 4.624066737332944e-06, + "loss": 1.9345, + "mean_token_accuracy": 0.5570687055587769, + "num_tokens": 7166142918.0, + "step": 14018 + }, + { + "epoch": 3.7909680908599244, + "grad_norm": 1.044751524925232, + "learning_rate": 4.622954190022142e-06, + "loss": 1.8738, + "mean_token_accuracy": 0.5577547550201416, + "num_tokens": 7166667096.0, + "step": 14019 + }, + { + "epoch": 3.791238507301244, + "grad_norm": 1.0312267541885376, + "learning_rate": 4.621841838377862e-06, + "loss": 1.9243, + "mean_token_accuracy": 0.5733764171600342, + "num_tokens": 7167191276.0, + "step": 14020 + }, + { + "epoch": 3.7915089237425637, + "grad_norm": 0.360150009393692, + "learning_rate": 4.620729682434229e-06, + "loss": 1.0575, + "mean_token_accuracy": 0.714280366897583, + "num_tokens": 7167715494.0, + "step": 14021 + }, + { + "epoch": 3.7917793401838833, + "grad_norm": 0.947127103805542, + "learning_rate": 4.619617722225372e-06, + "loss": 1.8331, + "mean_token_accuracy": 0.5833228230476379, + "num_tokens": 7168239762.0, + "step": 14022 + }, + { + "epoch": 3.792049756625203, + "grad_norm": 0.8514626622200012, + "learning_rate": 4.618505957785407e-06, + "loss": 1.7676, + "mean_token_accuracy": 0.5788702964782715, + "num_tokens": 7168764044.0, + "step": 14023 + }, + { + "epoch": 3.7923201730665226, + "grad_norm": 0.9503388404846191, + "learning_rate": 4.617394389148443e-06, + "loss": 2.0059, + "mean_token_accuracy": 0.5482184886932373, + "num_tokens": 7169288291.0, + "step": 14024 + }, + { + "epoch": 3.7925905895078422, + "grad_norm": 0.9153778553009033, + "learning_rate": 4.6162830163485926e-06, + "loss": 1.7713, + "mean_token_accuracy": 0.6102128028869629, + "num_tokens": 7169812549.0, + "step": 14025 + }, + { + "epoch": 3.792861005949162, + "grad_norm": 0.817124605178833, + "learning_rate": 4.615171839419949e-06, + "loss": 1.8278, + "mean_token_accuracy": 0.5792927145957947, + "num_tokens": 7170336834.0, + "step": 14026 + }, + { + "epoch": 3.7931314223904815, + "grad_norm": 0.8196430802345276, + "learning_rate": 4.61406085839661e-06, + "loss": 1.8594, + "mean_token_accuracy": 0.5604745149612427, + "num_tokens": 7170861099.0, + "step": 14027 + }, + { + "epoch": 3.7934018388318007, + "grad_norm": 0.9012004137039185, + "learning_rate": 4.612950073312666e-06, + "loss": 1.9651, + "mean_token_accuracy": 0.578109622001648, + "num_tokens": 7171321397.0, + "step": 14028 + }, + { + "epoch": 3.793672255273121, + "grad_norm": 1.0825024843215942, + "learning_rate": 4.611839484202197e-06, + "loss": 1.8544, + "mean_token_accuracy": 0.5634241104125977, + "num_tokens": 7171827441.0, + "step": 14029 + }, + { + "epoch": 3.79394267171444, + "grad_norm": 0.9132717251777649, + "learning_rate": 4.610729091099277e-06, + "loss": 1.8353, + "mean_token_accuracy": 0.5712577700614929, + "num_tokens": 7172351572.0, + "step": 14030 + }, + { + "epoch": 3.79421308815576, + "grad_norm": 1.0099276304244995, + "learning_rate": 4.60961889403798e-06, + "loss": 1.8009, + "mean_token_accuracy": 0.5944814682006836, + "num_tokens": 7172875721.0, + "step": 14031 + }, + { + "epoch": 3.7944835045970793, + "grad_norm": 0.9764655828475952, + "learning_rate": 4.6085088930523695e-06, + "loss": 1.7498, + "mean_token_accuracy": 0.5964375734329224, + "num_tokens": 7173392895.0, + "step": 14032 + }, + { + "epoch": 3.7947539210383994, + "grad_norm": 0.9090791344642639, + "learning_rate": 4.607399088176499e-06, + "loss": 1.8284, + "mean_token_accuracy": 0.5848595499992371, + "num_tokens": 7173899227.0, + "step": 14033 + }, + { + "epoch": 3.7950243374797186, + "grad_norm": 0.9182701706886292, + "learning_rate": 4.606289479444425e-06, + "loss": 1.8201, + "mean_token_accuracy": 0.5787890553474426, + "num_tokens": 7174387384.0, + "step": 14034 + }, + { + "epoch": 3.7952947539210387, + "grad_norm": 0.905597448348999, + "learning_rate": 4.605180066890195e-06, + "loss": 1.82, + "mean_token_accuracy": 0.5697975158691406, + "num_tokens": 7174911578.0, + "step": 14035 + }, + { + "epoch": 3.795565170362358, + "grad_norm": 1.0175012350082397, + "learning_rate": 4.604070850547842e-06, + "loss": 1.9464, + "mean_token_accuracy": 0.564944863319397, + "num_tokens": 7175435853.0, + "step": 14036 + }, + { + "epoch": 3.7958355868036775, + "grad_norm": 0.9696654081344604, + "learning_rate": 4.602961830451408e-06, + "loss": 1.8372, + "mean_token_accuracy": 0.5762667655944824, + "num_tokens": 7175927329.0, + "step": 14037 + }, + { + "epoch": 3.796106003244997, + "grad_norm": 1.1089088916778564, + "learning_rate": 4.601853006634915e-06, + "loss": 1.8557, + "mean_token_accuracy": 0.5736324787139893, + "num_tokens": 7176451442.0, + "step": 14038 + }, + { + "epoch": 3.796376419686317, + "grad_norm": 1.0988496541976929, + "learning_rate": 4.6007443791323896e-06, + "loss": 2.0512, + "mean_token_accuracy": 0.5332598686218262, + "num_tokens": 7176975699.0, + "step": 14039 + }, + { + "epoch": 3.7966468361276364, + "grad_norm": 1.0920617580413818, + "learning_rate": 4.599635947977843e-06, + "loss": 1.98, + "mean_token_accuracy": 0.5572836399078369, + "num_tokens": 7177499959.0, + "step": 14040 + }, + { + "epoch": 3.796917252568956, + "grad_norm": 0.3550908863544464, + "learning_rate": 4.598527713205291e-06, + "loss": 1.0162, + "mean_token_accuracy": 0.7185678482055664, + "num_tokens": 7177998423.0, + "step": 14041 + }, + { + "epoch": 3.7971876690102757, + "grad_norm": 1.081615686416626, + "learning_rate": 4.597419674848731e-06, + "loss": 1.7649, + "mean_token_accuracy": 0.5866890549659729, + "num_tokens": 7178490061.0, + "step": 14042 + }, + { + "epoch": 3.7974580854515954, + "grad_norm": 1.0946295261383057, + "learning_rate": 4.5963118329421675e-06, + "loss": 1.7927, + "mean_token_accuracy": 0.5925506353378296, + "num_tokens": 7178968015.0, + "step": 14043 + }, + { + "epoch": 3.797728501892915, + "grad_norm": 0.995517373085022, + "learning_rate": 4.595204187519585e-06, + "loss": 1.8318, + "mean_token_accuracy": 0.588627815246582, + "num_tokens": 7179492222.0, + "step": 14044 + }, + { + "epoch": 3.7979989183342346, + "grad_norm": 0.9688107371330261, + "learning_rate": 4.594096738614977e-06, + "loss": 1.854, + "mean_token_accuracy": 0.5857785940170288, + "num_tokens": 7180016468.0, + "step": 14045 + }, + { + "epoch": 3.7982693347755543, + "grad_norm": 1.3032926321029663, + "learning_rate": 4.592989486262319e-06, + "loss": 1.7277, + "mean_token_accuracy": 0.6030269861221313, + "num_tokens": 7180540704.0, + "step": 14046 + }, + { + "epoch": 3.798539751216874, + "grad_norm": 1.0224977731704712, + "learning_rate": 4.591882430495583e-06, + "loss": 1.747, + "mean_token_accuracy": 0.5851010084152222, + "num_tokens": 7181064736.0, + "step": 14047 + }, + { + "epoch": 3.7988101676581936, + "grad_norm": 1.0217422246932983, + "learning_rate": 4.590775571348743e-06, + "loss": 1.7558, + "mean_token_accuracy": 0.57138592004776, + "num_tokens": 7181588890.0, + "step": 14048 + }, + { + "epoch": 3.799080584099513, + "grad_norm": 1.045393943786621, + "learning_rate": 4.589668908855755e-06, + "loss": 1.8252, + "mean_token_accuracy": 0.5742710828781128, + "num_tokens": 7182113164.0, + "step": 14049 + }, + { + "epoch": 3.799351000540833, + "grad_norm": 1.0823510885238647, + "learning_rate": 4.5885624430505745e-06, + "loss": 1.9186, + "mean_token_accuracy": 0.5695415735244751, + "num_tokens": 7182637352.0, + "step": 14050 + }, + { + "epoch": 3.7996214169821525, + "grad_norm": 0.9147940278053284, + "learning_rate": 4.587456173967155e-06, + "loss": 1.8024, + "mean_token_accuracy": 0.5855773091316223, + "num_tokens": 7183161525.0, + "step": 14051 + }, + { + "epoch": 3.799891833423472, + "grad_norm": 0.9696314334869385, + "learning_rate": 4.586350101639439e-06, + "loss": 1.7875, + "mean_token_accuracy": 0.5900266766548157, + "num_tokens": 7183685713.0, + "step": 14052 + }, + { + "epoch": 3.800162249864792, + "grad_norm": 1.255990982055664, + "learning_rate": 4.58524422610136e-06, + "loss": 1.9201, + "mean_token_accuracy": 0.593708872795105, + "num_tokens": 7184089099.0, + "step": 14053 + }, + { + "epoch": 3.8004326663061114, + "grad_norm": 1.0096213817596436, + "learning_rate": 4.584138547386853e-06, + "loss": 1.8366, + "mean_token_accuracy": 0.5522434115409851, + "num_tokens": 7184613238.0, + "step": 14054 + }, + { + "epoch": 3.800703082747431, + "grad_norm": 1.0543262958526611, + "learning_rate": 4.583033065529846e-06, + "loss": 1.7299, + "mean_token_accuracy": 0.612375795841217, + "num_tokens": 7185137440.0, + "step": 14055 + }, + { + "epoch": 3.8009734991887507, + "grad_norm": 0.9265981912612915, + "learning_rate": 4.581927780564252e-06, + "loss": 1.846, + "mean_token_accuracy": 0.5554972887039185, + "num_tokens": 7185655226.0, + "step": 14056 + }, + { + "epoch": 3.8012439156300704, + "grad_norm": 1.055303931236267, + "learning_rate": 4.5808226925239915e-06, + "loss": 1.8006, + "mean_token_accuracy": 0.5721276998519897, + "num_tokens": 7186179319.0, + "step": 14057 + }, + { + "epoch": 3.80151433207139, + "grad_norm": 1.132057785987854, + "learning_rate": 4.579717801442969e-06, + "loss": 1.823, + "mean_token_accuracy": 0.5754299759864807, + "num_tokens": 7186703506.0, + "step": 14058 + }, + { + "epoch": 3.8017847485127096, + "grad_norm": 1.023150086402893, + "learning_rate": 4.578613107355081e-06, + "loss": 1.9345, + "mean_token_accuracy": 0.5824651718139648, + "num_tokens": 7187164835.0, + "step": 14059 + }, + { + "epoch": 3.8020551649540293, + "grad_norm": 1.0335323810577393, + "learning_rate": 4.577508610294231e-06, + "loss": 1.8218, + "mean_token_accuracy": 0.5827406644821167, + "num_tokens": 7187662853.0, + "step": 14060 + }, + { + "epoch": 3.802325581395349, + "grad_norm": 0.3749905526638031, + "learning_rate": 4.576404310294303e-06, + "loss": 1.1149, + "mean_token_accuracy": 0.6878683567047119, + "num_tokens": 7188186863.0, + "step": 14061 + }, + { + "epoch": 3.8025959978366686, + "grad_norm": 1.0283770561218262, + "learning_rate": 4.575300207389179e-06, + "loss": 1.7568, + "mean_token_accuracy": 0.5792269706726074, + "num_tokens": 7188711136.0, + "step": 14062 + }, + { + "epoch": 3.802866414277988, + "grad_norm": 1.016735315322876, + "learning_rate": 4.574196301612741e-06, + "loss": 1.9701, + "mean_token_accuracy": 0.5753282904624939, + "num_tokens": 7189124418.0, + "step": 14063 + }, + { + "epoch": 3.803136830719308, + "grad_norm": 0.8322843909263611, + "learning_rate": 4.573092592998858e-06, + "loss": 1.8058, + "mean_token_accuracy": 0.5872421860694885, + "num_tokens": 7189596419.0, + "step": 14064 + }, + { + "epoch": 3.8034072471606275, + "grad_norm": 1.011763572692871, + "learning_rate": 4.5719890815813925e-06, + "loss": 1.8329, + "mean_token_accuracy": 0.5776734352111816, + "num_tokens": 7190120401.0, + "step": 14065 + }, + { + "epoch": 3.803677663601947, + "grad_norm": 0.8462291955947876, + "learning_rate": 4.570885767394208e-06, + "loss": 1.7306, + "mean_token_accuracy": 0.5981112718582153, + "num_tokens": 7190596051.0, + "step": 14066 + }, + { + "epoch": 3.803948080043267, + "grad_norm": 1.032257080078125, + "learning_rate": 4.5697826504711504e-06, + "loss": 1.9049, + "mean_token_accuracy": 0.5785922408103943, + "num_tokens": 7191110431.0, + "step": 14067 + }, + { + "epoch": 3.8042184964845864, + "grad_norm": 0.9781187176704407, + "learning_rate": 4.568679730846073e-06, + "loss": 1.9561, + "mean_token_accuracy": 0.5617667436599731, + "num_tokens": 7191609179.0, + "step": 14068 + }, + { + "epoch": 3.8044889129259056, + "grad_norm": 0.8330075740814209, + "learning_rate": 4.5675770085528175e-06, + "loss": 1.8081, + "mean_token_accuracy": 0.5750068426132202, + "num_tokens": 7192133392.0, + "step": 14069 + }, + { + "epoch": 3.8047593293672257, + "grad_norm": 0.9598301649093628, + "learning_rate": 4.566474483625212e-06, + "loss": 1.7903, + "mean_token_accuracy": 0.5704678297042847, + "num_tokens": 7192657651.0, + "step": 14070 + }, + { + "epoch": 3.805029745808545, + "grad_norm": 0.8216448426246643, + "learning_rate": 4.565372156097092e-06, + "loss": 1.8847, + "mean_token_accuracy": 0.5647265911102295, + "num_tokens": 7193181846.0, + "step": 14071 + }, + { + "epoch": 3.805300162249865, + "grad_norm": 1.0022528171539307, + "learning_rate": 4.564270026002278e-06, + "loss": 1.8068, + "mean_token_accuracy": 0.5810876488685608, + "num_tokens": 7193652232.0, + "step": 14072 + }, + { + "epoch": 3.805570578691184, + "grad_norm": 0.9134155511856079, + "learning_rate": 4.5631680933745825e-06, + "loss": 1.8498, + "mean_token_accuracy": 0.5694580674171448, + "num_tokens": 7194176487.0, + "step": 14073 + }, + { + "epoch": 3.8058409951325043, + "grad_norm": 0.887410581111908, + "learning_rate": 4.5620663582478216e-06, + "loss": 1.7815, + "mean_token_accuracy": 0.5638921856880188, + "num_tokens": 7194655919.0, + "step": 14074 + }, + { + "epoch": 3.8061114115738235, + "grad_norm": 1.0693883895874023, + "learning_rate": 4.560964820655799e-06, + "loss": 1.7243, + "mean_token_accuracy": 0.5947526693344116, + "num_tokens": 7195164751.0, + "step": 14075 + }, + { + "epoch": 3.8063818280151436, + "grad_norm": 0.9913699626922607, + "learning_rate": 4.559863480632308e-06, + "loss": 1.881, + "mean_token_accuracy": 0.5770796537399292, + "num_tokens": 7195688968.0, + "step": 14076 + }, + { + "epoch": 3.8066522444564628, + "grad_norm": 0.8565535545349121, + "learning_rate": 4.558762338211148e-06, + "loss": 1.8954, + "mean_token_accuracy": 0.5720372200012207, + "num_tokens": 7196213177.0, + "step": 14077 + }, + { + "epoch": 3.806922660897783, + "grad_norm": 0.9545878171920776, + "learning_rate": 4.557661393426101e-06, + "loss": 1.9893, + "mean_token_accuracy": 0.5632219314575195, + "num_tokens": 7196689578.0, + "step": 14078 + }, + { + "epoch": 3.807193077339102, + "grad_norm": 1.759676456451416, + "learning_rate": 4.556560646310946e-06, + "loss": 1.7725, + "mean_token_accuracy": 0.6094347834587097, + "num_tokens": 7197169397.0, + "step": 14079 + }, + { + "epoch": 3.8074634937804217, + "grad_norm": 1.2538243532180786, + "learning_rate": 4.555460096899462e-06, + "loss": 1.898, + "mean_token_accuracy": 0.5597131252288818, + "num_tokens": 7197693682.0, + "step": 14080 + }, + { + "epoch": 3.8077339102217413, + "grad_norm": 0.3496958315372467, + "learning_rate": 4.5543597452254115e-06, + "loss": 1.05, + "mean_token_accuracy": 0.7224193811416626, + "num_tokens": 7198217901.0, + "step": 14081 + }, + { + "epoch": 3.808004326663061, + "grad_norm": 1.0459399223327637, + "learning_rate": 4.553259591322562e-06, + "loss": 1.8354, + "mean_token_accuracy": 0.5572141408920288, + "num_tokens": 7198722476.0, + "step": 14082 + }, + { + "epoch": 3.8082747431043806, + "grad_norm": 1.1554267406463623, + "learning_rate": 4.552159635224665e-06, + "loss": 1.8718, + "mean_token_accuracy": 0.5516899824142456, + "num_tokens": 7199246745.0, + "step": 14083 + }, + { + "epoch": 3.8085451595457003, + "grad_norm": 1.1463862657546997, + "learning_rate": 4.551059876965474e-06, + "loss": 2.1069, + "mean_token_accuracy": 0.515471339225769, + "num_tokens": 7199771007.0, + "step": 14084 + }, + { + "epoch": 3.80881557598702, + "grad_norm": 0.9056849479675293, + "learning_rate": 4.549960316578729e-06, + "loss": 1.8991, + "mean_token_accuracy": 0.5609220862388611, + "num_tokens": 7200295256.0, + "step": 14085 + }, + { + "epoch": 3.8090859924283396, + "grad_norm": 0.9057946801185608, + "learning_rate": 4.548860954098171e-06, + "loss": 2.0243, + "mean_token_accuracy": 0.5378372073173523, + "num_tokens": 7200794647.0, + "step": 14086 + }, + { + "epoch": 3.809356408869659, + "grad_norm": 0.9511595368385315, + "learning_rate": 4.547761789557533e-06, + "loss": 1.6867, + "mean_token_accuracy": 0.6107912063598633, + "num_tokens": 7201236554.0, + "step": 14087 + }, + { + "epoch": 3.809626825310979, + "grad_norm": 0.9140512347221375, + "learning_rate": 4.546662822990534e-06, + "loss": 1.7582, + "mean_token_accuracy": 0.5952292680740356, + "num_tokens": 7201760796.0, + "step": 14088 + }, + { + "epoch": 3.8098972417522985, + "grad_norm": 0.9096605181694031, + "learning_rate": 4.5455640544309005e-06, + "loss": 1.7902, + "mean_token_accuracy": 0.5866926908493042, + "num_tokens": 7202178150.0, + "step": 14089 + }, + { + "epoch": 3.810167658193618, + "grad_norm": 0.9632577300071716, + "learning_rate": 4.544465483912339e-06, + "loss": 1.7567, + "mean_token_accuracy": 0.598539412021637, + "num_tokens": 7202702411.0, + "step": 14090 + }, + { + "epoch": 3.8104380746349378, + "grad_norm": 1.158598780632019, + "learning_rate": 4.543367111468566e-06, + "loss": 1.8995, + "mean_token_accuracy": 0.5750433206558228, + "num_tokens": 7203226622.0, + "step": 14091 + }, + { + "epoch": 3.8107084910762574, + "grad_norm": 0.9753044247627258, + "learning_rate": 4.542268937133276e-06, + "loss": 1.8804, + "mean_token_accuracy": 0.5705198049545288, + "num_tokens": 7203750788.0, + "step": 14092 + }, + { + "epoch": 3.810978907517577, + "grad_norm": 1.0992244482040405, + "learning_rate": 4.541170960940164e-06, + "loss": 1.8656, + "mean_token_accuracy": 0.5762951374053955, + "num_tokens": 7204236812.0, + "step": 14093 + }, + { + "epoch": 3.8112493239588967, + "grad_norm": 1.0338891744613647, + "learning_rate": 4.540073182922925e-06, + "loss": 1.8792, + "mean_token_accuracy": 0.5651682615280151, + "num_tokens": 7204760911.0, + "step": 14094 + }, + { + "epoch": 3.8115197404002163, + "grad_norm": 1.015924334526062, + "learning_rate": 4.538975603115236e-06, + "loss": 1.7893, + "mean_token_accuracy": 0.58693528175354, + "num_tokens": 7205285046.0, + "step": 14095 + }, + { + "epoch": 3.811790156841536, + "grad_norm": 1.0258243083953857, + "learning_rate": 4.537878221550775e-06, + "loss": 1.8057, + "mean_token_accuracy": 0.5878620147705078, + "num_tokens": 7205795588.0, + "step": 14096 + }, + { + "epoch": 3.8120605732828556, + "grad_norm": 0.926324725151062, + "learning_rate": 4.5367810382632125e-06, + "loss": 1.6986, + "mean_token_accuracy": 0.6033176779747009, + "num_tokens": 7206305517.0, + "step": 14097 + }, + { + "epoch": 3.8123309897241753, + "grad_norm": 0.8634114265441895, + "learning_rate": 4.5356840532862185e-06, + "loss": 1.8272, + "mean_token_accuracy": 0.5718313455581665, + "num_tokens": 7206829715.0, + "step": 14098 + }, + { + "epoch": 3.812601406165495, + "grad_norm": 0.8254830241203308, + "learning_rate": 4.534587266653445e-06, + "loss": 1.8195, + "mean_token_accuracy": 0.5807374715805054, + "num_tokens": 7207353975.0, + "step": 14099 + }, + { + "epoch": 3.8128718226068146, + "grad_norm": 0.8244414925575256, + "learning_rate": 4.53349067839855e-06, + "loss": 1.6938, + "mean_token_accuracy": 0.626314103603363, + "num_tokens": 7207878248.0, + "step": 14100 + }, + { + "epoch": 3.813142239048134, + "grad_norm": 0.37381815910339355, + "learning_rate": 4.532394288555179e-06, + "loss": 1.1097, + "mean_token_accuracy": 0.7071942090988159, + "num_tokens": 7208340584.0, + "step": 14101 + }, + { + "epoch": 3.813412655489454, + "grad_norm": 1.2643635272979736, + "learning_rate": 4.531298097156967e-06, + "loss": 1.8382, + "mean_token_accuracy": 0.5916852951049805, + "num_tokens": 7208864805.0, + "step": 14102 + }, + { + "epoch": 3.8136830719307735, + "grad_norm": 1.0476524829864502, + "learning_rate": 4.5302021042375574e-06, + "loss": 1.9311, + "mean_token_accuracy": 0.5770263671875, + "num_tokens": 7209326674.0, + "step": 14103 + }, + { + "epoch": 3.813953488372093, + "grad_norm": 0.9668064713478088, + "learning_rate": 4.529106309830573e-06, + "loss": 1.652, + "mean_token_accuracy": 0.6090558767318726, + "num_tokens": 7209791289.0, + "step": 14104 + }, + { + "epoch": 3.814223904813413, + "grad_norm": 1.1484278440475464, + "learning_rate": 4.528010713969632e-06, + "loss": 1.8634, + "mean_token_accuracy": 0.5719716548919678, + "num_tokens": 7210260038.0, + "step": 14105 + }, + { + "epoch": 3.8144943212547324, + "grad_norm": 1.1428489685058594, + "learning_rate": 4.526915316688361e-06, + "loss": 1.8129, + "mean_token_accuracy": 0.5723854303359985, + "num_tokens": 7210784303.0, + "step": 14106 + }, + { + "epoch": 3.814764737696052, + "grad_norm": 1.0155584812164307, + "learning_rate": 4.525820118020361e-06, + "loss": 1.9163, + "mean_token_accuracy": 0.5695446729660034, + "num_tokens": 7211308564.0, + "step": 14107 + }, + { + "epoch": 3.8150351541373717, + "grad_norm": 0.9071988463401794, + "learning_rate": 4.524725117999239e-06, + "loss": 1.815, + "mean_token_accuracy": 0.587875247001648, + "num_tokens": 7211832833.0, + "step": 14108 + }, + { + "epoch": 3.8153055705786914, + "grad_norm": 0.9363766312599182, + "learning_rate": 4.523630316658594e-06, + "loss": 1.8487, + "mean_token_accuracy": 0.5813592076301575, + "num_tokens": 7212299898.0, + "step": 14109 + }, + { + "epoch": 3.8155759870200106, + "grad_norm": 1.1129469871520996, + "learning_rate": 4.5225357140320145e-06, + "loss": 1.7871, + "mean_token_accuracy": 0.5902096033096313, + "num_tokens": 7212805509.0, + "step": 14110 + }, + { + "epoch": 3.8158464034613306, + "grad_norm": 1.1599922180175781, + "learning_rate": 4.521441310153088e-06, + "loss": 1.8479, + "mean_token_accuracy": 0.5688304901123047, + "num_tokens": 7213329700.0, + "step": 14111 + }, + { + "epoch": 3.81611681990265, + "grad_norm": 0.929515540599823, + "learning_rate": 4.520347105055398e-06, + "loss": 1.8356, + "mean_token_accuracy": 0.5773264765739441, + "num_tokens": 7213853863.0, + "step": 14112 + }, + { + "epoch": 3.81638723634397, + "grad_norm": 1.036564588546753, + "learning_rate": 4.51925309877251e-06, + "loss": 1.7279, + "mean_token_accuracy": 0.5958265662193298, + "num_tokens": 7214378000.0, + "step": 14113 + }, + { + "epoch": 3.816657652785289, + "grad_norm": 1.2078242301940918, + "learning_rate": 4.518159291337999e-06, + "loss": 1.8537, + "mean_token_accuracy": 0.6226611137390137, + "num_tokens": 7214752216.0, + "step": 14114 + }, + { + "epoch": 3.816928069226609, + "grad_norm": 0.9637798070907593, + "learning_rate": 4.517065682785423e-06, + "loss": 1.7101, + "mean_token_accuracy": 0.6059873104095459, + "num_tokens": 7215276492.0, + "step": 14115 + }, + { + "epoch": 3.8171984856679284, + "grad_norm": 0.9414442777633667, + "learning_rate": 4.515972273148334e-06, + "loss": 1.8955, + "mean_token_accuracy": 0.5587111115455627, + "num_tokens": 7215769421.0, + "step": 14116 + }, + { + "epoch": 3.8174689021092485, + "grad_norm": 0.9225088953971863, + "learning_rate": 4.514879062460287e-06, + "loss": 1.7923, + "mean_token_accuracy": 0.5613951086997986, + "num_tokens": 7216293681.0, + "step": 14117 + }, + { + "epoch": 3.8177393185505677, + "grad_norm": 1.0730191469192505, + "learning_rate": 4.513786050754822e-06, + "loss": 1.7373, + "mean_token_accuracy": 0.5852349400520325, + "num_tokens": 7216798081.0, + "step": 14118 + }, + { + "epoch": 3.818009734991888, + "grad_norm": 0.8811949491500854, + "learning_rate": 4.5126932380654725e-06, + "loss": 1.8415, + "mean_token_accuracy": 0.5734571218490601, + "num_tokens": 7217322240.0, + "step": 14119 + }, + { + "epoch": 3.818280151433207, + "grad_norm": 0.8587490320205688, + "learning_rate": 4.5116006244257745e-06, + "loss": 1.7692, + "mean_token_accuracy": 0.5838836431503296, + "num_tokens": 7217845552.0, + "step": 14120 + }, + { + "epoch": 3.8185505678745266, + "grad_norm": 0.346591591835022, + "learning_rate": 4.510508209869252e-06, + "loss": 1.0, + "mean_token_accuracy": 0.7295241355895996, + "num_tokens": 7218369749.0, + "step": 14121 + }, + { + "epoch": 3.8188209843158463, + "grad_norm": 1.1944175958633423, + "learning_rate": 4.509415994429418e-06, + "loss": 1.7361, + "mean_token_accuracy": 0.5829019546508789, + "num_tokens": 7218893926.0, + "step": 14122 + }, + { + "epoch": 3.819091400757166, + "grad_norm": 1.5222324132919312, + "learning_rate": 4.508323978139792e-06, + "loss": 1.8862, + "mean_token_accuracy": 0.5648105144500732, + "num_tokens": 7219418203.0, + "step": 14123 + }, + { + "epoch": 3.8193618171984856, + "grad_norm": 1.0836125612258911, + "learning_rate": 4.507232161033874e-06, + "loss": 1.779, + "mean_token_accuracy": 0.5862560272216797, + "num_tokens": 7219897392.0, + "step": 14124 + }, + { + "epoch": 3.819632233639805, + "grad_norm": 0.864991307258606, + "learning_rate": 4.5061405431451714e-06, + "loss": 1.8367, + "mean_token_accuracy": 0.565644383430481, + "num_tokens": 7220421467.0, + "step": 14125 + }, + { + "epoch": 3.819902650081125, + "grad_norm": 1.1302846670150757, + "learning_rate": 4.505049124507169e-06, + "loss": 1.8121, + "mean_token_accuracy": 0.5489506721496582, + "num_tokens": 7220945722.0, + "step": 14126 + }, + { + "epoch": 3.8201730665224445, + "grad_norm": 1.5350289344787598, + "learning_rate": 4.503957905153364e-06, + "loss": 1.834, + "mean_token_accuracy": 0.578936755657196, + "num_tokens": 7221469918.0, + "step": 14127 + }, + { + "epoch": 3.820443482963764, + "grad_norm": 1.2580450773239136, + "learning_rate": 4.502866885117232e-06, + "loss": 1.8738, + "mean_token_accuracy": 0.5860288739204407, + "num_tokens": 7221931160.0, + "step": 14128 + }, + { + "epoch": 3.8207138994050838, + "grad_norm": 1.1686465740203857, + "learning_rate": 4.501776064432253e-06, + "loss": 1.6916, + "mean_token_accuracy": 0.5969281196594238, + "num_tokens": 7222453131.0, + "step": 14129 + }, + { + "epoch": 3.8209843158464034, + "grad_norm": 1.0698583126068115, + "learning_rate": 4.500685443131894e-06, + "loss": 1.7999, + "mean_token_accuracy": 0.5931318998336792, + "num_tokens": 7222874086.0, + "step": 14130 + }, + { + "epoch": 3.821254732287723, + "grad_norm": 1.0121245384216309, + "learning_rate": 4.499595021249616e-06, + "loss": 1.9306, + "mean_token_accuracy": 0.5359134674072266, + "num_tokens": 7223332896.0, + "step": 14131 + }, + { + "epoch": 3.8215251487290427, + "grad_norm": 1.0485858917236328, + "learning_rate": 4.498504798818883e-06, + "loss": 1.9037, + "mean_token_accuracy": 0.5644211769104004, + "num_tokens": 7223857096.0, + "step": 14132 + }, + { + "epoch": 3.8217955651703623, + "grad_norm": 1.035580039024353, + "learning_rate": 4.497414775873141e-06, + "loss": 1.7699, + "mean_token_accuracy": 0.5888729095458984, + "num_tokens": 7224381337.0, + "step": 14133 + }, + { + "epoch": 3.822065981611682, + "grad_norm": 0.9358431696891785, + "learning_rate": 4.496324952445835e-06, + "loss": 1.7704, + "mean_token_accuracy": 0.5925255417823792, + "num_tokens": 7224905601.0, + "step": 14134 + }, + { + "epoch": 3.8223363980530016, + "grad_norm": 0.9478966593742371, + "learning_rate": 4.495235328570407e-06, + "loss": 1.8548, + "mean_token_accuracy": 0.5797982215881348, + "num_tokens": 7225401611.0, + "step": 14135 + }, + { + "epoch": 3.8226068144943213, + "grad_norm": 0.9961602687835693, + "learning_rate": 4.494145904280287e-06, + "loss": 1.6642, + "mean_token_accuracy": 0.6019703149795532, + "num_tokens": 7225898046.0, + "step": 14136 + }, + { + "epoch": 3.822877230935641, + "grad_norm": 0.9368333220481873, + "learning_rate": 4.493056679608905e-06, + "loss": 1.756, + "mean_token_accuracy": 0.5816053152084351, + "num_tokens": 7226422326.0, + "step": 14137 + }, + { + "epoch": 3.8231476473769606, + "grad_norm": 1.0706068277359009, + "learning_rate": 4.491967654589677e-06, + "loss": 1.8399, + "mean_token_accuracy": 0.5811439752578735, + "num_tokens": 7226890391.0, + "step": 14138 + }, + { + "epoch": 3.82341806381828, + "grad_norm": 1.1705310344696045, + "learning_rate": 4.490878829256023e-06, + "loss": 1.9069, + "mean_token_accuracy": 0.5759422779083252, + "num_tokens": 7227414608.0, + "step": 14139 + }, + { + "epoch": 3.8236884802596, + "grad_norm": 1.1353243589401245, + "learning_rate": 4.489790203641346e-06, + "loss": 1.8402, + "mean_token_accuracy": 0.5491471886634827, + "num_tokens": 7227938621.0, + "step": 14140 + }, + { + "epoch": 3.8239588967009195, + "grad_norm": 0.34838083386421204, + "learning_rate": 4.488701777779054e-06, + "loss": 1.0998, + "mean_token_accuracy": 0.6956131458282471, + "num_tokens": 7228462777.0, + "step": 14141 + }, + { + "epoch": 3.824229313142239, + "grad_norm": 1.6319888830184937, + "learning_rate": 4.487613551702537e-06, + "loss": 1.9006, + "mean_token_accuracy": 0.5698282122612, + "num_tokens": 7228929268.0, + "step": 14142 + }, + { + "epoch": 3.8244997295835588, + "grad_norm": 1.458672046661377, + "learning_rate": 4.486525525445191e-06, + "loss": 1.7724, + "mean_token_accuracy": 0.5813314914703369, + "num_tokens": 7229453480.0, + "step": 14143 + }, + { + "epoch": 3.8247701460248784, + "grad_norm": 1.330505132675171, + "learning_rate": 4.485437699040395e-06, + "loss": 1.8012, + "mean_token_accuracy": 0.5792047381401062, + "num_tokens": 7229934809.0, + "step": 14144 + }, + { + "epoch": 3.825040562466198, + "grad_norm": 0.9316632151603699, + "learning_rate": 4.4843500725215274e-06, + "loss": 1.9047, + "mean_token_accuracy": 0.5619592666625977, + "num_tokens": 7230459060.0, + "step": 14145 + }, + { + "epoch": 3.8253109789075177, + "grad_norm": 0.9864047765731812, + "learning_rate": 4.483262645921963e-06, + "loss": 1.7964, + "mean_token_accuracy": 0.5778605937957764, + "num_tokens": 7230983131.0, + "step": 14146 + }, + { + "epoch": 3.8255813953488373, + "grad_norm": 1.106053113937378, + "learning_rate": 4.482175419275065e-06, + "loss": 1.8815, + "mean_token_accuracy": 0.5811312198638916, + "num_tokens": 7231398714.0, + "step": 14147 + }, + { + "epoch": 3.825851811790157, + "grad_norm": 1.2226791381835938, + "learning_rate": 4.481088392614192e-06, + "loss": 1.9223, + "mean_token_accuracy": 0.5631470084190369, + "num_tokens": 7231922885.0, + "step": 14148 + }, + { + "epoch": 3.8261222282314766, + "grad_norm": 1.15639328956604, + "learning_rate": 4.480001565972698e-06, + "loss": 1.8019, + "mean_token_accuracy": 0.5663681030273438, + "num_tokens": 7232447045.0, + "step": 14149 + }, + { + "epoch": 3.8263926446727963, + "grad_norm": 0.7814843058586121, + "learning_rate": 4.478914939383932e-06, + "loss": 1.7734, + "mean_token_accuracy": 0.5929200649261475, + "num_tokens": 7232971109.0, + "step": 14150 + }, + { + "epoch": 3.8266630611141155, + "grad_norm": 0.9896889328956604, + "learning_rate": 4.477828512881229e-06, + "loss": 1.783, + "mean_token_accuracy": 0.5731827020645142, + "num_tokens": 7233495293.0, + "step": 14151 + }, + { + "epoch": 3.8269334775554356, + "grad_norm": 1.429761290550232, + "learning_rate": 4.47674228649793e-06, + "loss": 1.7845, + "mean_token_accuracy": 0.5907682180404663, + "num_tokens": 7234019449.0, + "step": 14152 + }, + { + "epoch": 3.8272038939967548, + "grad_norm": 1.0342246294021606, + "learning_rate": 4.4756562602673595e-06, + "loss": 1.9039, + "mean_token_accuracy": 0.5629713535308838, + "num_tokens": 7234543604.0, + "step": 14153 + }, + { + "epoch": 3.827474310438075, + "grad_norm": 0.995498538017273, + "learning_rate": 4.474570434222841e-06, + "loss": 1.8912, + "mean_token_accuracy": 0.5540347099304199, + "num_tokens": 7235067833.0, + "step": 14154 + }, + { + "epoch": 3.827744726879394, + "grad_norm": 1.0371673107147217, + "learning_rate": 4.473484808397696e-06, + "loss": 1.8936, + "mean_token_accuracy": 0.5868713855743408, + "num_tokens": 7235592100.0, + "step": 14155 + }, + { + "epoch": 3.828015143320714, + "grad_norm": 1.0477818250656128, + "learning_rate": 4.472399382825229e-06, + "loss": 1.8427, + "mean_token_accuracy": 0.6069663763046265, + "num_tokens": 7236050829.0, + "step": 14156 + }, + { + "epoch": 3.8282855597620333, + "grad_norm": 0.966414749622345, + "learning_rate": 4.471314157538743e-06, + "loss": 1.8146, + "mean_token_accuracy": 0.5779241919517517, + "num_tokens": 7236574849.0, + "step": 14157 + }, + { + "epoch": 3.8285559762033534, + "grad_norm": 1.0122523307800293, + "learning_rate": 4.470229132571541e-06, + "loss": 1.727, + "mean_token_accuracy": 0.6031403541564941, + "num_tokens": 7237099112.0, + "step": 14158 + }, + { + "epoch": 3.8288263926446726, + "grad_norm": 0.8439648151397705, + "learning_rate": 4.469144307956908e-06, + "loss": 1.7634, + "mean_token_accuracy": 0.5772980451583862, + "num_tokens": 7237623241.0, + "step": 14159 + }, + { + "epoch": 3.8290968090859927, + "grad_norm": 0.88750159740448, + "learning_rate": 4.468059683728138e-06, + "loss": 1.9169, + "mean_token_accuracy": 0.5628389716148376, + "num_tokens": 7238147505.0, + "step": 14160 + }, + { + "epoch": 3.829367225527312, + "grad_norm": 0.35052168369293213, + "learning_rate": 4.4669752599185055e-06, + "loss": 1.0125, + "mean_token_accuracy": 0.7159945964813232, + "num_tokens": 7238671699.0, + "step": 14161 + }, + { + "epoch": 3.8296376419686315, + "grad_norm": 1.0547560453414917, + "learning_rate": 4.465891036561282e-06, + "loss": 1.8799, + "mean_token_accuracy": 0.5741895437240601, + "num_tokens": 7239195888.0, + "step": 14162 + }, + { + "epoch": 3.829908058409951, + "grad_norm": 1.0078648328781128, + "learning_rate": 4.46480701368974e-06, + "loss": 1.8563, + "mean_token_accuracy": 0.5707845687866211, + "num_tokens": 7239720146.0, + "step": 14163 + }, + { + "epoch": 3.830178474851271, + "grad_norm": 0.9965008497238159, + "learning_rate": 4.463723191337136e-06, + "loss": 1.8778, + "mean_token_accuracy": 0.559416651725769, + "num_tokens": 7240244172.0, + "step": 14164 + }, + { + "epoch": 3.8304488912925905, + "grad_norm": 0.8187299370765686, + "learning_rate": 4.462639569536725e-06, + "loss": 1.8449, + "mean_token_accuracy": 0.5675737857818604, + "num_tokens": 7240768256.0, + "step": 14165 + }, + { + "epoch": 3.83071930773391, + "grad_norm": 1.1330245733261108, + "learning_rate": 4.46155614832176e-06, + "loss": 1.9518, + "mean_token_accuracy": 0.5585986971855164, + "num_tokens": 7241242968.0, + "step": 14166 + }, + { + "epoch": 3.8309897241752298, + "grad_norm": 1.1620546579360962, + "learning_rate": 4.460472927725477e-06, + "loss": 1.8582, + "mean_token_accuracy": 0.5777015089988708, + "num_tokens": 7241767235.0, + "step": 14167 + }, + { + "epoch": 3.8312601406165494, + "grad_norm": 0.9081850051879883, + "learning_rate": 4.45938990778112e-06, + "loss": 1.7996, + "mean_token_accuracy": 0.6017135977745056, + "num_tokens": 7242291517.0, + "step": 14168 + }, + { + "epoch": 3.831530557057869, + "grad_norm": 0.8972366452217102, + "learning_rate": 4.458307088521912e-06, + "loss": 1.8719, + "mean_token_accuracy": 0.5588269233703613, + "num_tokens": 7242815673.0, + "step": 14169 + }, + { + "epoch": 3.8318009734991887, + "grad_norm": 0.9963995814323425, + "learning_rate": 4.4572244699810835e-06, + "loss": 1.7582, + "mean_token_accuracy": 0.5868105292320251, + "num_tokens": 7243339788.0, + "step": 14170 + }, + { + "epoch": 3.8320713899405083, + "grad_norm": 0.9544175267219543, + "learning_rate": 4.456142052191845e-06, + "loss": 1.8305, + "mean_token_accuracy": 0.5836238861083984, + "num_tokens": 7243820358.0, + "step": 14171 + }, + { + "epoch": 3.832341806381828, + "grad_norm": 0.8332932591438293, + "learning_rate": 4.455059835187417e-06, + "loss": 1.806, + "mean_token_accuracy": 0.6049094200134277, + "num_tokens": 7244341370.0, + "step": 14172 + }, + { + "epoch": 3.8326122228231476, + "grad_norm": 0.8696148991584778, + "learning_rate": 4.4539778190010005e-06, + "loss": 1.7762, + "mean_token_accuracy": 0.5948044061660767, + "num_tokens": 7244865519.0, + "step": 14173 + }, + { + "epoch": 3.8328826392644673, + "grad_norm": 0.9227671027183533, + "learning_rate": 4.452896003665792e-06, + "loss": 1.793, + "mean_token_accuracy": 0.61423659324646, + "num_tokens": 7245326669.0, + "step": 14174 + }, + { + "epoch": 3.833153055705787, + "grad_norm": 0.9895995259284973, + "learning_rate": 4.45181438921499e-06, + "loss": 1.8171, + "mean_token_accuracy": 0.5885047912597656, + "num_tokens": 7245832985.0, + "step": 14175 + }, + { + "epoch": 3.8334234721471065, + "grad_norm": 0.9055052399635315, + "learning_rate": 4.4507329756817805e-06, + "loss": 1.798, + "mean_token_accuracy": 0.5819653272628784, + "num_tokens": 7246357073.0, + "step": 14176 + }, + { + "epoch": 3.833693888588426, + "grad_norm": 0.7938344478607178, + "learning_rate": 4.44965176309934e-06, + "loss": 1.7437, + "mean_token_accuracy": 0.5897350311279297, + "num_tokens": 7246881212.0, + "step": 14177 + }, + { + "epoch": 3.833964305029746, + "grad_norm": 0.9342606067657471, + "learning_rate": 4.44857075150085e-06, + "loss": 1.767, + "mean_token_accuracy": 0.5914946794509888, + "num_tokens": 7247405352.0, + "step": 14178 + }, + { + "epoch": 3.8342347214710655, + "grad_norm": 0.9585455656051636, + "learning_rate": 4.447489940919476e-06, + "loss": 1.7118, + "mean_token_accuracy": 0.6054009199142456, + "num_tokens": 7247929601.0, + "step": 14179 + }, + { + "epoch": 3.834505137912385, + "grad_norm": 0.9454344511032104, + "learning_rate": 4.446409331388377e-06, + "loss": 1.8962, + "mean_token_accuracy": 0.5709063410758972, + "num_tokens": 7248433135.0, + "step": 14180 + }, + { + "epoch": 3.8347755543537048, + "grad_norm": 0.37375080585479736, + "learning_rate": 4.445328922940714e-06, + "loss": 1.0255, + "mean_token_accuracy": 0.7206754684448242, + "num_tokens": 7248902957.0, + "step": 14181 + }, + { + "epoch": 3.8350459707950244, + "grad_norm": 0.9201812744140625, + "learning_rate": 4.444248715609636e-06, + "loss": 1.825, + "mean_token_accuracy": 0.5793452262878418, + "num_tokens": 7249398213.0, + "step": 14182 + }, + { + "epoch": 3.835316387236344, + "grad_norm": 0.8652355074882507, + "learning_rate": 4.443168709428285e-06, + "loss": 1.872, + "mean_token_accuracy": 0.573006808757782, + "num_tokens": 7249922335.0, + "step": 14183 + }, + { + "epoch": 3.8355868036776637, + "grad_norm": 0.8226770162582397, + "learning_rate": 4.442088904429803e-06, + "loss": 1.8209, + "mean_token_accuracy": 0.5635613799095154, + "num_tokens": 7250446565.0, + "step": 14184 + }, + { + "epoch": 3.8358572201189833, + "grad_norm": 0.9666095972061157, + "learning_rate": 4.441009300647317e-06, + "loss": 1.8993, + "mean_token_accuracy": 0.5832041501998901, + "num_tokens": 7250940152.0, + "step": 14185 + }, + { + "epoch": 3.836127636560303, + "grad_norm": 0.8730437755584717, + "learning_rate": 4.439929898113956e-06, + "loss": 1.8359, + "mean_token_accuracy": 0.5784091949462891, + "num_tokens": 7251464412.0, + "step": 14186 + }, + { + "epoch": 3.8363980530016226, + "grad_norm": 0.8179212212562561, + "learning_rate": 4.438850696862839e-06, + "loss": 1.8479, + "mean_token_accuracy": 0.5810152888298035, + "num_tokens": 7251988544.0, + "step": 14187 + }, + { + "epoch": 3.8366684694429423, + "grad_norm": 0.8925336003303528, + "learning_rate": 4.437771696927073e-06, + "loss": 1.9107, + "mean_token_accuracy": 0.562466025352478, + "num_tokens": 7252506504.0, + "step": 14188 + }, + { + "epoch": 3.836938885884262, + "grad_norm": 0.981200635433197, + "learning_rate": 4.436692898339775e-06, + "loss": 1.7047, + "mean_token_accuracy": 0.6190187931060791, + "num_tokens": 7253030657.0, + "step": 14189 + }, + { + "epoch": 3.8372093023255816, + "grad_norm": 0.9609411954879761, + "learning_rate": 4.435614301134038e-06, + "loss": 1.8708, + "mean_token_accuracy": 0.5752658843994141, + "num_tokens": 7253554931.0, + "step": 14190 + }, + { + "epoch": 3.837479718766901, + "grad_norm": 0.9581348299980164, + "learning_rate": 4.4345359053429575e-06, + "loss": 1.7404, + "mean_token_accuracy": 0.5855960845947266, + "num_tokens": 7254079138.0, + "step": 14191 + }, + { + "epoch": 3.8377501352082204, + "grad_norm": 0.8544425964355469, + "learning_rate": 4.433457710999625e-06, + "loss": 1.7359, + "mean_token_accuracy": 0.5973955392837524, + "num_tokens": 7254552043.0, + "step": 14192 + }, + { + "epoch": 3.8380205516495405, + "grad_norm": 0.9908930063247681, + "learning_rate": 4.432379718137121e-06, + "loss": 1.9059, + "mean_token_accuracy": 0.5752290487289429, + "num_tokens": 7255076255.0, + "step": 14193 + }, + { + "epoch": 3.8382909680908597, + "grad_norm": 0.9309671521186829, + "learning_rate": 4.43130192678852e-06, + "loss": 1.7894, + "mean_token_accuracy": 0.5825077891349792, + "num_tokens": 7255600516.0, + "step": 14194 + }, + { + "epoch": 3.8385613845321798, + "grad_norm": 1.0491946935653687, + "learning_rate": 4.430224336986894e-06, + "loss": 1.7957, + "mean_token_accuracy": 0.5762056112289429, + "num_tokens": 7256115997.0, + "step": 14195 + }, + { + "epoch": 3.838831800973499, + "grad_norm": 1.0321753025054932, + "learning_rate": 4.429146948765303e-06, + "loss": 1.8151, + "mean_token_accuracy": 0.583046019077301, + "num_tokens": 7256608715.0, + "step": 14196 + }, + { + "epoch": 3.839102217414819, + "grad_norm": 1.1095926761627197, + "learning_rate": 4.428069762156809e-06, + "loss": 1.8036, + "mean_token_accuracy": 0.5872734785079956, + "num_tokens": 7257058775.0, + "step": 14197 + }, + { + "epoch": 3.8393726338561383, + "grad_norm": 1.149146556854248, + "learning_rate": 4.426992777194462e-06, + "loss": 1.8055, + "mean_token_accuracy": 0.5904725790023804, + "num_tokens": 7257523066.0, + "step": 14198 + }, + { + "epoch": 3.8396430502974583, + "grad_norm": 1.1550309658050537, + "learning_rate": 4.4259159939113085e-06, + "loss": 1.8823, + "mean_token_accuracy": 0.557917058467865, + "num_tokens": 7258047310.0, + "step": 14199 + }, + { + "epoch": 3.8399134667387775, + "grad_norm": 1.192827820777893, + "learning_rate": 4.424839412340382e-06, + "loss": 1.7785, + "mean_token_accuracy": 0.5921382904052734, + "num_tokens": 7258571487.0, + "step": 14200 + }, + { + "epoch": 3.8401838831800976, + "grad_norm": 0.37601277232170105, + "learning_rate": 4.42376303251472e-06, + "loss": 1.1072, + "mean_token_accuracy": 0.707297682762146, + "num_tokens": 7259092333.0, + "step": 14201 + }, + { + "epoch": 3.840454299621417, + "grad_norm": 1.2170088291168213, + "learning_rate": 4.42268685446735e-06, + "loss": 1.8582, + "mean_token_accuracy": 0.5825487375259399, + "num_tokens": 7259566798.0, + "step": 14202 + }, + { + "epoch": 3.8407247160627365, + "grad_norm": 1.0807809829711914, + "learning_rate": 4.421610878231286e-06, + "loss": 1.841, + "mean_token_accuracy": 0.5722402334213257, + "num_tokens": 7260091002.0, + "step": 14203 + }, + { + "epoch": 3.840995132504056, + "grad_norm": 1.1190948486328125, + "learning_rate": 4.42053510383955e-06, + "loss": 1.7144, + "mean_token_accuracy": 0.5892386436462402, + "num_tokens": 7260615250.0, + "step": 14204 + }, + { + "epoch": 3.8412655489453758, + "grad_norm": 0.9116085767745972, + "learning_rate": 4.419459531325142e-06, + "loss": 1.7241, + "mean_token_accuracy": 0.6001116037368774, + "num_tokens": 7261139519.0, + "step": 14205 + }, + { + "epoch": 3.8415359653866954, + "grad_norm": 0.98187255859375, + "learning_rate": 4.41838416072107e-06, + "loss": 1.8088, + "mean_token_accuracy": 0.5832180976867676, + "num_tokens": 7261663580.0, + "step": 14206 + }, + { + "epoch": 3.841806381828015, + "grad_norm": 1.0089980363845825, + "learning_rate": 4.417308992060327e-06, + "loss": 1.8799, + "mean_token_accuracy": 0.574070394039154, + "num_tokens": 7262187773.0, + "step": 14207 + }, + { + "epoch": 3.8420767982693347, + "grad_norm": 0.9612840414047241, + "learning_rate": 4.416234025375901e-06, + "loss": 1.7988, + "mean_token_accuracy": 0.5733093023300171, + "num_tokens": 7262711905.0, + "step": 14208 + }, + { + "epoch": 3.8423472147106543, + "grad_norm": 1.0118464231491089, + "learning_rate": 4.41515926070078e-06, + "loss": 1.84, + "mean_token_accuracy": 0.5598547458648682, + "num_tokens": 7263236106.0, + "step": 14209 + }, + { + "epoch": 3.842617631151974, + "grad_norm": 1.0013830661773682, + "learning_rate": 4.414084698067933e-06, + "loss": 1.8456, + "mean_token_accuracy": 0.5846793055534363, + "num_tokens": 7263760351.0, + "step": 14210 + }, + { + "epoch": 3.8428880475932936, + "grad_norm": 1.083856463432312, + "learning_rate": 4.413010337510337e-06, + "loss": 1.7427, + "mean_token_accuracy": 0.5893356204032898, + "num_tokens": 7264244100.0, + "step": 14211 + }, + { + "epoch": 3.8431584640346133, + "grad_norm": 1.0042225122451782, + "learning_rate": 4.411936179060957e-06, + "loss": 1.889, + "mean_token_accuracy": 0.5696040391921997, + "num_tokens": 7264768378.0, + "step": 14212 + }, + { + "epoch": 3.843428880475933, + "grad_norm": 1.0597350597381592, + "learning_rate": 4.410862222752749e-06, + "loss": 1.8634, + "mean_token_accuracy": 0.5662035942077637, + "num_tokens": 7265274176.0, + "step": 14213 + }, + { + "epoch": 3.8436992969172525, + "grad_norm": 0.9785346984863281, + "learning_rate": 4.409788468618662e-06, + "loss": 1.7967, + "mean_token_accuracy": 0.5821318626403809, + "num_tokens": 7265792090.0, + "step": 14214 + }, + { + "epoch": 3.843969713358572, + "grad_norm": 1.0050824880599976, + "learning_rate": 4.40871491669165e-06, + "loss": 1.7533, + "mean_token_accuracy": 0.5923345685005188, + "num_tokens": 7266296930.0, + "step": 14215 + }, + { + "epoch": 3.844240129799892, + "grad_norm": 1.076473355293274, + "learning_rate": 4.4076415670046465e-06, + "loss": 1.7826, + "mean_token_accuracy": 0.5838062763214111, + "num_tokens": 7266821039.0, + "step": 14216 + }, + { + "epoch": 3.8445105462412115, + "grad_norm": 1.1087088584899902, + "learning_rate": 4.406568419590585e-06, + "loss": 1.8249, + "mean_token_accuracy": 0.5634056925773621, + "num_tokens": 7267345326.0, + "step": 14217 + }, + { + "epoch": 3.844780962682531, + "grad_norm": 0.9469020962715149, + "learning_rate": 4.405495474482396e-06, + "loss": 1.9457, + "mean_token_accuracy": 0.5635972023010254, + "num_tokens": 7267869492.0, + "step": 14218 + }, + { + "epoch": 3.8450513791238508, + "grad_norm": 1.2132883071899414, + "learning_rate": 4.404422731712999e-06, + "loss": 1.8464, + "mean_token_accuracy": 0.5768930912017822, + "num_tokens": 7268387746.0, + "step": 14219 + }, + { + "epoch": 3.8453217955651704, + "grad_norm": 1.0914782285690308, + "learning_rate": 4.4033501913153075e-06, + "loss": 1.9398, + "mean_token_accuracy": 0.566831111907959, + "num_tokens": 7268907566.0, + "step": 14220 + }, + { + "epoch": 3.84559221200649, + "grad_norm": 0.368194580078125, + "learning_rate": 4.4022778533222336e-06, + "loss": 1.2112, + "mean_token_accuracy": 0.6754146814346313, + "num_tokens": 7269431825.0, + "step": 14221 + }, + { + "epoch": 3.8458626284478097, + "grad_norm": 1.1939753293991089, + "learning_rate": 4.401205717766678e-06, + "loss": 1.7979, + "mean_token_accuracy": 0.5712429285049438, + "num_tokens": 7269956056.0, + "step": 14222 + }, + { + "epoch": 3.8461330448891293, + "grad_norm": 0.9497389197349548, + "learning_rate": 4.400133784681533e-06, + "loss": 1.7777, + "mean_token_accuracy": 0.5813305974006653, + "num_tokens": 7270480243.0, + "step": 14223 + }, + { + "epoch": 3.846403461330449, + "grad_norm": 1.04523766040802, + "learning_rate": 4.399062054099694e-06, + "loss": 1.8897, + "mean_token_accuracy": 0.5405272245407104, + "num_tokens": 7270998105.0, + "step": 14224 + }, + { + "epoch": 3.8466738777717686, + "grad_norm": 0.8437793254852295, + "learning_rate": 4.397990526054045e-06, + "loss": 1.7356, + "mean_token_accuracy": 0.5958342552185059, + "num_tokens": 7271522263.0, + "step": 14225 + }, + { + "epoch": 3.8469442942130883, + "grad_norm": 1.0009288787841797, + "learning_rate": 4.39691920057746e-06, + "loss": 1.8515, + "mean_token_accuracy": 0.5860979557037354, + "num_tokens": 7272046543.0, + "step": 14226 + }, + { + "epoch": 3.847214710654408, + "grad_norm": 1.0840204954147339, + "learning_rate": 4.395848077702813e-06, + "loss": 1.7901, + "mean_token_accuracy": 0.5799328088760376, + "num_tokens": 7272570812.0, + "step": 14227 + }, + { + "epoch": 3.8474851270957275, + "grad_norm": 0.887265682220459, + "learning_rate": 4.394777157462969e-06, + "loss": 1.7413, + "mean_token_accuracy": 0.5710090398788452, + "num_tokens": 7273095052.0, + "step": 14228 + }, + { + "epoch": 3.847755543537047, + "grad_norm": 0.8612394332885742, + "learning_rate": 4.3937064398907866e-06, + "loss": 1.8532, + "mean_token_accuracy": 0.577480673789978, + "num_tokens": 7273571440.0, + "step": 14229 + }, + { + "epoch": 3.848025959978367, + "grad_norm": 0.8600184321403503, + "learning_rate": 4.392635925019118e-06, + "loss": 1.7941, + "mean_token_accuracy": 0.5896657705307007, + "num_tokens": 7274095700.0, + "step": 14230 + }, + { + "epoch": 3.8482963764196865, + "grad_norm": 0.8706966042518616, + "learning_rate": 4.39156561288081e-06, + "loss": 1.8388, + "mean_token_accuracy": 0.5841846466064453, + "num_tokens": 7274581674.0, + "step": 14231 + }, + { + "epoch": 3.848566792861006, + "grad_norm": 0.8634092211723328, + "learning_rate": 4.390495503508704e-06, + "loss": 1.8615, + "mean_token_accuracy": 0.5655184984207153, + "num_tokens": 7275105844.0, + "step": 14232 + }, + { + "epoch": 3.8488372093023253, + "grad_norm": 0.8843279480934143, + "learning_rate": 4.389425596935635e-06, + "loss": 1.851, + "mean_token_accuracy": 0.5882428288459778, + "num_tokens": 7275580237.0, + "step": 14233 + }, + { + "epoch": 3.8491076257436454, + "grad_norm": 1.0629658699035645, + "learning_rate": 4.388355893194425e-06, + "loss": 1.7446, + "mean_token_accuracy": 0.6209371089935303, + "num_tokens": 7276104497.0, + "step": 14234 + }, + { + "epoch": 3.8493780421849646, + "grad_norm": 1.0526574850082397, + "learning_rate": 4.3872863923179026e-06, + "loss": 1.593, + "mean_token_accuracy": 0.6483155488967896, + "num_tokens": 7276585352.0, + "step": 14235 + }, + { + "epoch": 3.8496484586262847, + "grad_norm": 1.0402705669403076, + "learning_rate": 4.386217094338881e-06, + "loss": 1.787, + "mean_token_accuracy": 0.5696990489959717, + "num_tokens": 7277109542.0, + "step": 14236 + }, + { + "epoch": 3.849918875067604, + "grad_norm": 0.9482945799827576, + "learning_rate": 4.3851479992901654e-06, + "loss": 1.7901, + "mean_token_accuracy": 0.5764157772064209, + "num_tokens": 7277633821.0, + "step": 14237 + }, + { + "epoch": 3.850189291508924, + "grad_norm": 0.8658008575439453, + "learning_rate": 4.384079107204565e-06, + "loss": 1.7591, + "mean_token_accuracy": 0.5757038593292236, + "num_tokens": 7278146397.0, + "step": 14238 + }, + { + "epoch": 3.850459707950243, + "grad_norm": 0.8171941637992859, + "learning_rate": 4.383010418114872e-06, + "loss": 1.7574, + "mean_token_accuracy": 0.5853393077850342, + "num_tokens": 7278670592.0, + "step": 14239 + }, + { + "epoch": 3.8507301243915633, + "grad_norm": 0.9020592570304871, + "learning_rate": 4.381941932053878e-06, + "loss": 1.8478, + "mean_token_accuracy": 0.5608221292495728, + "num_tokens": 7279194788.0, + "step": 14240 + }, + { + "epoch": 3.8510005408328825, + "grad_norm": 0.35004132986068726, + "learning_rate": 4.3808736490543705e-06, + "loss": 1.1396, + "mean_token_accuracy": 0.6972299218177795, + "num_tokens": 7279718960.0, + "step": 14241 + }, + { + "epoch": 3.8512709572742025, + "grad_norm": 0.9958215355873108, + "learning_rate": 4.379805569149125e-06, + "loss": 1.8513, + "mean_token_accuracy": 0.5763856768608093, + "num_tokens": 7280243132.0, + "step": 14242 + }, + { + "epoch": 3.8515413737155217, + "grad_norm": 1.2676498889923096, + "learning_rate": 4.378737692370911e-06, + "loss": 2.0513, + "mean_token_accuracy": 0.5384029746055603, + "num_tokens": 7280767400.0, + "step": 14243 + }, + { + "epoch": 3.8518117901568414, + "grad_norm": 0.9062688946723938, + "learning_rate": 4.377670018752499e-06, + "loss": 1.8284, + "mean_token_accuracy": 0.5829991102218628, + "num_tokens": 7281291616.0, + "step": 14244 + }, + { + "epoch": 3.852082206598161, + "grad_norm": 0.910868763923645, + "learning_rate": 4.376602548326645e-06, + "loss": 1.8056, + "mean_token_accuracy": 0.5885004997253418, + "num_tokens": 7281796109.0, + "step": 14245 + }, + { + "epoch": 3.8523526230394807, + "grad_norm": 0.8486550450325012, + "learning_rate": 4.375535281126102e-06, + "loss": 1.8457, + "mean_token_accuracy": 0.5821812152862549, + "num_tokens": 7282266236.0, + "step": 14246 + }, + { + "epoch": 3.8526230394808003, + "grad_norm": 0.9562634229660034, + "learning_rate": 4.374468217183619e-06, + "loss": 1.8724, + "mean_token_accuracy": 0.5717936158180237, + "num_tokens": 7282774093.0, + "step": 14247 + }, + { + "epoch": 3.85289345592212, + "grad_norm": 0.8481744527816772, + "learning_rate": 4.3734013565319364e-06, + "loss": 1.8432, + "mean_token_accuracy": 0.5731570720672607, + "num_tokens": 7283298347.0, + "step": 14248 + }, + { + "epoch": 3.8531638723634396, + "grad_norm": 0.9180846214294434, + "learning_rate": 4.3723346992037845e-06, + "loss": 1.9043, + "mean_token_accuracy": 0.5632601976394653, + "num_tokens": 7283822609.0, + "step": 14249 + }, + { + "epoch": 3.8534342888047592, + "grad_norm": 0.9775071740150452, + "learning_rate": 4.371268245231898e-06, + "loss": 1.8123, + "mean_token_accuracy": 0.5652998685836792, + "num_tokens": 7284346825.0, + "step": 14250 + }, + { + "epoch": 3.853704705246079, + "grad_norm": 0.8853356838226318, + "learning_rate": 4.370201994648993e-06, + "loss": 1.7755, + "mean_token_accuracy": 0.6052076816558838, + "num_tokens": 7284768464.0, + "step": 14251 + }, + { + "epoch": 3.8539751216873985, + "grad_norm": 0.9001617431640625, + "learning_rate": 4.36913594748779e-06, + "loss": 1.8855, + "mean_token_accuracy": 0.5691160559654236, + "num_tokens": 7285255051.0, + "step": 14252 + }, + { + "epoch": 3.854245538128718, + "grad_norm": 0.9657721519470215, + "learning_rate": 4.368070103780993e-06, + "loss": 1.7749, + "mean_token_accuracy": 0.5901546478271484, + "num_tokens": 7285755623.0, + "step": 14253 + }, + { + "epoch": 3.854515954570038, + "grad_norm": 0.7650156617164612, + "learning_rate": 4.36700446356131e-06, + "loss": 1.8157, + "mean_token_accuracy": 0.5758002996444702, + "num_tokens": 7286279687.0, + "step": 14254 + }, + { + "epoch": 3.8547863710113575, + "grad_norm": 3.4133191108703613, + "learning_rate": 4.365939026861437e-06, + "loss": 1.6381, + "mean_token_accuracy": 0.6363674402236938, + "num_tokens": 7286803893.0, + "step": 14255 + }, + { + "epoch": 3.855056787452677, + "grad_norm": 1.1475714445114136, + "learning_rate": 4.364873793714065e-06, + "loss": 1.9087, + "mean_token_accuracy": 0.5645946264266968, + "num_tokens": 7287328140.0, + "step": 14256 + }, + { + "epoch": 3.8553272038939967, + "grad_norm": 1.1471625566482544, + "learning_rate": 4.363808764151875e-06, + "loss": 1.8227, + "mean_token_accuracy": 0.5763053297996521, + "num_tokens": 7287852261.0, + "step": 14257 + }, + { + "epoch": 3.8555976203353164, + "grad_norm": 0.8702263832092285, + "learning_rate": 4.362743938207552e-06, + "loss": 1.8559, + "mean_token_accuracy": 0.5735253691673279, + "num_tokens": 7288376446.0, + "step": 14258 + }, + { + "epoch": 3.855868036776636, + "grad_norm": 0.9292163848876953, + "learning_rate": 4.361679315913762e-06, + "loss": 1.8486, + "mean_token_accuracy": 0.5901605486869812, + "num_tokens": 7288877901.0, + "step": 14259 + }, + { + "epoch": 3.8561384532179557, + "grad_norm": 0.9434449076652527, + "learning_rate": 4.36061489730317e-06, + "loss": 1.6634, + "mean_token_accuracy": 0.623326301574707, + "num_tokens": 7289402097.0, + "step": 14260 + }, + { + "epoch": 3.8564088696592753, + "grad_norm": 0.35128462314605713, + "learning_rate": 4.359550682408442e-06, + "loss": 1.0841, + "mean_token_accuracy": 0.6975581645965576, + "num_tokens": 7289926376.0, + "step": 14261 + }, + { + "epoch": 3.856679286100595, + "grad_norm": 1.100030541419983, + "learning_rate": 4.358486671262227e-06, + "loss": 1.9058, + "mean_token_accuracy": 0.5694916248321533, + "num_tokens": 7290450578.0, + "step": 14262 + }, + { + "epoch": 3.8569497025419146, + "grad_norm": 0.8748799562454224, + "learning_rate": 4.357422863897169e-06, + "loss": 1.8335, + "mean_token_accuracy": 0.5784173607826233, + "num_tokens": 7290944470.0, + "step": 14263 + }, + { + "epoch": 3.8572201189832342, + "grad_norm": 0.8338929414749146, + "learning_rate": 4.356359260345916e-06, + "loss": 1.863, + "mean_token_accuracy": 0.5805498361587524, + "num_tokens": 7291433676.0, + "step": 14264 + }, + { + "epoch": 3.857490535424554, + "grad_norm": 1.0071380138397217, + "learning_rate": 4.3552958606410975e-06, + "loss": 1.8085, + "mean_token_accuracy": 0.5978186130523682, + "num_tokens": 7291948630.0, + "step": 14265 + }, + { + "epoch": 3.8577609518658735, + "grad_norm": 1.1969760656356812, + "learning_rate": 4.35423266481534e-06, + "loss": 1.7621, + "mean_token_accuracy": 0.6089391708374023, + "num_tokens": 7292433999.0, + "step": 14266 + }, + { + "epoch": 3.858031368307193, + "grad_norm": 1.0409331321716309, + "learning_rate": 4.353169672901269e-06, + "loss": 1.9164, + "mean_token_accuracy": 0.5494967699050903, + "num_tokens": 7292958152.0, + "step": 14267 + }, + { + "epoch": 3.858301784748513, + "grad_norm": 0.9047608375549316, + "learning_rate": 4.352106884931502e-06, + "loss": 1.8197, + "mean_token_accuracy": 0.5772172212600708, + "num_tokens": 7293481613.0, + "step": 14268 + }, + { + "epoch": 3.8585722011898325, + "grad_norm": 0.9281247854232788, + "learning_rate": 4.3510443009386435e-06, + "loss": 1.8416, + "mean_token_accuracy": 0.5825727581977844, + "num_tokens": 7294005705.0, + "step": 14269 + }, + { + "epoch": 3.858842617631152, + "grad_norm": 0.9044612050056458, + "learning_rate": 4.349981920955301e-06, + "loss": 1.8394, + "mean_token_accuracy": 0.5698711276054382, + "num_tokens": 7294529950.0, + "step": 14270 + }, + { + "epoch": 3.8591130340724717, + "grad_norm": 0.9816434979438782, + "learning_rate": 4.34891974501407e-06, + "loss": 1.7305, + "mean_token_accuracy": 0.5872142314910889, + "num_tokens": 7295054086.0, + "step": 14271 + }, + { + "epoch": 3.8593834505137914, + "grad_norm": 0.9160886406898499, + "learning_rate": 4.347857773147538e-06, + "loss": 1.8956, + "mean_token_accuracy": 0.5498837232589722, + "num_tokens": 7295578255.0, + "step": 14272 + }, + { + "epoch": 3.859653866955111, + "grad_norm": 0.9183270931243896, + "learning_rate": 4.346796005388295e-06, + "loss": 1.7316, + "mean_token_accuracy": 0.5926867723464966, + "num_tokens": 7296101050.0, + "step": 14273 + }, + { + "epoch": 3.8599242833964302, + "grad_norm": 0.9651400446891785, + "learning_rate": 4.345734441768914e-06, + "loss": 1.8778, + "mean_token_accuracy": 0.5789411664009094, + "num_tokens": 7296625169.0, + "step": 14274 + }, + { + "epoch": 3.8601946998377503, + "grad_norm": 0.9115498065948486, + "learning_rate": 4.34467308232197e-06, + "loss": 1.898, + "mean_token_accuracy": 0.5576071739196777, + "num_tokens": 7297149423.0, + "step": 14275 + }, + { + "epoch": 3.8604651162790695, + "grad_norm": 0.875880241394043, + "learning_rate": 4.343611927080031e-06, + "loss": 1.6184, + "mean_token_accuracy": 0.6355308294296265, + "num_tokens": 7297665859.0, + "step": 14276 + }, + { + "epoch": 3.8607355327203896, + "grad_norm": 0.9276069402694702, + "learning_rate": 4.342550976075649e-06, + "loss": 1.7737, + "mean_token_accuracy": 0.5861954689025879, + "num_tokens": 7298190071.0, + "step": 14277 + }, + { + "epoch": 3.861005949161709, + "grad_norm": 0.9247433543205261, + "learning_rate": 4.341490229341383e-06, + "loss": 1.8521, + "mean_token_accuracy": 0.5750572681427002, + "num_tokens": 7298714270.0, + "step": 14278 + }, + { + "epoch": 3.861276365603029, + "grad_norm": 0.9116908311843872, + "learning_rate": 4.34042968690978e-06, + "loss": 1.8509, + "mean_token_accuracy": 0.5782654285430908, + "num_tokens": 7299238548.0, + "step": 14279 + }, + { + "epoch": 3.861546782044348, + "grad_norm": 0.8025333881378174, + "learning_rate": 4.339369348813375e-06, + "loss": 1.8891, + "mean_token_accuracy": 0.5565258264541626, + "num_tokens": 7299762805.0, + "step": 14280 + }, + { + "epoch": 3.861817198485668, + "grad_norm": 0.35941416025161743, + "learning_rate": 4.338309215084706e-06, + "loss": 1.0626, + "mean_token_accuracy": 0.707082986831665, + "num_tokens": 7300287073.0, + "step": 14281 + }, + { + "epoch": 3.8620876149269874, + "grad_norm": 1.1097160577774048, + "learning_rate": 4.337249285756304e-06, + "loss": 1.8241, + "mean_token_accuracy": 0.5827919244766235, + "num_tokens": 7300765159.0, + "step": 14282 + }, + { + "epoch": 3.8623580313683075, + "grad_norm": 1.0519053936004639, + "learning_rate": 4.336189560860686e-06, + "loss": 1.7588, + "mean_token_accuracy": 0.5836495757102966, + "num_tokens": 7301262909.0, + "step": 14283 + }, + { + "epoch": 3.8626284478096267, + "grad_norm": 1.0181632041931152, + "learning_rate": 4.335130040430371e-06, + "loss": 1.8472, + "mean_token_accuracy": 0.5813422203063965, + "num_tokens": 7301787186.0, + "step": 14284 + }, + { + "epoch": 3.8628988642509463, + "grad_norm": 0.8950821757316589, + "learning_rate": 4.334070724497866e-06, + "loss": 1.95, + "mean_token_accuracy": 0.5488949418067932, + "num_tokens": 7302311410.0, + "step": 14285 + }, + { + "epoch": 3.863169280692266, + "grad_norm": 1.021976113319397, + "learning_rate": 4.333011613095672e-06, + "loss": 1.9061, + "mean_token_accuracy": 0.5519976615905762, + "num_tokens": 7302835589.0, + "step": 14286 + }, + { + "epoch": 3.8634396971335856, + "grad_norm": 0.9374200701713562, + "learning_rate": 4.3319527062562905e-06, + "loss": 1.9006, + "mean_token_accuracy": 0.5629730224609375, + "num_tokens": 7303359740.0, + "step": 14287 + }, + { + "epoch": 3.8637101135749052, + "grad_norm": 0.9782743453979492, + "learning_rate": 4.33089400401221e-06, + "loss": 1.8328, + "mean_token_accuracy": 0.5693638324737549, + "num_tokens": 7303884015.0, + "step": 14288 + }, + { + "epoch": 3.863980530016225, + "grad_norm": 0.9813785552978516, + "learning_rate": 4.329835506395912e-06, + "loss": 1.9476, + "mean_token_accuracy": 0.561908483505249, + "num_tokens": 7304408060.0, + "step": 14289 + }, + { + "epoch": 3.8642509464575445, + "grad_norm": 0.9054434299468994, + "learning_rate": 4.328777213439879e-06, + "loss": 1.6864, + "mean_token_accuracy": 0.5916925668716431, + "num_tokens": 7304932342.0, + "step": 14290 + }, + { + "epoch": 3.864521362898864, + "grad_norm": 0.9907870292663574, + "learning_rate": 4.327719125176578e-06, + "loss": 1.9162, + "mean_token_accuracy": 0.5727008581161499, + "num_tokens": 7305456552.0, + "step": 14291 + }, + { + "epoch": 3.864791779340184, + "grad_norm": 0.8371323943138123, + "learning_rate": 4.326661241638475e-06, + "loss": 1.7024, + "mean_token_accuracy": 0.5999264717102051, + "num_tokens": 7305980820.0, + "step": 14292 + }, + { + "epoch": 3.8650621957815035, + "grad_norm": 0.801831841468811, + "learning_rate": 4.325603562858031e-06, + "loss": 1.8112, + "mean_token_accuracy": 0.5881260633468628, + "num_tokens": 7306505035.0, + "step": 14293 + }, + { + "epoch": 3.865332612222823, + "grad_norm": 0.987989604473114, + "learning_rate": 4.324546088867697e-06, + "loss": 1.8292, + "mean_token_accuracy": 0.5839568376541138, + "num_tokens": 7307029317.0, + "step": 14294 + }, + { + "epoch": 3.8656030286641427, + "grad_norm": 0.9408081769943237, + "learning_rate": 4.323488819699922e-06, + "loss": 1.8456, + "mean_token_accuracy": 0.581471860408783, + "num_tokens": 7307515797.0, + "step": 14295 + }, + { + "epoch": 3.8658734451054624, + "grad_norm": 0.9708539247512817, + "learning_rate": 4.322431755387143e-06, + "loss": 1.7048, + "mean_token_accuracy": 0.6018076539039612, + "num_tokens": 7308035351.0, + "step": 14296 + }, + { + "epoch": 3.866143861546782, + "grad_norm": 0.9702436923980713, + "learning_rate": 4.321374895961794e-06, + "loss": 1.9645, + "mean_token_accuracy": 0.5617209672927856, + "num_tokens": 7308497893.0, + "step": 14297 + }, + { + "epoch": 3.8664142779881017, + "grad_norm": 0.8709896206855774, + "learning_rate": 4.320318241456307e-06, + "loss": 1.7834, + "mean_token_accuracy": 0.5842750072479248, + "num_tokens": 7309022075.0, + "step": 14298 + }, + { + "epoch": 3.8666846944294213, + "grad_norm": 1.2417209148406982, + "learning_rate": 4.3192617919031e-06, + "loss": 1.8528, + "mean_token_accuracy": 0.5624634623527527, + "num_tokens": 7309546339.0, + "step": 14299 + }, + { + "epoch": 3.866955110870741, + "grad_norm": 1.1787269115447998, + "learning_rate": 4.318205547334584e-06, + "loss": 1.8572, + "mean_token_accuracy": 0.5859024524688721, + "num_tokens": 7310022994.0, + "step": 14300 + }, + { + "epoch": 3.8672255273120606, + "grad_norm": 0.3873766362667084, + "learning_rate": 4.317149507783176e-06, + "loss": 1.1057, + "mean_token_accuracy": 0.7124170064926147, + "num_tokens": 7310547135.0, + "step": 14301 + }, + { + "epoch": 3.8674959437533802, + "grad_norm": 1.0846309661865234, + "learning_rate": 4.316093673281273e-06, + "loss": 1.9678, + "mean_token_accuracy": 0.551069974899292, + "num_tokens": 7311071405.0, + "step": 14302 + }, + { + "epoch": 3.8677663601947, + "grad_norm": 0.8519294857978821, + "learning_rate": 4.31503804386127e-06, + "loss": 1.7324, + "mean_token_accuracy": 0.581721842288971, + "num_tokens": 7311595577.0, + "step": 14303 + }, + { + "epoch": 3.8680367766360195, + "grad_norm": 0.9560583233833313, + "learning_rate": 4.313982619555562e-06, + "loss": 1.8222, + "mean_token_accuracy": 0.5737053751945496, + "num_tokens": 7312119134.0, + "step": 14304 + }, + { + "epoch": 3.868307193077339, + "grad_norm": 0.8722580075263977, + "learning_rate": 4.312927400396529e-06, + "loss": 1.8611, + "mean_token_accuracy": 0.5638826489448547, + "num_tokens": 7312643270.0, + "step": 14305 + }, + { + "epoch": 3.868577609518659, + "grad_norm": 1.1829203367233276, + "learning_rate": 4.311872386416547e-06, + "loss": 1.8274, + "mean_token_accuracy": 0.5875967144966125, + "num_tokens": 7313167516.0, + "step": 14306 + }, + { + "epoch": 3.8688480259599785, + "grad_norm": 0.8887896537780762, + "learning_rate": 4.3108175776479895e-06, + "loss": 1.9344, + "mean_token_accuracy": 0.5518409609794617, + "num_tokens": 7313639378.0, + "step": 14307 + }, + { + "epoch": 3.869118442401298, + "grad_norm": 0.9302428364753723, + "learning_rate": 4.309762974123219e-06, + "loss": 1.8399, + "mean_token_accuracy": 0.5694246888160706, + "num_tokens": 7314163605.0, + "step": 14308 + }, + { + "epoch": 3.8693888588426177, + "grad_norm": 0.7897021770477295, + "learning_rate": 4.308708575874598e-06, + "loss": 1.7329, + "mean_token_accuracy": 0.5970827341079712, + "num_tokens": 7314687755.0, + "step": 14309 + }, + { + "epoch": 3.8696592752839374, + "grad_norm": 0.853172779083252, + "learning_rate": 4.307654382934473e-06, + "loss": 1.8424, + "mean_token_accuracy": 0.5705806016921997, + "num_tokens": 7315211994.0, + "step": 14310 + }, + { + "epoch": 3.869929691725257, + "grad_norm": 0.8614776730537415, + "learning_rate": 4.306600395335194e-06, + "loss": 1.8217, + "mean_token_accuracy": 0.5734955668449402, + "num_tokens": 7315736153.0, + "step": 14311 + }, + { + "epoch": 3.8702001081665767, + "grad_norm": 0.8666881918907166, + "learning_rate": 4.305546613109097e-06, + "loss": 1.8178, + "mean_token_accuracy": 0.5567559003829956, + "num_tokens": 7316260367.0, + "step": 14312 + }, + { + "epoch": 3.8704705246078963, + "grad_norm": 0.8918739557266235, + "learning_rate": 4.30449303628852e-06, + "loss": 1.8691, + "mean_token_accuracy": 0.5783267617225647, + "num_tokens": 7316784645.0, + "step": 14313 + }, + { + "epoch": 3.870740941049216, + "grad_norm": 0.8069599866867065, + "learning_rate": 4.303439664905787e-06, + "loss": 1.8581, + "mean_token_accuracy": 0.5888122320175171, + "num_tokens": 7317287597.0, + "step": 14314 + }, + { + "epoch": 3.871011357490535, + "grad_norm": 0.8776534199714661, + "learning_rate": 4.302386498993214e-06, + "loss": 1.848, + "mean_token_accuracy": 0.5716145038604736, + "num_tokens": 7317811765.0, + "step": 14315 + }, + { + "epoch": 3.8712817739318552, + "grad_norm": 0.8978070020675659, + "learning_rate": 4.301333538583125e-06, + "loss": 1.8981, + "mean_token_accuracy": 0.561435341835022, + "num_tokens": 7318336046.0, + "step": 14316 + }, + { + "epoch": 3.8715521903731744, + "grad_norm": 0.850055992603302, + "learning_rate": 4.30028078370782e-06, + "loss": 1.8295, + "mean_token_accuracy": 0.5733867883682251, + "num_tokens": 7318860241.0, + "step": 14317 + }, + { + "epoch": 3.8718226068144945, + "grad_norm": 0.925473690032959, + "learning_rate": 4.299228234399601e-06, + "loss": 1.832, + "mean_token_accuracy": 0.5750049352645874, + "num_tokens": 7319384451.0, + "step": 14318 + }, + { + "epoch": 3.8720930232558137, + "grad_norm": 1.0226476192474365, + "learning_rate": 4.298175890690769e-06, + "loss": 1.7302, + "mean_token_accuracy": 0.609192430973053, + "num_tokens": 7319899872.0, + "step": 14319 + }, + { + "epoch": 3.872363439697134, + "grad_norm": 1.1608954668045044, + "learning_rate": 4.2971237526136064e-06, + "loss": 1.814, + "mean_token_accuracy": 0.5771803855895996, + "num_tokens": 7320424138.0, + "step": 14320 + }, + { + "epoch": 3.872633856138453, + "grad_norm": 0.37957116961479187, + "learning_rate": 4.296071820200402e-06, + "loss": 1.2294, + "mean_token_accuracy": 0.6804932355880737, + "num_tokens": 7320908337.0, + "step": 14321 + }, + { + "epoch": 3.872904272579773, + "grad_norm": 1.15290367603302, + "learning_rate": 4.295020093483428e-06, + "loss": 1.9379, + "mean_token_accuracy": 0.5690381526947021, + "num_tokens": 7321400008.0, + "step": 14322 + }, + { + "epoch": 3.8731746890210923, + "grad_norm": 1.2901372909545898, + "learning_rate": 4.293968572494954e-06, + "loss": 1.6883, + "mean_token_accuracy": 0.5936921834945679, + "num_tokens": 7321924278.0, + "step": 14323 + }, + { + "epoch": 3.8734451054624124, + "grad_norm": 0.870604395866394, + "learning_rate": 4.292917257267245e-06, + "loss": 1.8649, + "mean_token_accuracy": 0.5771113038063049, + "num_tokens": 7322448338.0, + "step": 14324 + }, + { + "epoch": 3.8737155219037316, + "grad_norm": 0.8693553805351257, + "learning_rate": 4.291866147832562e-06, + "loss": 1.8249, + "mean_token_accuracy": 0.5682756900787354, + "num_tokens": 7322972237.0, + "step": 14325 + }, + { + "epoch": 3.8739859383450512, + "grad_norm": 1.036044955253601, + "learning_rate": 4.29081524422315e-06, + "loss": 1.5814, + "mean_token_accuracy": 0.6468404531478882, + "num_tokens": 7323414101.0, + "step": 14326 + }, + { + "epoch": 3.874256354786371, + "grad_norm": 1.0359176397323608, + "learning_rate": 4.2897645464712586e-06, + "loss": 1.8885, + "mean_token_accuracy": 0.5692579746246338, + "num_tokens": 7323911068.0, + "step": 14327 + }, + { + "epoch": 3.8745267712276905, + "grad_norm": 0.863658607006073, + "learning_rate": 4.288714054609125e-06, + "loss": 1.6973, + "mean_token_accuracy": 0.6095424890518188, + "num_tokens": 7324384446.0, + "step": 14328 + }, + { + "epoch": 3.87479718766901, + "grad_norm": 0.9719439148902893, + "learning_rate": 4.2876637686689785e-06, + "loss": 1.892, + "mean_token_accuracy": 0.5779217481613159, + "num_tokens": 7324887569.0, + "step": 14329 + }, + { + "epoch": 3.87506760411033, + "grad_norm": 0.9408445954322815, + "learning_rate": 4.286613688683049e-06, + "loss": 1.8611, + "mean_token_accuracy": 0.5760020017623901, + "num_tokens": 7325362464.0, + "step": 14330 + }, + { + "epoch": 3.8753380205516494, + "grad_norm": 0.8834949135780334, + "learning_rate": 4.285563814683554e-06, + "loss": 1.5985, + "mean_token_accuracy": 0.6092901229858398, + "num_tokens": 7325886494.0, + "step": 14331 + }, + { + "epoch": 3.875608436992969, + "grad_norm": 0.9182982444763184, + "learning_rate": 4.284514146702707e-06, + "loss": 1.7792, + "mean_token_accuracy": 0.5922882556915283, + "num_tokens": 7326410640.0, + "step": 14332 + }, + { + "epoch": 3.8758788534342887, + "grad_norm": 0.8737519979476929, + "learning_rate": 4.283464684772714e-06, + "loss": 1.9126, + "mean_token_accuracy": 0.5593464970588684, + "num_tokens": 7326934834.0, + "step": 14333 + }, + { + "epoch": 3.8761492698756084, + "grad_norm": 0.9299373626708984, + "learning_rate": 4.282415428925779e-06, + "loss": 1.8855, + "mean_token_accuracy": 0.5730514526367188, + "num_tokens": 7327421080.0, + "step": 14334 + }, + { + "epoch": 3.876419686316928, + "grad_norm": 0.9312782883644104, + "learning_rate": 4.28136637919409e-06, + "loss": 1.8605, + "mean_token_accuracy": 0.57940673828125, + "num_tokens": 7327945358.0, + "step": 14335 + }, + { + "epoch": 3.8766901027582477, + "grad_norm": 0.9094299674034119, + "learning_rate": 4.280317535609843e-06, + "loss": 1.9455, + "mean_token_accuracy": 0.5528671145439148, + "num_tokens": 7328469570.0, + "step": 14336 + }, + { + "epoch": 3.8769605191995673, + "grad_norm": 0.8133637309074402, + "learning_rate": 4.279268898205211e-06, + "loss": 1.7889, + "mean_token_accuracy": 0.6041918396949768, + "num_tokens": 7328945503.0, + "step": 14337 + }, + { + "epoch": 3.877230935640887, + "grad_norm": 0.7860866189002991, + "learning_rate": 4.278220467012376e-06, + "loss": 1.8122, + "mean_token_accuracy": 0.5871212482452393, + "num_tokens": 7329435295.0, + "step": 14338 + }, + { + "epoch": 3.8775013520822066, + "grad_norm": 0.9052566885948181, + "learning_rate": 4.277172242063503e-06, + "loss": 1.6426, + "mean_token_accuracy": 0.6321589946746826, + "num_tokens": 7329936005.0, + "step": 14339 + }, + { + "epoch": 3.8777717685235262, + "grad_norm": 0.7629485726356506, + "learning_rate": 4.276124223390758e-06, + "loss": 1.7389, + "mean_token_accuracy": 0.5746309757232666, + "num_tokens": 7330460151.0, + "step": 14340 + }, + { + "epoch": 3.878042184964846, + "grad_norm": 0.3500073552131653, + "learning_rate": 4.275076411026294e-06, + "loss": 1.1286, + "mean_token_accuracy": 0.7024326920509338, + "num_tokens": 7330984366.0, + "step": 14341 + }, + { + "epoch": 3.8783126014061655, + "grad_norm": 0.986701250076294, + "learning_rate": 4.274028805002264e-06, + "loss": 1.7832, + "mean_token_accuracy": 0.5775178670883179, + "num_tokens": 7331508650.0, + "step": 14342 + }, + { + "epoch": 3.878583017847485, + "grad_norm": 0.8043249249458313, + "learning_rate": 4.272981405350808e-06, + "loss": 1.8784, + "mean_token_accuracy": 0.5547184944152832, + "num_tokens": 7332032731.0, + "step": 14343 + }, + { + "epoch": 3.878853434288805, + "grad_norm": 0.9720568656921387, + "learning_rate": 4.271934212104067e-06, + "loss": 1.8609, + "mean_token_accuracy": 0.5748881101608276, + "num_tokens": 7332557013.0, + "step": 14344 + }, + { + "epoch": 3.8791238507301244, + "grad_norm": 0.9253379702568054, + "learning_rate": 4.270887225294171e-06, + "loss": 1.891, + "mean_token_accuracy": 0.575954258441925, + "num_tokens": 7333056716.0, + "step": 14345 + }, + { + "epoch": 3.879394267171444, + "grad_norm": 1.0988950729370117, + "learning_rate": 4.2698404449532415e-06, + "loss": 1.8491, + "mean_token_accuracy": 0.5789716243743896, + "num_tokens": 7333580920.0, + "step": 14346 + }, + { + "epoch": 3.8796646836127637, + "grad_norm": 1.0259239673614502, + "learning_rate": 4.268793871113402e-06, + "loss": 1.8091, + "mean_token_accuracy": 0.5861670970916748, + "num_tokens": 7334105100.0, + "step": 14347 + }, + { + "epoch": 3.8799351000540834, + "grad_norm": 0.8989647626876831, + "learning_rate": 4.2677475038067615e-06, + "loss": 1.855, + "mean_token_accuracy": 0.5668255090713501, + "num_tokens": 7334629360.0, + "step": 14348 + }, + { + "epoch": 3.880205516495403, + "grad_norm": 0.8835946917533875, + "learning_rate": 4.266701343065423e-06, + "loss": 1.8065, + "mean_token_accuracy": 0.5822408199310303, + "num_tokens": 7335153585.0, + "step": 14349 + }, + { + "epoch": 3.8804759329367227, + "grad_norm": 0.944176435470581, + "learning_rate": 4.265655388921491e-06, + "loss": 1.8484, + "mean_token_accuracy": 0.560113251209259, + "num_tokens": 7335677797.0, + "step": 14350 + }, + { + "epoch": 3.8807463493780423, + "grad_norm": 0.9191130995750427, + "learning_rate": 4.264609641407054e-06, + "loss": 1.9433, + "mean_token_accuracy": 0.5602186918258667, + "num_tokens": 7336201877.0, + "step": 14351 + }, + { + "epoch": 3.881016765819362, + "grad_norm": 0.8463121056556702, + "learning_rate": 4.263564100554204e-06, + "loss": 1.8553, + "mean_token_accuracy": 0.5640195608139038, + "num_tokens": 7336726127.0, + "step": 14352 + }, + { + "epoch": 3.8812871822606816, + "grad_norm": 0.9778814315795898, + "learning_rate": 4.262518766395014e-06, + "loss": 1.8668, + "mean_token_accuracy": 0.5586020946502686, + "num_tokens": 7337250335.0, + "step": 14353 + }, + { + "epoch": 3.8815575987020012, + "grad_norm": 0.8237777948379517, + "learning_rate": 4.2614736389615654e-06, + "loss": 1.6272, + "mean_token_accuracy": 0.608423113822937, + "num_tokens": 7337774607.0, + "step": 14354 + }, + { + "epoch": 3.881828015143321, + "grad_norm": 1.052241325378418, + "learning_rate": 4.2604287182859195e-06, + "loss": 1.9231, + "mean_token_accuracy": 0.5778889656066895, + "num_tokens": 7338234944.0, + "step": 14355 + }, + { + "epoch": 3.88209843158464, + "grad_norm": 0.83763188123703, + "learning_rate": 4.259384004400142e-06, + "loss": 1.947, + "mean_token_accuracy": 0.5689911842346191, + "num_tokens": 7338727176.0, + "step": 14356 + }, + { + "epoch": 3.88236884802596, + "grad_norm": 0.962203860282898, + "learning_rate": 4.2583394973362865e-06, + "loss": 1.965, + "mean_token_accuracy": 0.562355637550354, + "num_tokens": 7339251443.0, + "step": 14357 + }, + { + "epoch": 3.8826392644672794, + "grad_norm": 0.8099182844161987, + "learning_rate": 4.2572951971263995e-06, + "loss": 1.7333, + "mean_token_accuracy": 0.5930355191230774, + "num_tokens": 7339775643.0, + "step": 14358 + }, + { + "epoch": 3.8829096809085994, + "grad_norm": 1.0963664054870605, + "learning_rate": 4.256251103802527e-06, + "loss": 1.9157, + "mean_token_accuracy": 0.5711404085159302, + "num_tokens": 7340257227.0, + "step": 14359 + }, + { + "epoch": 3.8831800973499186, + "grad_norm": 0.9346554279327393, + "learning_rate": 4.255207217396702e-06, + "loss": 1.6276, + "mean_token_accuracy": 0.630833089351654, + "num_tokens": 7340751244.0, + "step": 14360 + }, + { + "epoch": 3.8834505137912387, + "grad_norm": 0.33640551567077637, + "learning_rate": 4.254163537940953e-06, + "loss": 1.1272, + "mean_token_accuracy": 0.6934071183204651, + "num_tokens": 7341275518.0, + "step": 14361 + }, + { + "epoch": 3.883720930232558, + "grad_norm": 1.1336591243743896, + "learning_rate": 4.253120065467307e-06, + "loss": 1.7967, + "mean_token_accuracy": 0.5881032943725586, + "num_tokens": 7341726620.0, + "step": 14362 + }, + { + "epoch": 3.883991346673878, + "grad_norm": 1.0264241695404053, + "learning_rate": 4.252076800007779e-06, + "loss": 1.7944, + "mean_token_accuracy": 0.5926781892776489, + "num_tokens": 7342250826.0, + "step": 14363 + }, + { + "epoch": 3.884261763115197, + "grad_norm": 0.8685559034347534, + "learning_rate": 4.251033741594377e-06, + "loss": 1.8206, + "mean_token_accuracy": 0.5681079626083374, + "num_tokens": 7342775094.0, + "step": 14364 + }, + { + "epoch": 3.8845321795565173, + "grad_norm": 0.8934335112571716, + "learning_rate": 4.249990890259108e-06, + "loss": 1.7442, + "mean_token_accuracy": 0.5884408950805664, + "num_tokens": 7343267730.0, + "step": 14365 + }, + { + "epoch": 3.8848025959978365, + "grad_norm": 0.9481157660484314, + "learning_rate": 4.2489482460339685e-06, + "loss": 1.8855, + "mean_token_accuracy": 0.5613505840301514, + "num_tokens": 7343791879.0, + "step": 14366 + }, + { + "epoch": 3.885073012439156, + "grad_norm": 1.0022401809692383, + "learning_rate": 4.247905808950949e-06, + "loss": 1.7987, + "mean_token_accuracy": 0.5807784795761108, + "num_tokens": 7344316119.0, + "step": 14367 + }, + { + "epoch": 3.885343428880476, + "grad_norm": 0.9464559555053711, + "learning_rate": 4.246863579042038e-06, + "loss": 1.7953, + "mean_token_accuracy": 0.5882877707481384, + "num_tokens": 7344840198.0, + "step": 14368 + }, + { + "epoch": 3.8856138453217954, + "grad_norm": 0.8665714859962463, + "learning_rate": 4.24582155633921e-06, + "loss": 1.8882, + "mean_token_accuracy": 0.5701782703399658, + "num_tokens": 7345317982.0, + "step": 14369 + }, + { + "epoch": 3.885884261763115, + "grad_norm": 1.1104594469070435, + "learning_rate": 4.244779740874443e-06, + "loss": 1.7859, + "mean_token_accuracy": 0.582126796245575, + "num_tokens": 7345842191.0, + "step": 14370 + }, + { + "epoch": 3.8861546782044347, + "grad_norm": 1.0546318292617798, + "learning_rate": 4.243738132679699e-06, + "loss": 1.9105, + "mean_token_accuracy": 0.5617178678512573, + "num_tokens": 7346366395.0, + "step": 14371 + }, + { + "epoch": 3.8864250946457544, + "grad_norm": 0.9728121757507324, + "learning_rate": 4.242696731786935e-06, + "loss": 1.8779, + "mean_token_accuracy": 0.5755783319473267, + "num_tokens": 7346867091.0, + "step": 14372 + }, + { + "epoch": 3.886695511087074, + "grad_norm": 0.8617754578590393, + "learning_rate": 4.24165553822811e-06, + "loss": 1.8177, + "mean_token_accuracy": 0.589024543762207, + "num_tokens": 7347391129.0, + "step": 14373 + }, + { + "epoch": 3.8869659275283936, + "grad_norm": 1.037125587463379, + "learning_rate": 4.240614552035168e-06, + "loss": 1.8157, + "mean_token_accuracy": 0.5717779397964478, + "num_tokens": 7347915157.0, + "step": 14374 + }, + { + "epoch": 3.8872363439697133, + "grad_norm": 0.8999741077423096, + "learning_rate": 4.239573773240046e-06, + "loss": 1.8009, + "mean_token_accuracy": 0.5638261437416077, + "num_tokens": 7348439431.0, + "step": 14375 + }, + { + "epoch": 3.887506760411033, + "grad_norm": 1.0484052896499634, + "learning_rate": 4.238533201874686e-06, + "loss": 1.7635, + "mean_token_accuracy": 0.6085877418518066, + "num_tokens": 7348863402.0, + "step": 14376 + }, + { + "epoch": 3.8877771768523526, + "grad_norm": 0.8876430988311768, + "learning_rate": 4.2374928379710104e-06, + "loss": 1.7925, + "mean_token_accuracy": 0.5779814720153809, + "num_tokens": 7349339104.0, + "step": 14377 + }, + { + "epoch": 3.888047593293672, + "grad_norm": 1.2080872058868408, + "learning_rate": 4.23645268156094e-06, + "loss": 1.5322, + "mean_token_accuracy": 0.6373216509819031, + "num_tokens": 7349863385.0, + "step": 14378 + }, + { + "epoch": 3.888318009734992, + "grad_norm": 0.8990398645401001, + "learning_rate": 4.235412732676394e-06, + "loss": 1.8601, + "mean_token_accuracy": 0.5719490051269531, + "num_tokens": 7350387647.0, + "step": 14379 + }, + { + "epoch": 3.8885884261763115, + "grad_norm": 0.860490083694458, + "learning_rate": 4.234372991349276e-06, + "loss": 1.7615, + "mean_token_accuracy": 0.5837100744247437, + "num_tokens": 7350911839.0, + "step": 14380 + }, + { + "epoch": 3.888858842617631, + "grad_norm": 0.3471318483352661, + "learning_rate": 4.233333457611491e-06, + "loss": 1.0997, + "mean_token_accuracy": 0.7116543054580688, + "num_tokens": 7351418293.0, + "step": 14381 + }, + { + "epoch": 3.889129259058951, + "grad_norm": 1.0937587022781372, + "learning_rate": 4.2322941314949376e-06, + "loss": 1.8177, + "mean_token_accuracy": 0.5836176872253418, + "num_tokens": 7351942564.0, + "step": 14382 + }, + { + "epoch": 3.8893996755002704, + "grad_norm": 0.9732063412666321, + "learning_rate": 4.231255013031502e-06, + "loss": 1.6817, + "mean_token_accuracy": 0.6021988987922668, + "num_tokens": 7352444828.0, + "step": 14383 + }, + { + "epoch": 3.88967009194159, + "grad_norm": 0.9242706298828125, + "learning_rate": 4.230216102253066e-06, + "loss": 1.7872, + "mean_token_accuracy": 0.602155864238739, + "num_tokens": 7352957281.0, + "step": 14384 + }, + { + "epoch": 3.8899405083829097, + "grad_norm": 0.7890920042991638, + "learning_rate": 4.229177399191512e-06, + "loss": 1.8705, + "mean_token_accuracy": 0.5603002309799194, + "num_tokens": 7353481469.0, + "step": 14385 + }, + { + "epoch": 3.8902109248242294, + "grad_norm": 0.8934500217437744, + "learning_rate": 4.228138903878707e-06, + "loss": 1.8411, + "mean_token_accuracy": 0.5724735260009766, + "num_tokens": 7353981988.0, + "step": 14386 + }, + { + "epoch": 3.890481341265549, + "grad_norm": 1.0484837293624878, + "learning_rate": 4.227100616346512e-06, + "loss": 1.8384, + "mean_token_accuracy": 0.5769433975219727, + "num_tokens": 7354506259.0, + "step": 14387 + }, + { + "epoch": 3.8907517577068687, + "grad_norm": 1.207667350769043, + "learning_rate": 4.226062536626791e-06, + "loss": 1.8064, + "mean_token_accuracy": 0.5659723281860352, + "num_tokens": 7355000361.0, + "step": 14388 + }, + { + "epoch": 3.8910221741481883, + "grad_norm": 0.8362321257591248, + "learning_rate": 4.22502466475139e-06, + "loss": 1.91, + "mean_token_accuracy": 0.552409291267395, + "num_tokens": 7355520421.0, + "step": 14389 + }, + { + "epoch": 3.891292590589508, + "grad_norm": 1.0119507312774658, + "learning_rate": 4.223987000752158e-06, + "loss": 1.8098, + "mean_token_accuracy": 0.5844968557357788, + "num_tokens": 7356044697.0, + "step": 14390 + }, + { + "epoch": 3.8915630070308276, + "grad_norm": 0.9136521816253662, + "learning_rate": 4.222949544660931e-06, + "loss": 1.8418, + "mean_token_accuracy": 0.5603108406066895, + "num_tokens": 7356568862.0, + "step": 14391 + }, + { + "epoch": 3.8918334234721472, + "grad_norm": 0.9895609021186829, + "learning_rate": 4.22191229650954e-06, + "loss": 1.9148, + "mean_token_accuracy": 0.5595649480819702, + "num_tokens": 7357040280.0, + "step": 14392 + }, + { + "epoch": 3.892103839913467, + "grad_norm": 0.8833563327789307, + "learning_rate": 4.220875256329817e-06, + "loss": 1.7424, + "mean_token_accuracy": 0.5681257247924805, + "num_tokens": 7357564496.0, + "step": 14393 + }, + { + "epoch": 3.8923742563547865, + "grad_norm": 1.6165367364883423, + "learning_rate": 4.2198384241535724e-06, + "loss": 1.7295, + "mean_token_accuracy": 0.5855929255485535, + "num_tokens": 7358088593.0, + "step": 14394 + }, + { + "epoch": 3.892644672796106, + "grad_norm": 0.9877492785453796, + "learning_rate": 4.218801800012628e-06, + "loss": 1.7996, + "mean_token_accuracy": 0.5760226249694824, + "num_tokens": 7358612816.0, + "step": 14395 + }, + { + "epoch": 3.892915089237426, + "grad_norm": 1.3788000345230103, + "learning_rate": 4.2177653839387835e-06, + "loss": 1.9768, + "mean_token_accuracy": 0.5393990874290466, + "num_tokens": 7359137029.0, + "step": 14396 + }, + { + "epoch": 3.893185505678745, + "grad_norm": 0.9863548874855042, + "learning_rate": 4.216729175963845e-06, + "loss": 1.8972, + "mean_token_accuracy": 0.5592570304870605, + "num_tokens": 7359661245.0, + "step": 14397 + }, + { + "epoch": 3.893455922120065, + "grad_norm": 0.9309565424919128, + "learning_rate": 4.215693176119601e-06, + "loss": 1.7796, + "mean_token_accuracy": 0.58099365234375, + "num_tokens": 7360185512.0, + "step": 14398 + }, + { + "epoch": 3.8937263385613843, + "grad_norm": 0.8751088380813599, + "learning_rate": 4.214657384437845e-06, + "loss": 1.6835, + "mean_token_accuracy": 0.6037421226501465, + "num_tokens": 7360698357.0, + "step": 14399 + }, + { + "epoch": 3.8939967550027044, + "grad_norm": 0.8549619913101196, + "learning_rate": 4.213621800950354e-06, + "loss": 1.7759, + "mean_token_accuracy": 0.5921569466590881, + "num_tokens": 7361222541.0, + "step": 14400 + }, + { + "epoch": 3.8942671714440236, + "grad_norm": 0.3927585780620575, + "learning_rate": 4.2125864256889015e-06, + "loss": 1.0695, + "mean_token_accuracy": 0.7109478712081909, + "num_tokens": 7361746822.0, + "step": 14401 + }, + { + "epoch": 3.8945375878853437, + "grad_norm": 0.8549200296401978, + "learning_rate": 4.2115512586852594e-06, + "loss": 1.8918, + "mean_token_accuracy": 0.5618488788604736, + "num_tokens": 7362270990.0, + "step": 14402 + }, + { + "epoch": 3.894808004326663, + "grad_norm": 0.9686017632484436, + "learning_rate": 4.210516299971189e-06, + "loss": 1.9807, + "mean_token_accuracy": 0.5549174547195435, + "num_tokens": 7362773802.0, + "step": 14403 + }, + { + "epoch": 3.895078420767983, + "grad_norm": 1.0148253440856934, + "learning_rate": 4.209481549578444e-06, + "loss": 1.574, + "mean_token_accuracy": 0.6290913820266724, + "num_tokens": 7363298036.0, + "step": 14404 + }, + { + "epoch": 3.895348837209302, + "grad_norm": 0.9854594469070435, + "learning_rate": 4.208447007538774e-06, + "loss": 1.6322, + "mean_token_accuracy": 0.6299012899398804, + "num_tokens": 7363761781.0, + "step": 14405 + }, + { + "epoch": 3.8956192536506222, + "grad_norm": 1.0998754501342773, + "learning_rate": 4.207412673883925e-06, + "loss": 1.9067, + "mean_token_accuracy": 0.5820472240447998, + "num_tokens": 7364240418.0, + "step": 14406 + }, + { + "epoch": 3.8958896700919414, + "grad_norm": 0.9478581547737122, + "learning_rate": 4.206378548645628e-06, + "loss": 1.7983, + "mean_token_accuracy": 0.5762975811958313, + "num_tokens": 7364764417.0, + "step": 14407 + }, + { + "epoch": 3.896160086533261, + "grad_norm": 0.9664052724838257, + "learning_rate": 4.205344631855618e-06, + "loss": 1.8583, + "mean_token_accuracy": 0.5712175369262695, + "num_tokens": 7365282880.0, + "step": 14408 + }, + { + "epoch": 3.8964305029745807, + "grad_norm": 0.7941681146621704, + "learning_rate": 4.2043109235456125e-06, + "loss": 1.9535, + "mean_token_accuracy": 0.5485028624534607, + "num_tokens": 7365807005.0, + "step": 14409 + }, + { + "epoch": 3.8967009194159004, + "grad_norm": 0.8342077732086182, + "learning_rate": 4.203277423747335e-06, + "loss": 1.8149, + "mean_token_accuracy": 0.5824112296104431, + "num_tokens": 7366331204.0, + "step": 14410 + }, + { + "epoch": 3.89697133585722, + "grad_norm": 0.8941748738288879, + "learning_rate": 4.202244132492495e-06, + "loss": 1.7511, + "mean_token_accuracy": 0.6002742052078247, + "num_tokens": 7366831122.0, + "step": 14411 + }, + { + "epoch": 3.8972417522985396, + "grad_norm": 1.0832980871200562, + "learning_rate": 4.2012110498127945e-06, + "loss": 1.8262, + "mean_token_accuracy": 0.5738120079040527, + "num_tokens": 7367306573.0, + "step": 14412 + }, + { + "epoch": 3.8975121687398593, + "grad_norm": 0.8806117177009583, + "learning_rate": 4.200178175739935e-06, + "loss": 1.8014, + "mean_token_accuracy": 0.606676459312439, + "num_tokens": 7367765958.0, + "step": 14413 + }, + { + "epoch": 3.897782585181179, + "grad_norm": 0.8861975073814392, + "learning_rate": 4.1991455103056055e-06, + "loss": 1.8418, + "mean_token_accuracy": 0.5732169151306152, + "num_tokens": 7368290009.0, + "step": 14414 + }, + { + "epoch": 3.8980530016224986, + "grad_norm": 0.9379578828811646, + "learning_rate": 4.19811305354149e-06, + "loss": 1.7133, + "mean_token_accuracy": 0.5961795449256897, + "num_tokens": 7368759686.0, + "step": 14415 + }, + { + "epoch": 3.898323418063818, + "grad_norm": 0.8720790147781372, + "learning_rate": 4.197080805479272e-06, + "loss": 1.8185, + "mean_token_accuracy": 0.5765446424484253, + "num_tokens": 7369283868.0, + "step": 14416 + }, + { + "epoch": 3.898593834505138, + "grad_norm": 1.0364481210708618, + "learning_rate": 4.19604876615062e-06, + "loss": 1.9406, + "mean_token_accuracy": 0.5617697834968567, + "num_tokens": 7369739851.0, + "step": 14417 + }, + { + "epoch": 3.8988642509464575, + "grad_norm": 0.9887011051177979, + "learning_rate": 4.195016935587199e-06, + "loss": 1.885, + "mean_token_accuracy": 0.575374186038971, + "num_tokens": 7370264000.0, + "step": 14418 + }, + { + "epoch": 3.899134667387777, + "grad_norm": 0.9058812260627747, + "learning_rate": 4.193985313820673e-06, + "loss": 1.8067, + "mean_token_accuracy": 0.5749146938323975, + "num_tokens": 7370788071.0, + "step": 14419 + }, + { + "epoch": 3.899405083829097, + "grad_norm": 0.8813717365264893, + "learning_rate": 4.192953900882693e-06, + "loss": 1.807, + "mean_token_accuracy": 0.5820751190185547, + "num_tokens": 7371312322.0, + "step": 14420 + }, + { + "epoch": 3.8996755002704164, + "grad_norm": 0.3549862504005432, + "learning_rate": 4.191922696804902e-06, + "loss": 1.1377, + "mean_token_accuracy": 0.704664945602417, + "num_tokens": 7371747780.0, + "step": 14421 + }, + { + "epoch": 3.899945916711736, + "grad_norm": 1.0492537021636963, + "learning_rate": 4.190891701618947e-06, + "loss": 1.886, + "mean_token_accuracy": 0.5822029709815979, + "num_tokens": 7372272028.0, + "step": 14422 + }, + { + "epoch": 3.9002163331530557, + "grad_norm": 1.0758285522460938, + "learning_rate": 4.189860915356456e-06, + "loss": 1.8872, + "mean_token_accuracy": 0.5715863704681396, + "num_tokens": 7372796307.0, + "step": 14423 + }, + { + "epoch": 3.9004867495943754, + "grad_norm": 0.9209342002868652, + "learning_rate": 4.18883033804906e-06, + "loss": 1.7532, + "mean_token_accuracy": 0.5903756618499756, + "num_tokens": 7373320491.0, + "step": 14424 + }, + { + "epoch": 3.900757166035695, + "grad_norm": 0.8335006237030029, + "learning_rate": 4.187799969728381e-06, + "loss": 1.9085, + "mean_token_accuracy": 0.5707917213439941, + "num_tokens": 7373844656.0, + "step": 14425 + }, + { + "epoch": 3.9010275824770146, + "grad_norm": 0.8861148357391357, + "learning_rate": 4.1867698104260345e-06, + "loss": 1.8795, + "mean_token_accuracy": 0.570603609085083, + "num_tokens": 7374368859.0, + "step": 14426 + }, + { + "epoch": 3.9012979989183343, + "grad_norm": 0.945537269115448, + "learning_rate": 4.185739860173623e-06, + "loss": 1.8654, + "mean_token_accuracy": 0.5742092728614807, + "num_tokens": 7374893074.0, + "step": 14427 + }, + { + "epoch": 3.901568415359654, + "grad_norm": 0.9474883675575256, + "learning_rate": 4.184710119002754e-06, + "loss": 1.9067, + "mean_token_accuracy": 0.5677149295806885, + "num_tokens": 7375417232.0, + "step": 14428 + }, + { + "epoch": 3.9018388318009736, + "grad_norm": 0.9109538197517395, + "learning_rate": 4.183680586945021e-06, + "loss": 1.8861, + "mean_token_accuracy": 0.5653982758522034, + "num_tokens": 7375907015.0, + "step": 14429 + }, + { + "epoch": 3.902109248242293, + "grad_norm": 0.9594467878341675, + "learning_rate": 4.182651264032011e-06, + "loss": 1.7171, + "mean_token_accuracy": 0.5761722326278687, + "num_tokens": 7376382545.0, + "step": 14430 + }, + { + "epoch": 3.902379664683613, + "grad_norm": 0.9817641973495483, + "learning_rate": 4.181622150295311e-06, + "loss": 1.8948, + "mean_token_accuracy": 0.5608210563659668, + "num_tokens": 7376906562.0, + "step": 14431 + }, + { + "epoch": 3.9026500811249325, + "grad_norm": 0.8524522185325623, + "learning_rate": 4.180593245766496e-06, + "loss": 1.762, + "mean_token_accuracy": 0.6000312566757202, + "num_tokens": 7377430806.0, + "step": 14432 + }, + { + "epoch": 3.902920497566252, + "grad_norm": 0.8880468606948853, + "learning_rate": 4.179564550477131e-06, + "loss": 1.7572, + "mean_token_accuracy": 0.5815910696983337, + "num_tokens": 7377955051.0, + "step": 14433 + }, + { + "epoch": 3.903190914007572, + "grad_norm": 0.9505771994590759, + "learning_rate": 4.1785360644587855e-06, + "loss": 1.909, + "mean_token_accuracy": 0.5657306909561157, + "num_tokens": 7378479194.0, + "step": 14434 + }, + { + "epoch": 3.9034613304488914, + "grad_norm": 0.8546794652938843, + "learning_rate": 4.177507787743013e-06, + "loss": 1.8519, + "mean_token_accuracy": 0.5633471012115479, + "num_tokens": 7379003370.0, + "step": 14435 + }, + { + "epoch": 3.903731746890211, + "grad_norm": 0.8875554203987122, + "learning_rate": 4.176479720361367e-06, + "loss": 1.8159, + "mean_token_accuracy": 0.5759004354476929, + "num_tokens": 7379527517.0, + "step": 14436 + }, + { + "epoch": 3.9040021633315307, + "grad_norm": 1.0661729574203491, + "learning_rate": 4.175451862345388e-06, + "loss": 1.8425, + "mean_token_accuracy": 0.576492428779602, + "num_tokens": 7380007935.0, + "step": 14437 + }, + { + "epoch": 3.90427257977285, + "grad_norm": 0.9279312491416931, + "learning_rate": 4.174424213726617e-06, + "loss": 1.7403, + "mean_token_accuracy": 0.6035371422767639, + "num_tokens": 7380499412.0, + "step": 14438 + }, + { + "epoch": 3.90454299621417, + "grad_norm": 0.9213525056838989, + "learning_rate": 4.173396774536581e-06, + "loss": 1.9234, + "mean_token_accuracy": 0.5643543004989624, + "num_tokens": 7381023574.0, + "step": 14439 + }, + { + "epoch": 3.904813412655489, + "grad_norm": 0.8595417141914368, + "learning_rate": 4.172369544806811e-06, + "loss": 1.7749, + "mean_token_accuracy": 0.5878565907478333, + "num_tokens": 7381547768.0, + "step": 14440 + }, + { + "epoch": 3.9050838290968093, + "grad_norm": 0.3335789144039154, + "learning_rate": 4.17134252456882e-06, + "loss": 1.0958, + "mean_token_accuracy": 0.7040998935699463, + "num_tokens": 7382071945.0, + "step": 14441 + }, + { + "epoch": 3.9053542455381285, + "grad_norm": 1.2131949663162231, + "learning_rate": 4.170315713854124e-06, + "loss": 1.8913, + "mean_token_accuracy": 0.5779839754104614, + "num_tokens": 7382553553.0, + "step": 14442 + }, + { + "epoch": 3.9056246619794486, + "grad_norm": 0.9339151978492737, + "learning_rate": 4.169289112694226e-06, + "loss": 1.7156, + "mean_token_accuracy": 0.6046677231788635, + "num_tokens": 7383077690.0, + "step": 14443 + }, + { + "epoch": 3.9058950784207678, + "grad_norm": 0.987826943397522, + "learning_rate": 4.168262721120625e-06, + "loss": 1.8807, + "mean_token_accuracy": 0.5726373195648193, + "num_tokens": 7383601976.0, + "step": 14444 + }, + { + "epoch": 3.906165494862088, + "grad_norm": 1.0149544477462769, + "learning_rate": 4.167236539164816e-06, + "loss": 1.9234, + "mean_token_accuracy": 0.5469492077827454, + "num_tokens": 7384126114.0, + "step": 14445 + }, + { + "epoch": 3.906435911303407, + "grad_norm": 0.9126467108726501, + "learning_rate": 4.1662105668582845e-06, + "loss": 1.7842, + "mean_token_accuracy": 0.5868985652923584, + "num_tokens": 7384650196.0, + "step": 14446 + }, + { + "epoch": 3.906706327744727, + "grad_norm": 0.9735530018806458, + "learning_rate": 4.165184804232506e-06, + "loss": 1.8294, + "mean_token_accuracy": 0.5634101629257202, + "num_tokens": 7385174355.0, + "step": 14447 + }, + { + "epoch": 3.9069767441860463, + "grad_norm": 0.9054187536239624, + "learning_rate": 4.164159251318962e-06, + "loss": 1.8352, + "mean_token_accuracy": 0.5727167725563049, + "num_tokens": 7385698515.0, + "step": 14448 + }, + { + "epoch": 3.907247160627366, + "grad_norm": 0.8449986577033997, + "learning_rate": 4.163133908149113e-06, + "loss": 1.7683, + "mean_token_accuracy": 0.5774587392807007, + "num_tokens": 7386222750.0, + "step": 14449 + }, + { + "epoch": 3.9075175770686856, + "grad_norm": 0.8426034450531006, + "learning_rate": 4.16210877475442e-06, + "loss": 1.7739, + "mean_token_accuracy": 0.5932482481002808, + "num_tokens": 7386722712.0, + "step": 14450 + }, + { + "epoch": 3.9077879935100053, + "grad_norm": 0.9722859859466553, + "learning_rate": 4.161083851166339e-06, + "loss": 1.7953, + "mean_token_accuracy": 0.581409215927124, + "num_tokens": 7387246911.0, + "step": 14451 + }, + { + "epoch": 3.908058409951325, + "grad_norm": 1.0018643140792847, + "learning_rate": 4.160059137416319e-06, + "loss": 1.8412, + "mean_token_accuracy": 0.5693014860153198, + "num_tokens": 7387740838.0, + "step": 14452 + }, + { + "epoch": 3.9083288263926446, + "grad_norm": 0.9046353101730347, + "learning_rate": 4.1590346335357975e-06, + "loss": 1.8503, + "mean_token_accuracy": 0.567121148109436, + "num_tokens": 7388237080.0, + "step": 14453 + }, + { + "epoch": 3.908599242833964, + "grad_norm": 1.0265603065490723, + "learning_rate": 4.158010339556215e-06, + "loss": 1.8145, + "mean_token_accuracy": 0.5877764821052551, + "num_tokens": 7388761313.0, + "step": 14454 + }, + { + "epoch": 3.908869659275284, + "grad_norm": 1.1467071771621704, + "learning_rate": 4.156986255508996e-06, + "loss": 1.8705, + "mean_token_accuracy": 0.5837453603744507, + "num_tokens": 7389285596.0, + "step": 14455 + }, + { + "epoch": 3.9091400757166035, + "grad_norm": 0.9432716965675354, + "learning_rate": 4.155962381425559e-06, + "loss": 1.8729, + "mean_token_accuracy": 0.5711483955383301, + "num_tokens": 7389809879.0, + "step": 14456 + }, + { + "epoch": 3.909410492157923, + "grad_norm": 0.9730218648910522, + "learning_rate": 4.154938717337326e-06, + "loss": 1.9481, + "mean_token_accuracy": 0.549527645111084, + "num_tokens": 7390292275.0, + "step": 14457 + }, + { + "epoch": 3.9096809085992428, + "grad_norm": 0.8771243691444397, + "learning_rate": 4.153915263275701e-06, + "loss": 1.8347, + "mean_token_accuracy": 0.58304762840271, + "num_tokens": 7390794873.0, + "step": 14458 + }, + { + "epoch": 3.9099513250405624, + "grad_norm": 1.0726022720336914, + "learning_rate": 4.152892019272091e-06, + "loss": 1.7436, + "mean_token_accuracy": 0.595766007900238, + "num_tokens": 7391319081.0, + "step": 14459 + }, + { + "epoch": 3.910221741481882, + "grad_norm": 1.0106565952301025, + "learning_rate": 4.1518689853578905e-06, + "loss": 1.8535, + "mean_token_accuracy": 0.5799202919006348, + "num_tokens": 7391822199.0, + "step": 14460 + }, + { + "epoch": 3.9104921579232017, + "grad_norm": 0.34005364775657654, + "learning_rate": 4.150846161564485e-06, + "loss": 1.0657, + "mean_token_accuracy": 0.7175098657608032, + "num_tokens": 7392346396.0, + "step": 14461 + }, + { + "epoch": 3.9107625743645213, + "grad_norm": 1.0123591423034668, + "learning_rate": 4.1498235479232655e-06, + "loss": 1.7968, + "mean_token_accuracy": 0.5644515156745911, + "num_tokens": 7392826058.0, + "step": 14462 + }, + { + "epoch": 3.911032990805841, + "grad_norm": 0.9540358781814575, + "learning_rate": 4.148801144465603e-06, + "loss": 1.8343, + "mean_token_accuracy": 0.5656367540359497, + "num_tokens": 7393350317.0, + "step": 14463 + }, + { + "epoch": 3.9113034072471606, + "grad_norm": 1.0042483806610107, + "learning_rate": 4.147778951222867e-06, + "loss": 1.8167, + "mean_token_accuracy": 0.5767165422439575, + "num_tokens": 7393837992.0, + "step": 14464 + }, + { + "epoch": 3.9115738236884803, + "grad_norm": 0.9636335372924805, + "learning_rate": 4.146756968226428e-06, + "loss": 1.7801, + "mean_token_accuracy": 0.5668268203735352, + "num_tokens": 7394362252.0, + "step": 14465 + }, + { + "epoch": 3.9118442401298, + "grad_norm": 0.8350598216056824, + "learning_rate": 4.145735195507635e-06, + "loss": 1.8328, + "mean_token_accuracy": 0.5739849209785461, + "num_tokens": 7394876250.0, + "step": 14466 + }, + { + "epoch": 3.9121146565711196, + "grad_norm": 1.1106377840042114, + "learning_rate": 4.1447136330978434e-06, + "loss": 1.8919, + "mean_token_accuracy": 0.6000634431838989, + "num_tokens": 7395276794.0, + "step": 14467 + }, + { + "epoch": 3.912385073012439, + "grad_norm": 0.8243338465690613, + "learning_rate": 4.143692281028401e-06, + "loss": 1.906, + "mean_token_accuracy": 0.5744612812995911, + "num_tokens": 7395749716.0, + "step": 14468 + }, + { + "epoch": 3.912655489453759, + "grad_norm": 1.004846453666687, + "learning_rate": 4.142671139330642e-06, + "loss": 1.7549, + "mean_token_accuracy": 0.5936601161956787, + "num_tokens": 7396273973.0, + "step": 14469 + }, + { + "epoch": 3.9129259058950785, + "grad_norm": 0.7551760077476501, + "learning_rate": 4.141650208035896e-06, + "loss": 1.8953, + "mean_token_accuracy": 0.5607703328132629, + "num_tokens": 7396798180.0, + "step": 14470 + }, + { + "epoch": 3.913196322336398, + "grad_norm": 0.8258396983146667, + "learning_rate": 4.140629487175494e-06, + "loss": 1.7923, + "mean_token_accuracy": 0.574315071105957, + "num_tokens": 7397322347.0, + "step": 14471 + }, + { + "epoch": 3.913466738777718, + "grad_norm": 0.9275895953178406, + "learning_rate": 4.13960897678075e-06, + "loss": 1.8195, + "mean_token_accuracy": 0.5767809152603149, + "num_tokens": 7397846623.0, + "step": 14472 + }, + { + "epoch": 3.9137371552190374, + "grad_norm": 0.8491427898406982, + "learning_rate": 4.1385886768829764e-06, + "loss": 1.8969, + "mean_token_accuracy": 0.5628868341445923, + "num_tokens": 7398370872.0, + "step": 14473 + }, + { + "epoch": 3.914007571660357, + "grad_norm": 0.7963190674781799, + "learning_rate": 4.137568587513482e-06, + "loss": 1.7837, + "mean_token_accuracy": 0.5641282796859741, + "num_tokens": 7398895054.0, + "step": 14474 + }, + { + "epoch": 3.9142779881016767, + "grad_norm": 0.8636974692344666, + "learning_rate": 4.136548708703565e-06, + "loss": 1.8365, + "mean_token_accuracy": 0.5727546215057373, + "num_tokens": 7399419299.0, + "step": 14475 + }, + { + "epoch": 3.9145484045429964, + "grad_norm": 0.8887652158737183, + "learning_rate": 4.135529040484515e-06, + "loss": 1.9641, + "mean_token_accuracy": 0.565410852432251, + "num_tokens": 7399909071.0, + "step": 14476 + }, + { + "epoch": 3.914818820984316, + "grad_norm": 0.89698326587677, + "learning_rate": 4.134509582887624e-06, + "loss": 1.7169, + "mean_token_accuracy": 0.605751633644104, + "num_tokens": 7400433338.0, + "step": 14477 + }, + { + "epoch": 3.9150892374256356, + "grad_norm": 0.8290793299674988, + "learning_rate": 4.133490335944168e-06, + "loss": 1.7542, + "mean_token_accuracy": 0.5917342901229858, + "num_tokens": 7400957530.0, + "step": 14478 + }, + { + "epoch": 3.915359653866955, + "grad_norm": 0.8948462605476379, + "learning_rate": 4.13247129968542e-06, + "loss": 1.9387, + "mean_token_accuracy": 0.5677465200424194, + "num_tokens": 7401479932.0, + "step": 14479 + }, + { + "epoch": 3.915630070308275, + "grad_norm": 0.9052409529685974, + "learning_rate": 4.1314524741426475e-06, + "loss": 1.8952, + "mean_token_accuracy": 0.5850785970687866, + "num_tokens": 7401943937.0, + "step": 14480 + }, + { + "epoch": 3.915900486749594, + "grad_norm": 0.35577407479286194, + "learning_rate": 4.130433859347116e-06, + "loss": 1.101, + "mean_token_accuracy": 0.7149525880813599, + "num_tokens": 7402409349.0, + "step": 14481 + }, + { + "epoch": 3.916170903190914, + "grad_norm": 0.9487166404724121, + "learning_rate": 4.129415455330074e-06, + "loss": 1.8958, + "mean_token_accuracy": 0.5609594583511353, + "num_tokens": 7402933587.0, + "step": 14482 + }, + { + "epoch": 3.9164413196322334, + "grad_norm": 0.927034318447113, + "learning_rate": 4.128397262122771e-06, + "loss": 1.7858, + "mean_token_accuracy": 0.5723724365234375, + "num_tokens": 7403457801.0, + "step": 14483 + }, + { + "epoch": 3.9167117360735535, + "grad_norm": 0.7525549530982971, + "learning_rate": 4.127379279756447e-06, + "loss": 1.853, + "mean_token_accuracy": 0.5640965700149536, + "num_tokens": 7403982083.0, + "step": 14484 + }, + { + "epoch": 3.9169821525148727, + "grad_norm": 0.8122437596321106, + "learning_rate": 4.12636150826234e-06, + "loss": 1.8839, + "mean_token_accuracy": 0.5624295473098755, + "num_tokens": 7404448477.0, + "step": 14485 + }, + { + "epoch": 3.917252568956193, + "grad_norm": 0.9380158185958862, + "learning_rate": 4.125343947671676e-06, + "loss": 1.7196, + "mean_token_accuracy": 0.6018317341804504, + "num_tokens": 7404937246.0, + "step": 14486 + }, + { + "epoch": 3.917522985397512, + "grad_norm": 0.9039159417152405, + "learning_rate": 4.124326598015674e-06, + "loss": 1.8952, + "mean_token_accuracy": 0.5613986253738403, + "num_tokens": 7405461520.0, + "step": 14487 + }, + { + "epoch": 3.917793401838832, + "grad_norm": 0.9696972966194153, + "learning_rate": 4.123309459325556e-06, + "loss": 1.8244, + "mean_token_accuracy": 0.5730568170547485, + "num_tokens": 7405985593.0, + "step": 14488 + }, + { + "epoch": 3.9180638182801513, + "grad_norm": 0.9554545879364014, + "learning_rate": 4.122292531632525e-06, + "loss": 1.9158, + "mean_token_accuracy": 0.5689741373062134, + "num_tokens": 7406509798.0, + "step": 14489 + }, + { + "epoch": 3.918334234721471, + "grad_norm": 1.024125576019287, + "learning_rate": 4.121275814967784e-06, + "loss": 1.7285, + "mean_token_accuracy": 0.5924910306930542, + "num_tokens": 7407033952.0, + "step": 14490 + }, + { + "epoch": 3.9186046511627906, + "grad_norm": 0.952806830406189, + "learning_rate": 4.120259309362532e-06, + "loss": 1.8575, + "mean_token_accuracy": 0.5582265853881836, + "num_tokens": 7407558159.0, + "step": 14491 + }, + { + "epoch": 3.91887506760411, + "grad_norm": 0.8919315934181213, + "learning_rate": 4.119243014847955e-06, + "loss": 1.8128, + "mean_token_accuracy": 0.5764767527580261, + "num_tokens": 7408082383.0, + "step": 14492 + }, + { + "epoch": 3.91914548404543, + "grad_norm": 0.9107387065887451, + "learning_rate": 4.1182269314552355e-06, + "loss": 1.8537, + "mean_token_accuracy": 0.5830320119857788, + "num_tokens": 7408606654.0, + "step": 14493 + }, + { + "epoch": 3.9194159004867495, + "grad_norm": 0.9792172908782959, + "learning_rate": 4.1172110592155535e-06, + "loss": 1.8357, + "mean_token_accuracy": 0.5779222846031189, + "num_tokens": 7409130879.0, + "step": 14494 + }, + { + "epoch": 3.919686316928069, + "grad_norm": 0.9072718024253845, + "learning_rate": 4.1161953981600775e-06, + "loss": 1.9264, + "mean_token_accuracy": 0.5523967146873474, + "num_tokens": 7409655110.0, + "step": 14495 + }, + { + "epoch": 3.9199567333693888, + "grad_norm": 0.9914073348045349, + "learning_rate": 4.11517994831997e-06, + "loss": 1.8549, + "mean_token_accuracy": 0.5720586180686951, + "num_tokens": 7410179319.0, + "step": 14496 + }, + { + "epoch": 3.9202271498107084, + "grad_norm": 1.0178428888320923, + "learning_rate": 4.11416470972639e-06, + "loss": 1.7758, + "mean_token_accuracy": 0.5846574306488037, + "num_tokens": 7410703553.0, + "step": 14497 + }, + { + "epoch": 3.920497566252028, + "grad_norm": 0.9413741230964661, + "learning_rate": 4.113149682410487e-06, + "loss": 1.9078, + "mean_token_accuracy": 0.5583469867706299, + "num_tokens": 7411227766.0, + "step": 14498 + }, + { + "epoch": 3.9207679826933477, + "grad_norm": 0.870104968547821, + "learning_rate": 4.112134866403401e-06, + "loss": 1.8172, + "mean_token_accuracy": 0.573509693145752, + "num_tokens": 7411752023.0, + "step": 14499 + }, + { + "epoch": 3.9210383991346673, + "grad_norm": 1.166735291481018, + "learning_rate": 4.111120261736277e-06, + "loss": 1.8598, + "mean_token_accuracy": 0.5897077322006226, + "num_tokens": 7412249959.0, + "step": 14500 + }, + { + "epoch": 3.921308815575987, + "grad_norm": 0.33901461958885193, + "learning_rate": 4.110105868440241e-06, + "loss": 1.101, + "mean_token_accuracy": 0.687244176864624, + "num_tokens": 7412774205.0, + "step": 14501 + }, + { + "epoch": 3.9215792320173066, + "grad_norm": 0.980000913143158, + "learning_rate": 4.109091686546417e-06, + "loss": 1.8336, + "mean_token_accuracy": 0.5741725564002991, + "num_tokens": 7413298359.0, + "step": 14502 + }, + { + "epoch": 3.9218496484586263, + "grad_norm": 1.050544261932373, + "learning_rate": 4.108077716085928e-06, + "loss": 1.9217, + "mean_token_accuracy": 0.5645641088485718, + "num_tokens": 7413822626.0, + "step": 14503 + }, + { + "epoch": 3.922120064899946, + "grad_norm": 0.9204146862030029, + "learning_rate": 4.107063957089882e-06, + "loss": 1.9001, + "mean_token_accuracy": 0.55515456199646, + "num_tokens": 7414346833.0, + "step": 14504 + }, + { + "epoch": 3.9223904813412656, + "grad_norm": 0.8303235769271851, + "learning_rate": 4.106050409589382e-06, + "loss": 1.8487, + "mean_token_accuracy": 0.5639661550521851, + "num_tokens": 7414871034.0, + "step": 14505 + }, + { + "epoch": 3.922660897782585, + "grad_norm": 0.9372989535331726, + "learning_rate": 4.105037073615532e-06, + "loss": 1.6227, + "mean_token_accuracy": 0.628575325012207, + "num_tokens": 7415395282.0, + "step": 14506 + }, + { + "epoch": 3.922931314223905, + "grad_norm": 0.8113872408866882, + "learning_rate": 4.104023949199419e-06, + "loss": 1.8618, + "mean_token_accuracy": 0.576436460018158, + "num_tokens": 7415919460.0, + "step": 14507 + }, + { + "epoch": 3.9232017306652245, + "grad_norm": 0.8489216566085815, + "learning_rate": 4.103011036372133e-06, + "loss": 1.8916, + "mean_token_accuracy": 0.5674771070480347, + "num_tokens": 7416443581.0, + "step": 14508 + }, + { + "epoch": 3.923472147106544, + "grad_norm": 0.9212630391120911, + "learning_rate": 4.101998335164748e-06, + "loss": 1.8478, + "mean_token_accuracy": 0.5675451755523682, + "num_tokens": 7416967837.0, + "step": 14509 + }, + { + "epoch": 3.9237425635478638, + "grad_norm": 1.0433112382888794, + "learning_rate": 4.10098584560834e-06, + "loss": 1.8623, + "mean_token_accuracy": 0.5804794430732727, + "num_tokens": 7417492116.0, + "step": 14510 + }, + { + "epoch": 3.9240129799891834, + "grad_norm": 0.8329381346702576, + "learning_rate": 4.0999735677339795e-06, + "loss": 1.8684, + "mean_token_accuracy": 0.5747677087783813, + "num_tokens": 7418016387.0, + "step": 14511 + }, + { + "epoch": 3.924283396430503, + "grad_norm": 0.9911014437675476, + "learning_rate": 4.0989615015727205e-06, + "loss": 1.873, + "mean_token_accuracy": 0.577950119972229, + "num_tokens": 7418540599.0, + "step": 14512 + }, + { + "epoch": 3.9245538128718227, + "grad_norm": 0.9551249146461487, + "learning_rate": 4.097949647155614e-06, + "loss": 1.8031, + "mean_token_accuracy": 0.5843877792358398, + "num_tokens": 7419064878.0, + "step": 14513 + }, + { + "epoch": 3.9248242293131423, + "grad_norm": 0.9439067840576172, + "learning_rate": 4.096938004513713e-06, + "loss": 1.8725, + "mean_token_accuracy": 0.5780576467514038, + "num_tokens": 7419589058.0, + "step": 14514 + }, + { + "epoch": 3.925094645754462, + "grad_norm": 0.9688807725906372, + "learning_rate": 4.095926573678054e-06, + "loss": 1.8144, + "mean_token_accuracy": 0.5851004719734192, + "num_tokens": 7420113316.0, + "step": 14515 + }, + { + "epoch": 3.9253650621957816, + "grad_norm": 0.8579368591308594, + "learning_rate": 4.094915354679669e-06, + "loss": 1.8378, + "mean_token_accuracy": 0.5714788436889648, + "num_tokens": 7420637593.0, + "step": 14516 + }, + { + "epoch": 3.9256354786371013, + "grad_norm": 0.9749387502670288, + "learning_rate": 4.09390434754959e-06, + "loss": 1.8668, + "mean_token_accuracy": 0.5613951086997986, + "num_tokens": 7421161833.0, + "step": 14517 + }, + { + "epoch": 3.925905895078421, + "grad_norm": 1.013915777206421, + "learning_rate": 4.092893552318835e-06, + "loss": 1.9204, + "mean_token_accuracy": 0.5616177916526794, + "num_tokens": 7421686115.0, + "step": 14518 + }, + { + "epoch": 3.9261763115197406, + "grad_norm": 1.0391209125518799, + "learning_rate": 4.091882969018417e-06, + "loss": 1.7731, + "mean_token_accuracy": 0.5893049240112305, + "num_tokens": 7422210274.0, + "step": 14519 + }, + { + "epoch": 3.9264467279610598, + "grad_norm": 0.9666358828544617, + "learning_rate": 4.090872597679346e-06, + "loss": 1.7916, + "mean_token_accuracy": 0.5686237812042236, + "num_tokens": 7422696296.0, + "step": 14520 + }, + { + "epoch": 3.92671714440238, + "grad_norm": 0.3412100672721863, + "learning_rate": 4.0898624383326204e-06, + "loss": 0.9585, + "mean_token_accuracy": 0.7424546480178833, + "num_tokens": 7423182095.0, + "step": 14521 + }, + { + "epoch": 3.926987560843699, + "grad_norm": 1.036197304725647, + "learning_rate": 4.088852491009238e-06, + "loss": 1.835, + "mean_token_accuracy": 0.5733899474143982, + "num_tokens": 7423706375.0, + "step": 14522 + }, + { + "epoch": 3.927257977285019, + "grad_norm": 0.9754869937896729, + "learning_rate": 4.087842755740184e-06, + "loss": 1.758, + "mean_token_accuracy": 0.5857584476470947, + "num_tokens": 7424191033.0, + "step": 14523 + }, + { + "epoch": 3.9275283937263383, + "grad_norm": 0.9143100380897522, + "learning_rate": 4.086833232556444e-06, + "loss": 1.8569, + "mean_token_accuracy": 0.5826597213745117, + "num_tokens": 7424715255.0, + "step": 14524 + }, + { + "epoch": 3.9277988101676584, + "grad_norm": 0.9069764614105225, + "learning_rate": 4.0858239214889885e-06, + "loss": 1.7998, + "mean_token_accuracy": 0.5967837572097778, + "num_tokens": 7425239525.0, + "step": 14525 + }, + { + "epoch": 3.9280692266089776, + "grad_norm": 0.9619664549827576, + "learning_rate": 4.084814822568791e-06, + "loss": 1.8556, + "mean_token_accuracy": 0.5696674585342407, + "num_tokens": 7425763804.0, + "step": 14526 + }, + { + "epoch": 3.9283396430502977, + "grad_norm": 0.9690373539924622, + "learning_rate": 4.08380593582681e-06, + "loss": 1.8937, + "mean_token_accuracy": 0.5683275461196899, + "num_tokens": 7426288063.0, + "step": 14527 + }, + { + "epoch": 3.928610059491617, + "grad_norm": 1.0250554084777832, + "learning_rate": 4.082797261294e-06, + "loss": 1.8304, + "mean_token_accuracy": 0.5788735747337341, + "num_tokens": 7426812342.0, + "step": 14528 + }, + { + "epoch": 3.928880475932937, + "grad_norm": 0.954437255859375, + "learning_rate": 4.081788799001315e-06, + "loss": 1.7918, + "mean_token_accuracy": 0.593382716178894, + "num_tokens": 7427323945.0, + "step": 14529 + }, + { + "epoch": 3.929150892374256, + "grad_norm": 0.8239107131958008, + "learning_rate": 4.080780548979691e-06, + "loss": 1.8177, + "mean_token_accuracy": 0.586643397808075, + "num_tokens": 7427848217.0, + "step": 14530 + }, + { + "epoch": 3.929421308815576, + "grad_norm": 1.1394256353378296, + "learning_rate": 4.079772511260072e-06, + "loss": 1.6667, + "mean_token_accuracy": 0.6243385076522827, + "num_tokens": 7428372341.0, + "step": 14531 + }, + { + "epoch": 3.9296917252568955, + "grad_norm": 1.1455070972442627, + "learning_rate": 4.078764685873383e-06, + "loss": 1.7839, + "mean_token_accuracy": 0.5881669521331787, + "num_tokens": 7428896502.0, + "step": 14532 + }, + { + "epoch": 3.929962141698215, + "grad_norm": 0.8700107336044312, + "learning_rate": 4.077757072850546e-06, + "loss": 1.7813, + "mean_token_accuracy": 0.573422372341156, + "num_tokens": 7429393798.0, + "step": 14533 + }, + { + "epoch": 3.9302325581395348, + "grad_norm": 1.0035371780395508, + "learning_rate": 4.07674967222248e-06, + "loss": 1.6373, + "mean_token_accuracy": 0.6070952415466309, + "num_tokens": 7429899312.0, + "step": 14534 + }, + { + "epoch": 3.9305029745808544, + "grad_norm": 0.9388003945350647, + "learning_rate": 4.0757424840200935e-06, + "loss": 1.7948, + "mean_token_accuracy": 0.5604593753814697, + "num_tokens": 7430423540.0, + "step": 14535 + }, + { + "epoch": 3.930773391022174, + "grad_norm": 1.144527554512024, + "learning_rate": 4.074735508274289e-06, + "loss": 1.91, + "mean_token_accuracy": 0.5714922547340393, + "num_tokens": 7430947649.0, + "step": 14536 + }, + { + "epoch": 3.9310438074634937, + "grad_norm": 1.067013144493103, + "learning_rate": 4.073728745015965e-06, + "loss": 1.8578, + "mean_token_accuracy": 0.599165678024292, + "num_tokens": 7431415019.0, + "step": 14537 + }, + { + "epoch": 3.9313142239048133, + "grad_norm": 1.0315799713134766, + "learning_rate": 4.072722194276014e-06, + "loss": 1.8292, + "mean_token_accuracy": 0.5854966640472412, + "num_tokens": 7431939128.0, + "step": 14538 + }, + { + "epoch": 3.931584640346133, + "grad_norm": 1.1485241651535034, + "learning_rate": 4.071715856085315e-06, + "loss": 1.8576, + "mean_token_accuracy": 0.5710274577140808, + "num_tokens": 7432463244.0, + "step": 14539 + }, + { + "epoch": 3.9318550567874526, + "grad_norm": 1.0148100852966309, + "learning_rate": 4.070709730474752e-06, + "loss": 1.8324, + "mean_token_accuracy": 0.5825561285018921, + "num_tokens": 7432940077.0, + "step": 14540 + }, + { + "epoch": 3.9321254732287723, + "grad_norm": 0.4434688091278076, + "learning_rate": 4.069703817475191e-06, + "loss": 1.0937, + "mean_token_accuracy": 0.6767796277999878, + "num_tokens": 7433426149.0, + "step": 14541 + }, + { + "epoch": 3.932395889670092, + "grad_norm": 0.891097903251648, + "learning_rate": 4.068698117117496e-06, + "loss": 1.8314, + "mean_token_accuracy": 0.5805172324180603, + "num_tokens": 7433939044.0, + "step": 14542 + }, + { + "epoch": 3.9326663061114115, + "grad_norm": 0.9752552509307861, + "learning_rate": 4.067692629432529e-06, + "loss": 1.8638, + "mean_token_accuracy": 0.6017733812332153, + "num_tokens": 7434398384.0, + "step": 14543 + }, + { + "epoch": 3.932936722552731, + "grad_norm": 1.0389662981033325, + "learning_rate": 4.066687354451137e-06, + "loss": 1.8394, + "mean_token_accuracy": 0.5673401355743408, + "num_tokens": 7434922524.0, + "step": 14544 + }, + { + "epoch": 3.933207138994051, + "grad_norm": 0.8429396152496338, + "learning_rate": 4.065682292204166e-06, + "loss": 1.6776, + "mean_token_accuracy": 0.6086599826812744, + "num_tokens": 7435446773.0, + "step": 14545 + }, + { + "epoch": 3.9334775554353705, + "grad_norm": 0.9583624601364136, + "learning_rate": 4.064677442722457e-06, + "loss": 1.6453, + "mean_token_accuracy": 0.622391939163208, + "num_tokens": 7435971024.0, + "step": 14546 + }, + { + "epoch": 3.93374797187669, + "grad_norm": 0.8578668832778931, + "learning_rate": 4.063672806036839e-06, + "loss": 1.867, + "mean_token_accuracy": 0.5763731002807617, + "num_tokens": 7436495266.0, + "step": 14547 + }, + { + "epoch": 3.9340183883180098, + "grad_norm": 0.8372223973274231, + "learning_rate": 4.062668382178135e-06, + "loss": 1.6544, + "mean_token_accuracy": 0.6058656573295593, + "num_tokens": 7437019228.0, + "step": 14548 + }, + { + "epoch": 3.9342888047593294, + "grad_norm": 1.0174833536148071, + "learning_rate": 4.061664171177169e-06, + "loss": 1.7356, + "mean_token_accuracy": 0.595071017742157, + "num_tokens": 7437531021.0, + "step": 14549 + }, + { + "epoch": 3.934559221200649, + "grad_norm": 1.0498216152191162, + "learning_rate": 4.060660173064749e-06, + "loss": 1.7659, + "mean_token_accuracy": 0.5825060606002808, + "num_tokens": 7438055156.0, + "step": 14550 + }, + { + "epoch": 3.9348296376419687, + "grad_norm": 0.7733542323112488, + "learning_rate": 4.05965638787168e-06, + "loss": 1.7987, + "mean_token_accuracy": 0.5669422745704651, + "num_tokens": 7438579389.0, + "step": 14551 + }, + { + "epoch": 3.9351000540832883, + "grad_norm": 0.9716911911964417, + "learning_rate": 4.058652815628768e-06, + "loss": 1.7951, + "mean_token_accuracy": 0.5940917730331421, + "num_tokens": 7439103628.0, + "step": 14552 + }, + { + "epoch": 3.935370470524608, + "grad_norm": 1.0700408220291138, + "learning_rate": 4.057649456366796e-06, + "loss": 1.8395, + "mean_token_accuracy": 0.5774379968643188, + "num_tokens": 7439602880.0, + "step": 14553 + }, + { + "epoch": 3.9356408869659276, + "grad_norm": 0.9207227230072021, + "learning_rate": 4.05664631011656e-06, + "loss": 1.8251, + "mean_token_accuracy": 0.5878491401672363, + "num_tokens": 7440073015.0, + "step": 14554 + }, + { + "epoch": 3.9359113034072473, + "grad_norm": 0.8322142958641052, + "learning_rate": 4.055643376908833e-06, + "loss": 1.8207, + "mean_token_accuracy": 0.5759735703468323, + "num_tokens": 7440597222.0, + "step": 14555 + }, + { + "epoch": 3.936181719848567, + "grad_norm": 1.0404974222183228, + "learning_rate": 4.0546406567743865e-06, + "loss": 1.9215, + "mean_token_accuracy": 0.5600782632827759, + "num_tokens": 7441096468.0, + "step": 14556 + }, + { + "epoch": 3.9364521362898865, + "grad_norm": 0.9190302491188049, + "learning_rate": 4.053638149743993e-06, + "loss": 1.7732, + "mean_token_accuracy": 0.5832866430282593, + "num_tokens": 7441620646.0, + "step": 14557 + }, + { + "epoch": 3.936722552731206, + "grad_norm": 0.762559711933136, + "learning_rate": 4.052635855848408e-06, + "loss": 1.7883, + "mean_token_accuracy": 0.5704180002212524, + "num_tokens": 7442144829.0, + "step": 14558 + }, + { + "epoch": 3.936992969172526, + "grad_norm": 0.877325177192688, + "learning_rate": 4.0516337751183836e-06, + "loss": 1.8639, + "mean_token_accuracy": 0.5836257338523865, + "num_tokens": 7442626744.0, + "step": 14559 + }, + { + "epoch": 3.9372633856138455, + "grad_norm": 0.9706202745437622, + "learning_rate": 4.050631907584671e-06, + "loss": 1.7571, + "mean_token_accuracy": 0.5852841138839722, + "num_tokens": 7443150742.0, + "step": 14560 + }, + { + "epoch": 3.9375338020551647, + "grad_norm": 0.3453885018825531, + "learning_rate": 4.049630253278009e-06, + "loss": 1.1315, + "mean_token_accuracy": 0.6956989765167236, + "num_tokens": 7443674944.0, + "step": 14561 + }, + { + "epoch": 3.9378042184964848, + "grad_norm": 1.2117959260940552, + "learning_rate": 4.048628812229126e-06, + "loss": 1.788, + "mean_token_accuracy": 0.6000350713729858, + "num_tokens": 7444153190.0, + "step": 14562 + }, + { + "epoch": 3.938074634937804, + "grad_norm": 1.140208125114441, + "learning_rate": 4.047627584468758e-06, + "loss": 1.8175, + "mean_token_accuracy": 0.5875991582870483, + "num_tokens": 7444637531.0, + "step": 14563 + }, + { + "epoch": 3.938345051379124, + "grad_norm": 1.0163829326629639, + "learning_rate": 4.046626570027618e-06, + "loss": 1.8662, + "mean_token_accuracy": 0.5640463829040527, + "num_tokens": 7445161806.0, + "step": 14564 + }, + { + "epoch": 3.9386154678204432, + "grad_norm": 0.9586434960365295, + "learning_rate": 4.045625768936428e-06, + "loss": 1.6235, + "mean_token_accuracy": 0.6341322660446167, + "num_tokens": 7445685908.0, + "step": 14565 + }, + { + "epoch": 3.9388858842617633, + "grad_norm": 0.88584965467453, + "learning_rate": 4.044625181225887e-06, + "loss": 1.8821, + "mean_token_accuracy": 0.5663449764251709, + "num_tokens": 7446210153.0, + "step": 14566 + }, + { + "epoch": 3.9391563007030825, + "grad_norm": 0.8043649196624756, + "learning_rate": 4.043624806926702e-06, + "loss": 1.8653, + "mean_token_accuracy": 0.571624219417572, + "num_tokens": 7446676293.0, + "step": 14567 + }, + { + "epoch": 3.9394267171444026, + "grad_norm": 0.9470646381378174, + "learning_rate": 4.0426246460695625e-06, + "loss": 1.8925, + "mean_token_accuracy": 0.5635236501693726, + "num_tokens": 7447200523.0, + "step": 14568 + }, + { + "epoch": 3.939697133585722, + "grad_norm": 1.1161805391311646, + "learning_rate": 4.041624698685162e-06, + "loss": 1.8924, + "mean_token_accuracy": 0.5799823999404907, + "num_tokens": 7447688966.0, + "step": 14569 + }, + { + "epoch": 3.939967550027042, + "grad_norm": 0.961540699005127, + "learning_rate": 4.040624964804179e-06, + "loss": 1.8056, + "mean_token_accuracy": 0.5972839593887329, + "num_tokens": 7448156056.0, + "step": 14570 + }, + { + "epoch": 3.940237966468361, + "grad_norm": 0.9321832060813904, + "learning_rate": 4.039625444457286e-06, + "loss": 1.8419, + "mean_token_accuracy": 0.5667922496795654, + "num_tokens": 7448680197.0, + "step": 14571 + }, + { + "epoch": 3.9405083829096808, + "grad_norm": 0.9759510159492493, + "learning_rate": 4.0386261376751564e-06, + "loss": 1.8053, + "mean_token_accuracy": 0.5772818326950073, + "num_tokens": 7449204402.0, + "step": 14572 + }, + { + "epoch": 3.9407787993510004, + "grad_norm": 0.9022737145423889, + "learning_rate": 4.037627044488449e-06, + "loss": 1.9261, + "mean_token_accuracy": 0.5672954320907593, + "num_tokens": 7449724542.0, + "step": 14573 + }, + { + "epoch": 3.94104921579232, + "grad_norm": 0.7752329707145691, + "learning_rate": 4.036628164927815e-06, + "loss": 1.8632, + "mean_token_accuracy": 0.5649257898330688, + "num_tokens": 7450248811.0, + "step": 14574 + }, + { + "epoch": 3.9413196322336397, + "grad_norm": 1.3705418109893799, + "learning_rate": 4.03562949902391e-06, + "loss": 1.7914, + "mean_token_accuracy": 0.6094510555267334, + "num_tokens": 7450726307.0, + "step": 14575 + }, + { + "epoch": 3.9415900486749593, + "grad_norm": 1.4750938415527344, + "learning_rate": 4.034631046807371e-06, + "loss": 1.8457, + "mean_token_accuracy": 0.5716730356216431, + "num_tokens": 7451250574.0, + "step": 14576 + }, + { + "epoch": 3.941860465116279, + "grad_norm": 1.3092848062515259, + "learning_rate": 4.033632808308837e-06, + "loss": 1.6381, + "mean_token_accuracy": 0.6227683424949646, + "num_tokens": 7451774755.0, + "step": 14577 + }, + { + "epoch": 3.9421308815575986, + "grad_norm": 1.267254114151001, + "learning_rate": 4.032634783558935e-06, + "loss": 1.7755, + "mean_token_accuracy": 0.5945709943771362, + "num_tokens": 7452283284.0, + "step": 14578 + }, + { + "epoch": 3.9424012979989183, + "grad_norm": 1.0536553859710693, + "learning_rate": 4.031636972588287e-06, + "loss": 1.7212, + "mean_token_accuracy": 0.5906071662902832, + "num_tokens": 7452807537.0, + "step": 14579 + }, + { + "epoch": 3.942671714440238, + "grad_norm": 0.9338876008987427, + "learning_rate": 4.030639375427508e-06, + "loss": 1.7172, + "mean_token_accuracy": 0.6040986180305481, + "num_tokens": 7453286894.0, + "step": 14580 + }, + { + "epoch": 3.9429421308815575, + "grad_norm": 0.3254764676094055, + "learning_rate": 4.02964199210721e-06, + "loss": 1.0816, + "mean_token_accuracy": 0.7037126421928406, + "num_tokens": 7453811127.0, + "step": 14581 + }, + { + "epoch": 3.943212547322877, + "grad_norm": 1.2637330293655396, + "learning_rate": 4.028644822657992e-06, + "loss": 1.9606, + "mean_token_accuracy": 0.5405303239822388, + "num_tokens": 7454335397.0, + "step": 14582 + }, + { + "epoch": 3.943482963764197, + "grad_norm": 1.2657606601715088, + "learning_rate": 4.027647867110456e-06, + "loss": 1.7843, + "mean_token_accuracy": 0.5927966833114624, + "num_tokens": 7454801380.0, + "step": 14583 + }, + { + "epoch": 3.9437533802055165, + "grad_norm": 1.221381664276123, + "learning_rate": 4.026651125495187e-06, + "loss": 1.7165, + "mean_token_accuracy": 0.6009715795516968, + "num_tokens": 7455325608.0, + "step": 14584 + }, + { + "epoch": 3.944023796646836, + "grad_norm": 1.0153635740280151, + "learning_rate": 4.025654597842766e-06, + "loss": 1.8553, + "mean_token_accuracy": 0.5375560522079468, + "num_tokens": 7455849786.0, + "step": 14585 + }, + { + "epoch": 3.9442942130881558, + "grad_norm": 1.0117297172546387, + "learning_rate": 4.024658284183775e-06, + "loss": 1.8331, + "mean_token_accuracy": 0.5838646292686462, + "num_tokens": 7456352622.0, + "step": 14586 + }, + { + "epoch": 3.9445646295294754, + "grad_norm": 1.1664942502975464, + "learning_rate": 4.023662184548781e-06, + "loss": 1.7347, + "mean_token_accuracy": 0.6113898158073425, + "num_tokens": 7456824832.0, + "step": 14587 + }, + { + "epoch": 3.944835045970795, + "grad_norm": 1.0408625602722168, + "learning_rate": 4.022666298968345e-06, + "loss": 1.8657, + "mean_token_accuracy": 0.570314884185791, + "num_tokens": 7457348970.0, + "step": 14588 + }, + { + "epoch": 3.9451054624121147, + "grad_norm": 0.9632489085197449, + "learning_rate": 4.0216706274730275e-06, + "loss": 1.8381, + "mean_token_accuracy": 0.5642908811569214, + "num_tokens": 7457873204.0, + "step": 14589 + }, + { + "epoch": 3.9453758788534343, + "grad_norm": 0.9551023244857788, + "learning_rate": 4.0206751700933784e-06, + "loss": 1.9058, + "mean_token_accuracy": 0.5647010207176208, + "num_tokens": 7458397341.0, + "step": 14590 + }, + { + "epoch": 3.945646295294754, + "grad_norm": 1.1159809827804565, + "learning_rate": 4.019679926859936e-06, + "loss": 1.8419, + "mean_token_accuracy": 0.5864031910896301, + "num_tokens": 7458862345.0, + "step": 14591 + }, + { + "epoch": 3.9459167117360736, + "grad_norm": 1.1972885131835938, + "learning_rate": 4.018684897803245e-06, + "loss": 1.937, + "mean_token_accuracy": 0.5588266849517822, + "num_tokens": 7459324786.0, + "step": 14592 + }, + { + "epoch": 3.9461871281773933, + "grad_norm": 1.005726933479309, + "learning_rate": 4.017690082953829e-06, + "loss": 1.7749, + "mean_token_accuracy": 0.5873563289642334, + "num_tokens": 7459849045.0, + "step": 14593 + }, + { + "epoch": 3.946457544618713, + "grad_norm": 0.8774203062057495, + "learning_rate": 4.016695482342215e-06, + "loss": 1.7274, + "mean_token_accuracy": 0.587945818901062, + "num_tokens": 7460373243.0, + "step": 14594 + }, + { + "epoch": 3.9467279610600325, + "grad_norm": 1.0039875507354736, + "learning_rate": 4.015701095998923e-06, + "loss": 1.8011, + "mean_token_accuracy": 0.5689683556556702, + "num_tokens": 7460897503.0, + "step": 14595 + }, + { + "epoch": 3.946998377501352, + "grad_norm": 0.9594221711158752, + "learning_rate": 4.014706923954461e-06, + "loss": 1.8304, + "mean_token_accuracy": 0.5811946392059326, + "num_tokens": 7461397152.0, + "step": 14596 + }, + { + "epoch": 3.947268793942672, + "grad_norm": 0.9414075613021851, + "learning_rate": 4.013712966239329e-06, + "loss": 1.858, + "mean_token_accuracy": 0.5628869533538818, + "num_tokens": 7461921426.0, + "step": 14597 + }, + { + "epoch": 3.9475392103839915, + "grad_norm": 0.8363479971885681, + "learning_rate": 4.012719222884032e-06, + "loss": 1.733, + "mean_token_accuracy": 0.6043967604637146, + "num_tokens": 7462440272.0, + "step": 14598 + }, + { + "epoch": 3.947809626825311, + "grad_norm": 1.0143646001815796, + "learning_rate": 4.011725693919055e-06, + "loss": 1.8749, + "mean_token_accuracy": 0.5421783924102783, + "num_tokens": 7462964478.0, + "step": 14599 + }, + { + "epoch": 3.9480800432666308, + "grad_norm": 0.9204209446907043, + "learning_rate": 4.010732379374888e-06, + "loss": 1.8658, + "mean_token_accuracy": 0.5758756399154663, + "num_tokens": 7463475904.0, + "step": 14600 + }, + { + "epoch": 3.9483504597079504, + "grad_norm": 0.35286080837249756, + "learning_rate": 4.009739279282005e-06, + "loss": 1.1253, + "mean_token_accuracy": 0.7046635150909424, + "num_tokens": 7464000104.0, + "step": 14601 + }, + { + "epoch": 3.9486208761492696, + "grad_norm": 1.020516037940979, + "learning_rate": 4.008746393670875e-06, + "loss": 1.8913, + "mean_token_accuracy": 0.5648835897445679, + "num_tokens": 7464524352.0, + "step": 14602 + }, + { + "epoch": 3.9488912925905897, + "grad_norm": 0.9521269202232361, + "learning_rate": 4.007753722571969e-06, + "loss": 1.8757, + "mean_token_accuracy": 0.5760116577148438, + "num_tokens": 7465048615.0, + "step": 14603 + }, + { + "epoch": 3.949161709031909, + "grad_norm": 0.9320822954177856, + "learning_rate": 4.006761266015741e-06, + "loss": 1.8697, + "mean_token_accuracy": 0.5632388591766357, + "num_tokens": 7465572825.0, + "step": 14604 + }, + { + "epoch": 3.949432125473229, + "grad_norm": 0.8257083296775818, + "learning_rate": 4.005769024032641e-06, + "loss": 1.701, + "mean_token_accuracy": 0.6079776287078857, + "num_tokens": 7466097039.0, + "step": 14605 + }, + { + "epoch": 3.949702541914548, + "grad_norm": 0.8189445734024048, + "learning_rate": 4.0047769966531185e-06, + "loss": 1.9197, + "mean_token_accuracy": 0.5572147965431213, + "num_tokens": 7466586748.0, + "step": 14606 + }, + { + "epoch": 3.9499729583558683, + "grad_norm": 0.9533587098121643, + "learning_rate": 4.003785183907607e-06, + "loss": 1.9289, + "mean_token_accuracy": 0.5683585405349731, + "num_tokens": 7467097381.0, + "step": 14607 + }, + { + "epoch": 3.9502433747971875, + "grad_norm": 0.7381550669670105, + "learning_rate": 4.002793585826542e-06, + "loss": 1.8406, + "mean_token_accuracy": 0.5762107968330383, + "num_tokens": 7467621490.0, + "step": 14608 + }, + { + "epoch": 3.9505137912385075, + "grad_norm": 0.9730543494224548, + "learning_rate": 4.001802202440347e-06, + "loss": 1.9621, + "mean_token_accuracy": 0.5493971109390259, + "num_tokens": 7468107614.0, + "step": 14609 + }, + { + "epoch": 3.9507842076798267, + "grad_norm": 1.028748869895935, + "learning_rate": 4.00081103377944e-06, + "loss": 2.014, + "mean_token_accuracy": 0.5300628542900085, + "num_tokens": 7468631770.0, + "step": 14610 + }, + { + "epoch": 3.951054624121147, + "grad_norm": 0.7779315114021301, + "learning_rate": 3.999820079874234e-06, + "loss": 1.8759, + "mean_token_accuracy": 0.5650185346603394, + "num_tokens": 7469156013.0, + "step": 14611 + }, + { + "epoch": 3.951325040562466, + "grad_norm": 0.916042149066925, + "learning_rate": 3.998829340755136e-06, + "loss": 1.8322, + "mean_token_accuracy": 0.5717124938964844, + "num_tokens": 7469680271.0, + "step": 14612 + }, + { + "epoch": 3.9515954570037857, + "grad_norm": 0.7917288541793823, + "learning_rate": 3.997838816452542e-06, + "loss": 1.8701, + "mean_token_accuracy": 0.5596200823783875, + "num_tokens": 7470204474.0, + "step": 14613 + }, + { + "epoch": 3.9518658734451053, + "grad_norm": 0.9971067309379578, + "learning_rate": 3.996848506996843e-06, + "loss": 1.7827, + "mean_token_accuracy": 0.5678694844245911, + "num_tokens": 7470694924.0, + "step": 14614 + }, + { + "epoch": 3.952136289886425, + "grad_norm": 0.8078184723854065, + "learning_rate": 3.99585841241843e-06, + "loss": 1.7433, + "mean_token_accuracy": 0.5967421531677246, + "num_tokens": 7471195892.0, + "step": 14615 + }, + { + "epoch": 3.9524067063277446, + "grad_norm": 0.8122430443763733, + "learning_rate": 3.994868532747678e-06, + "loss": 1.7777, + "mean_token_accuracy": 0.5771145820617676, + "num_tokens": 7471720085.0, + "step": 14616 + }, + { + "epoch": 3.9526771227690642, + "grad_norm": 0.865776538848877, + "learning_rate": 3.993878868014959e-06, + "loss": 1.8282, + "mean_token_accuracy": 0.5759820342063904, + "num_tokens": 7472244230.0, + "step": 14617 + }, + { + "epoch": 3.952947539210384, + "grad_norm": 0.9188961982727051, + "learning_rate": 3.992889418250641e-06, + "loss": 1.7719, + "mean_token_accuracy": 0.5935724973678589, + "num_tokens": 7472703386.0, + "step": 14618 + }, + { + "epoch": 3.9532179556517035, + "grad_norm": 0.8799095749855042, + "learning_rate": 3.991900183485082e-06, + "loss": 1.8778, + "mean_token_accuracy": 0.5791968107223511, + "num_tokens": 7473227378.0, + "step": 14619 + }, + { + "epoch": 3.953488372093023, + "grad_norm": 1.0064481496810913, + "learning_rate": 3.990911163748633e-06, + "loss": 1.8968, + "mean_token_accuracy": 0.557633638381958, + "num_tokens": 7473751433.0, + "step": 14620 + }, + { + "epoch": 3.953758788534343, + "grad_norm": 0.32677340507507324, + "learning_rate": 3.989922359071642e-06, + "loss": 1.0817, + "mean_token_accuracy": 0.7038599252700806, + "num_tokens": 7474275583.0, + "step": 14621 + }, + { + "epoch": 3.9540292049756625, + "grad_norm": 1.1324983835220337, + "learning_rate": 3.988933769484451e-06, + "loss": 1.9233, + "mean_token_accuracy": 0.5580699443817139, + "num_tokens": 7474799807.0, + "step": 14622 + }, + { + "epoch": 3.954299621416982, + "grad_norm": 0.9490441083908081, + "learning_rate": 3.987945395017387e-06, + "loss": 1.7956, + "mean_token_accuracy": 0.5778764486312866, + "num_tokens": 7475323788.0, + "step": 14623 + }, + { + "epoch": 3.9545700378583017, + "grad_norm": 0.8758891820907593, + "learning_rate": 3.986957235700783e-06, + "loss": 1.8306, + "mean_token_accuracy": 0.575614333152771, + "num_tokens": 7475842378.0, + "step": 14624 + }, + { + "epoch": 3.9548404542996214, + "grad_norm": 0.9268870949745178, + "learning_rate": 3.985969291564952e-06, + "loss": 1.7576, + "mean_token_accuracy": 0.5933767557144165, + "num_tokens": 7476366587.0, + "step": 14625 + }, + { + "epoch": 3.955110870740941, + "grad_norm": 1.0980316400527954, + "learning_rate": 3.984981562640211e-06, + "loss": 1.8821, + "mean_token_accuracy": 0.5821020603179932, + "num_tokens": 7476877061.0, + "step": 14626 + }, + { + "epoch": 3.9553812871822607, + "grad_norm": 0.9559672474861145, + "learning_rate": 3.983994048956868e-06, + "loss": 1.7377, + "mean_token_accuracy": 0.5836089849472046, + "num_tokens": 7477401246.0, + "step": 14627 + }, + { + "epoch": 3.9556517036235803, + "grad_norm": 0.9128623604774475, + "learning_rate": 3.9830067505452156e-06, + "loss": 1.8268, + "mean_token_accuracy": 0.5870180130004883, + "num_tokens": 7477925453.0, + "step": 14628 + }, + { + "epoch": 3.9559221200649, + "grad_norm": 0.8524625301361084, + "learning_rate": 3.982019667435556e-06, + "loss": 1.8077, + "mean_token_accuracy": 0.5821385383605957, + "num_tokens": 7478425903.0, + "step": 14629 + }, + { + "epoch": 3.9561925365062196, + "grad_norm": 1.0320634841918945, + "learning_rate": 3.98103279965817e-06, + "loss": 1.874, + "mean_token_accuracy": 0.5759603977203369, + "num_tokens": 7478950077.0, + "step": 14630 + }, + { + "epoch": 3.9564629529475392, + "grad_norm": 0.8613345623016357, + "learning_rate": 3.980046147243338e-06, + "loss": 1.8831, + "mean_token_accuracy": 0.5744765996932983, + "num_tokens": 7479474331.0, + "step": 14631 + }, + { + "epoch": 3.956733369388859, + "grad_norm": 1.1233611106872559, + "learning_rate": 3.979059710221336e-06, + "loss": 1.8901, + "mean_token_accuracy": 0.586740255355835, + "num_tokens": 7479877422.0, + "step": 14632 + }, + { + "epoch": 3.9570037858301785, + "grad_norm": 0.8815636038780212, + "learning_rate": 3.978073488622431e-06, + "loss": 1.771, + "mean_token_accuracy": 0.5812419652938843, + "num_tokens": 7480401640.0, + "step": 14633 + }, + { + "epoch": 3.957274202271498, + "grad_norm": 0.9119060635566711, + "learning_rate": 3.977087482476877e-06, + "loss": 1.7579, + "mean_token_accuracy": 0.5859074592590332, + "num_tokens": 7480874551.0, + "step": 14634 + }, + { + "epoch": 3.957544618712818, + "grad_norm": 0.7597196102142334, + "learning_rate": 3.976101691814935e-06, + "loss": 1.6694, + "mean_token_accuracy": 0.598273754119873, + "num_tokens": 7481398719.0, + "step": 14635 + }, + { + "epoch": 3.9578150351541375, + "grad_norm": 0.8809781670570374, + "learning_rate": 3.975116116666847e-06, + "loss": 1.7868, + "mean_token_accuracy": 0.5909286141395569, + "num_tokens": 7481922890.0, + "step": 14636 + }, + { + "epoch": 3.958085451595457, + "grad_norm": 0.9061487317085266, + "learning_rate": 3.974130757062855e-06, + "loss": 1.67, + "mean_token_accuracy": 0.5863072872161865, + "num_tokens": 7482447124.0, + "step": 14637 + }, + { + "epoch": 3.9583558680367767, + "grad_norm": 0.9028627872467041, + "learning_rate": 3.973145613033197e-06, + "loss": 1.901, + "mean_token_accuracy": 0.5624873638153076, + "num_tokens": 7482969062.0, + "step": 14638 + }, + { + "epoch": 3.9586262844780964, + "grad_norm": 0.9645321369171143, + "learning_rate": 3.972160684608094e-06, + "loss": 1.9198, + "mean_token_accuracy": 0.5257786512374878, + "num_tokens": 7483493278.0, + "step": 14639 + }, + { + "epoch": 3.958896700919416, + "grad_norm": 0.7870923280715942, + "learning_rate": 3.9711759718177675e-06, + "loss": 1.6736, + "mean_token_accuracy": 0.6149765253067017, + "num_tokens": 7483982554.0, + "step": 14640 + }, + { + "epoch": 3.9591671173607357, + "grad_norm": 0.40529507398605347, + "learning_rate": 3.970191474692435e-06, + "loss": 1.0963, + "mean_token_accuracy": 0.7025645971298218, + "num_tokens": 7484506769.0, + "step": 14641 + }, + { + "epoch": 3.9594375338020553, + "grad_norm": 1.0292303562164307, + "learning_rate": 3.969207193262301e-06, + "loss": 1.8015, + "mean_token_accuracy": 0.56245356798172, + "num_tokens": 7485030864.0, + "step": 14642 + }, + { + "epoch": 3.959707950243375, + "grad_norm": 1.1358484029769897, + "learning_rate": 3.968223127557565e-06, + "loss": 1.8453, + "mean_token_accuracy": 0.565804123878479, + "num_tokens": 7485555090.0, + "step": 14643 + }, + { + "epoch": 3.9599783666846946, + "grad_norm": 1.0895535945892334, + "learning_rate": 3.967239277608424e-06, + "loss": 1.81, + "mean_token_accuracy": 0.5959153771400452, + "num_tokens": 7486019457.0, + "step": 14644 + }, + { + "epoch": 3.960248783126014, + "grad_norm": 0.7981734871864319, + "learning_rate": 3.966255643445063e-06, + "loss": 1.8494, + "mean_token_accuracy": 0.5595099329948425, + "num_tokens": 7486543625.0, + "step": 14645 + }, + { + "epoch": 3.960519199567334, + "grad_norm": 0.7200589776039124, + "learning_rate": 3.965272225097665e-06, + "loss": 1.8353, + "mean_token_accuracy": 0.5651256442070007, + "num_tokens": 7487067901.0, + "step": 14646 + }, + { + "epoch": 3.960789616008653, + "grad_norm": 1.0048854351043701, + "learning_rate": 3.964289022596402e-06, + "loss": 1.8874, + "mean_token_accuracy": 0.5741192102432251, + "num_tokens": 7487592055.0, + "step": 14647 + }, + { + "epoch": 3.961060032449973, + "grad_norm": 1.2062609195709229, + "learning_rate": 3.9633060359714415e-06, + "loss": 1.9218, + "mean_token_accuracy": 0.5562164783477783, + "num_tokens": 7488116308.0, + "step": 14648 + }, + { + "epoch": 3.9613304488912924, + "grad_norm": 1.0061980485916138, + "learning_rate": 3.962323265252948e-06, + "loss": 1.8759, + "mean_token_accuracy": 0.5543426871299744, + "num_tokens": 7488640423.0, + "step": 14649 + }, + { + "epoch": 3.9616008653326125, + "grad_norm": 0.8209851384162903, + "learning_rate": 3.96134071047107e-06, + "loss": 1.7558, + "mean_token_accuracy": 0.5998885035514832, + "num_tokens": 7489147690.0, + "step": 14650 + }, + { + "epoch": 3.9618712817739317, + "grad_norm": 0.7565949559211731, + "learning_rate": 3.960358371655958e-06, + "loss": 1.8863, + "mean_token_accuracy": 0.5719704627990723, + "num_tokens": 7489671935.0, + "step": 14651 + }, + { + "epoch": 3.9621416982152518, + "grad_norm": 1.018922209739685, + "learning_rate": 3.959376248837756e-06, + "loss": 1.6564, + "mean_token_accuracy": 0.6120548248291016, + "num_tokens": 7490196216.0, + "step": 14652 + }, + { + "epoch": 3.962412114656571, + "grad_norm": 1.1238011121749878, + "learning_rate": 3.958394342046596e-06, + "loss": 1.8491, + "mean_token_accuracy": 0.5854278802871704, + "num_tokens": 7490680801.0, + "step": 14653 + }, + { + "epoch": 3.9626825310978906, + "grad_norm": 0.8725765347480774, + "learning_rate": 3.957412651312603e-06, + "loss": 1.9261, + "mean_token_accuracy": 0.5580723285675049, + "num_tokens": 7491205088.0, + "step": 14654 + }, + { + "epoch": 3.9629529475392102, + "grad_norm": 1.082574486732483, + "learning_rate": 3.956431176665905e-06, + "loss": 1.8823, + "mean_token_accuracy": 0.557026743888855, + "num_tokens": 7491675176.0, + "step": 14655 + }, + { + "epoch": 3.96322336398053, + "grad_norm": 1.0193877220153809, + "learning_rate": 3.95544991813661e-06, + "loss": 1.8444, + "mean_token_accuracy": 0.5654861927032471, + "num_tokens": 7492199438.0, + "step": 14656 + }, + { + "epoch": 3.9634937804218495, + "grad_norm": 0.9109620451927185, + "learning_rate": 3.9544688757548265e-06, + "loss": 1.7937, + "mean_token_accuracy": 0.5664360523223877, + "num_tokens": 7492675697.0, + "step": 14657 + }, + { + "epoch": 3.963764196863169, + "grad_norm": 0.8742013573646545, + "learning_rate": 3.953488049550659e-06, + "loss": 1.8909, + "mean_token_accuracy": 0.5739166736602783, + "num_tokens": 7493165122.0, + "step": 14658 + }, + { + "epoch": 3.964034613304489, + "grad_norm": 0.9987239837646484, + "learning_rate": 3.952507439554203e-06, + "loss": 1.8148, + "mean_token_accuracy": 0.5897904634475708, + "num_tokens": 7493689294.0, + "step": 14659 + }, + { + "epoch": 3.9643050297458085, + "grad_norm": 31.861635208129883, + "learning_rate": 3.951527045795539e-06, + "loss": 1.506, + "mean_token_accuracy": 0.6272701025009155, + "num_tokens": 7494201698.0, + "step": 14660 + }, + { + "epoch": 3.964575446187128, + "grad_norm": 0.379507839679718, + "learning_rate": 3.950546868304757e-06, + "loss": 1.0581, + "mean_token_accuracy": 0.7213752865791321, + "num_tokens": 7494666567.0, + "step": 14661 + }, + { + "epoch": 3.9648458626284477, + "grad_norm": 0.9740511178970337, + "learning_rate": 3.949566907111929e-06, + "loss": 1.8246, + "mean_token_accuracy": 0.5860577821731567, + "num_tokens": 7495190844.0, + "step": 14662 + }, + { + "epoch": 3.9651162790697674, + "grad_norm": 1.014070749282837, + "learning_rate": 3.948587162247118e-06, + "loss": 1.8558, + "mean_token_accuracy": 0.560767412185669, + "num_tokens": 7495715089.0, + "step": 14663 + }, + { + "epoch": 3.965386695511087, + "grad_norm": 0.9932840466499329, + "learning_rate": 3.947607633740391e-06, + "loss": 1.8588, + "mean_token_accuracy": 0.5770958662033081, + "num_tokens": 7496239297.0, + "step": 14664 + }, + { + "epoch": 3.9656571119524067, + "grad_norm": 0.8977603316307068, + "learning_rate": 3.946628321621804e-06, + "loss": 1.8347, + "mean_token_accuracy": 0.5643594264984131, + "num_tokens": 7496763378.0, + "step": 14665 + }, + { + "epoch": 3.9659275283937263, + "grad_norm": 0.8682811260223389, + "learning_rate": 3.9456492259214e-06, + "loss": 1.7619, + "mean_token_accuracy": 0.6151359677314758, + "num_tokens": 7497287588.0, + "step": 14666 + }, + { + "epoch": 3.966197944835046, + "grad_norm": 0.944667637348175, + "learning_rate": 3.944670346669226e-06, + "loss": 1.9126, + "mean_token_accuracy": 0.5583937168121338, + "num_tokens": 7497811860.0, + "step": 14667 + }, + { + "epoch": 3.9664683612763656, + "grad_norm": 0.9482016563415527, + "learning_rate": 3.94369168389531e-06, + "loss": 1.7583, + "mean_token_accuracy": 0.5676490664482117, + "num_tokens": 7498336072.0, + "step": 14668 + }, + { + "epoch": 3.9667387777176852, + "grad_norm": 0.8776809573173523, + "learning_rate": 3.942713237629689e-06, + "loss": 1.7972, + "mean_token_accuracy": 0.5687429904937744, + "num_tokens": 7498844475.0, + "step": 14669 + }, + { + "epoch": 3.967009194159005, + "grad_norm": 0.9281471967697144, + "learning_rate": 3.941735007902379e-06, + "loss": 1.7829, + "mean_token_accuracy": 0.589012622833252, + "num_tokens": 7499307315.0, + "step": 14670 + }, + { + "epoch": 3.9672796106003245, + "grad_norm": 0.8372501730918884, + "learning_rate": 3.9407569947433935e-06, + "loss": 1.7879, + "mean_token_accuracy": 0.558411717414856, + "num_tokens": 7499831398.0, + "step": 14671 + }, + { + "epoch": 3.967550027041644, + "grad_norm": 0.9176650643348694, + "learning_rate": 3.939779198182746e-06, + "loss": 1.8958, + "mean_token_accuracy": 0.5779212713241577, + "num_tokens": 7500355441.0, + "step": 14672 + }, + { + "epoch": 3.967820443482964, + "grad_norm": 0.9074445962905884, + "learning_rate": 3.9388016182504355e-06, + "loss": 1.7551, + "mean_token_accuracy": 0.5722601413726807, + "num_tokens": 7500879472.0, + "step": 14673 + }, + { + "epoch": 3.9680908599242835, + "grad_norm": 0.8488255143165588, + "learning_rate": 3.937824254976454e-06, + "loss": 1.7089, + "mean_token_accuracy": 0.6017024517059326, + "num_tokens": 7501403749.0, + "step": 14674 + }, + { + "epoch": 3.968361276365603, + "grad_norm": 0.9976122379302979, + "learning_rate": 3.936847108390797e-06, + "loss": 1.779, + "mean_token_accuracy": 0.5911967754364014, + "num_tokens": 7501927930.0, + "step": 14675 + }, + { + "epoch": 3.9686316928069227, + "grad_norm": 0.9251195788383484, + "learning_rate": 3.93587017852344e-06, + "loss": 1.7286, + "mean_token_accuracy": 0.5855460166931152, + "num_tokens": 7502452085.0, + "step": 14676 + }, + { + "epoch": 3.9689021092482424, + "grad_norm": 0.9639459252357483, + "learning_rate": 3.934893465404359e-06, + "loss": 1.818, + "mean_token_accuracy": 0.6068413257598877, + "num_tokens": 7502911809.0, + "step": 14677 + }, + { + "epoch": 3.969172525689562, + "grad_norm": 0.935175895690918, + "learning_rate": 3.9339169690635255e-06, + "loss": 1.813, + "mean_token_accuracy": 0.5846613049507141, + "num_tokens": 7503436063.0, + "step": 14678 + }, + { + "epoch": 3.9694429421308817, + "grad_norm": 0.8179749250411987, + "learning_rate": 3.932940689530897e-06, + "loss": 1.8112, + "mean_token_accuracy": 0.5837228298187256, + "num_tokens": 7503933955.0, + "step": 14679 + }, + { + "epoch": 3.9697133585722013, + "grad_norm": 0.9398632645606995, + "learning_rate": 3.931964626836431e-06, + "loss": 1.8485, + "mean_token_accuracy": 0.5517390966415405, + "num_tokens": 7504458176.0, + "step": 14680 + }, + { + "epoch": 3.969983775013521, + "grad_norm": 0.36400941014289856, + "learning_rate": 3.930988781010078e-06, + "loss": 1.0724, + "mean_token_accuracy": 0.7123288512229919, + "num_tokens": 7504982442.0, + "step": 14681 + }, + { + "epoch": 3.9702541914548406, + "grad_norm": 1.1599057912826538, + "learning_rate": 3.930013152081778e-06, + "loss": 1.9517, + "mean_token_accuracy": 0.550010085105896, + "num_tokens": 7505452495.0, + "step": 14682 + }, + { + "epoch": 3.9705246078961602, + "grad_norm": 1.0534650087356567, + "learning_rate": 3.9290377400814635e-06, + "loss": 1.8258, + "mean_token_accuracy": 0.5812010169029236, + "num_tokens": 7505938787.0, + "step": 14683 + }, + { + "epoch": 3.97079502433748, + "grad_norm": 1.0521131753921509, + "learning_rate": 3.928062545039069e-06, + "loss": 1.9662, + "mean_token_accuracy": 0.5449315309524536, + "num_tokens": 7506463066.0, + "step": 14684 + }, + { + "epoch": 3.9710654407787995, + "grad_norm": 0.8224487900733948, + "learning_rate": 3.927087566984512e-06, + "loss": 1.8707, + "mean_token_accuracy": 0.5569682121276855, + "num_tokens": 7506987194.0, + "step": 14685 + }, + { + "epoch": 3.9713358572201187, + "grad_norm": 0.9364157915115356, + "learning_rate": 3.926112805947706e-06, + "loss": 1.8365, + "mean_token_accuracy": 0.579738974571228, + "num_tokens": 7507483858.0, + "step": 14686 + }, + { + "epoch": 3.971606273661439, + "grad_norm": 0.8278475403785706, + "learning_rate": 3.925138261958565e-06, + "loss": 1.7469, + "mean_token_accuracy": 0.592315673828125, + "num_tokens": 7508008053.0, + "step": 14687 + }, + { + "epoch": 3.971876690102758, + "grad_norm": 0.8181443810462952, + "learning_rate": 3.924163935046987e-06, + "loss": 1.8828, + "mean_token_accuracy": 0.5624135732650757, + "num_tokens": 7508532330.0, + "step": 14688 + }, + { + "epoch": 3.972147106544078, + "grad_norm": 0.9049620628356934, + "learning_rate": 3.923189825242867e-06, + "loss": 1.8361, + "mean_token_accuracy": 0.5845601558685303, + "num_tokens": 7509034743.0, + "step": 14689 + }, + { + "epoch": 3.9724175229853973, + "grad_norm": 1.0006686449050903, + "learning_rate": 3.922215932576095e-06, + "loss": 1.7897, + "mean_token_accuracy": 0.5871845483779907, + "num_tokens": 7509445138.0, + "step": 14690 + }, + { + "epoch": 3.9726879394267174, + "grad_norm": 0.9490581750869751, + "learning_rate": 3.921242257076553e-06, + "loss": 1.8068, + "mean_token_accuracy": 0.5994741320610046, + "num_tokens": 7509948773.0, + "step": 14691 + }, + { + "epoch": 3.9729583558680366, + "grad_norm": 0.9665541052818298, + "learning_rate": 3.920268798774117e-06, + "loss": 1.8578, + "mean_token_accuracy": 0.5730039477348328, + "num_tokens": 7510473005.0, + "step": 14692 + }, + { + "epoch": 3.9732287723093567, + "grad_norm": 0.7992364764213562, + "learning_rate": 3.919295557698651e-06, + "loss": 1.5199, + "mean_token_accuracy": 0.6369048357009888, + "num_tokens": 7510985240.0, + "step": 14693 + }, + { + "epoch": 3.973499188750676, + "grad_norm": 0.964337944984436, + "learning_rate": 3.918322533880021e-06, + "loss": 1.789, + "mean_token_accuracy": 0.579914391040802, + "num_tokens": 7511472722.0, + "step": 14694 + }, + { + "epoch": 3.9737696051919955, + "grad_norm": 0.9264882802963257, + "learning_rate": 3.917349727348084e-06, + "loss": 1.8097, + "mean_token_accuracy": 0.5750324726104736, + "num_tokens": 7511996953.0, + "step": 14695 + }, + { + "epoch": 3.974040021633315, + "grad_norm": 0.8894128203392029, + "learning_rate": 3.916377138132686e-06, + "loss": 1.8054, + "mean_token_accuracy": 0.5890079736709595, + "num_tokens": 7512521050.0, + "step": 14696 + }, + { + "epoch": 3.974310438074635, + "grad_norm": 0.9125323295593262, + "learning_rate": 3.915404766263667e-06, + "loss": 1.9386, + "mean_token_accuracy": 0.5612657070159912, + "num_tokens": 7513045302.0, + "step": 14697 + }, + { + "epoch": 3.9745808545159544, + "grad_norm": 1.0114575624465942, + "learning_rate": 3.914432611770866e-06, + "loss": 1.8123, + "mean_token_accuracy": 0.581488847732544, + "num_tokens": 7513511226.0, + "step": 14698 + }, + { + "epoch": 3.974851270957274, + "grad_norm": 0.8098040223121643, + "learning_rate": 3.9134606746841106e-06, + "loss": 1.5469, + "mean_token_accuracy": 0.6491378545761108, + "num_tokens": 7514035375.0, + "step": 14699 + }, + { + "epoch": 3.9751216873985937, + "grad_norm": 0.8561447262763977, + "learning_rate": 3.912488955033218e-06, + "loss": 1.835, + "mean_token_accuracy": 0.5664049983024597, + "num_tokens": 7514559653.0, + "step": 14700 + }, + { + "epoch": 3.9753921038399134, + "grad_norm": 0.3428805470466614, + "learning_rate": 3.911517452848011e-06, + "loss": 1.1934, + "mean_token_accuracy": 0.6843976378440857, + "num_tokens": 7515083927.0, + "step": 14701 + }, + { + "epoch": 3.975662520281233, + "grad_norm": 1.0363471508026123, + "learning_rate": 3.910546168158294e-06, + "loss": 1.8116, + "mean_token_accuracy": 0.5938740968704224, + "num_tokens": 7515608212.0, + "step": 14702 + }, + { + "epoch": 3.9759329367225527, + "grad_norm": 0.9769095778465271, + "learning_rate": 3.909575100993867e-06, + "loss": 1.8711, + "mean_token_accuracy": 0.57525235414505, + "num_tokens": 7516132383.0, + "step": 14703 + }, + { + "epoch": 3.9762033531638723, + "grad_norm": 0.9596765637397766, + "learning_rate": 3.90860425138453e-06, + "loss": 1.811, + "mean_token_accuracy": 0.5779243111610413, + "num_tokens": 7516626838.0, + "step": 14704 + }, + { + "epoch": 3.976473769605192, + "grad_norm": 0.8598727583885193, + "learning_rate": 3.907633619360068e-06, + "loss": 1.8623, + "mean_token_accuracy": 0.581135630607605, + "num_tokens": 7517151108.0, + "step": 14705 + }, + { + "epoch": 3.9767441860465116, + "grad_norm": 0.8730841279029846, + "learning_rate": 3.906663204950263e-06, + "loss": 1.8447, + "mean_token_accuracy": 0.5753692388534546, + "num_tokens": 7517633371.0, + "step": 14706 + }, + { + "epoch": 3.9770146024878312, + "grad_norm": 0.8586779832839966, + "learning_rate": 3.90569300818489e-06, + "loss": 1.8577, + "mean_token_accuracy": 0.5771682262420654, + "num_tokens": 7518157538.0, + "step": 14707 + }, + { + "epoch": 3.977285018929151, + "grad_norm": 0.9199298620223999, + "learning_rate": 3.904723029093721e-06, + "loss": 1.8162, + "mean_token_accuracy": 0.590607225894928, + "num_tokens": 7518662445.0, + "step": 14708 + }, + { + "epoch": 3.9775554353704705, + "grad_norm": 0.8909275531768799, + "learning_rate": 3.903753267706513e-06, + "loss": 1.9525, + "mean_token_accuracy": 0.5598540306091309, + "num_tokens": 7519186641.0, + "step": 14709 + }, + { + "epoch": 3.97782585181179, + "grad_norm": 1.0242080688476562, + "learning_rate": 3.902783724053026e-06, + "loss": 1.8011, + "mean_token_accuracy": 0.59747314453125, + "num_tokens": 7519710812.0, + "step": 14710 + }, + { + "epoch": 3.97809626825311, + "grad_norm": 0.9113893508911133, + "learning_rate": 3.901814398163006e-06, + "loss": 1.8636, + "mean_token_accuracy": 0.5785414576530457, + "num_tokens": 7520234984.0, + "step": 14711 + }, + { + "epoch": 3.9783666846944294, + "grad_norm": 0.9570372104644775, + "learning_rate": 3.900845290066192e-06, + "loss": 1.7894, + "mean_token_accuracy": 0.5765226483345032, + "num_tokens": 7520759205.0, + "step": 14712 + }, + { + "epoch": 3.978637101135749, + "grad_norm": 1.003004789352417, + "learning_rate": 3.899876399792324e-06, + "loss": 1.9333, + "mean_token_accuracy": 0.5713956356048584, + "num_tokens": 7521261681.0, + "step": 14713 + }, + { + "epoch": 3.9789075175770687, + "grad_norm": 0.8573985695838928, + "learning_rate": 3.8989077273711265e-06, + "loss": 1.8633, + "mean_token_accuracy": 0.5828783512115479, + "num_tokens": 7521741269.0, + "step": 14714 + }, + { + "epoch": 3.9791779340183884, + "grad_norm": 1.0122352838516235, + "learning_rate": 3.897939272832324e-06, + "loss": 1.8477, + "mean_token_accuracy": 0.5789482593536377, + "num_tokens": 7522231566.0, + "step": 14715 + }, + { + "epoch": 3.979448350459708, + "grad_norm": 0.8088287711143494, + "learning_rate": 3.896971036205632e-06, + "loss": 1.8354, + "mean_token_accuracy": 0.5876331329345703, + "num_tokens": 7522755846.0, + "step": 14716 + }, + { + "epoch": 3.9797187669010277, + "grad_norm": 0.7948700189590454, + "learning_rate": 3.896003017520754e-06, + "loss": 1.8335, + "mean_token_accuracy": 0.5816540718078613, + "num_tokens": 7523280100.0, + "step": 14717 + }, + { + "epoch": 3.9799891833423473, + "grad_norm": 0.8326187133789062, + "learning_rate": 3.895035216807397e-06, + "loss": 1.9187, + "mean_token_accuracy": 0.5620784759521484, + "num_tokens": 7523804168.0, + "step": 14718 + }, + { + "epoch": 3.980259599783667, + "grad_norm": 0.9021663069725037, + "learning_rate": 3.894067634095255e-06, + "loss": 1.7445, + "mean_token_accuracy": 0.6040543913841248, + "num_tokens": 7524328231.0, + "step": 14719 + }, + { + "epoch": 3.9805300162249866, + "grad_norm": 0.8780388832092285, + "learning_rate": 3.893100269414012e-06, + "loss": 1.8016, + "mean_token_accuracy": 0.56863933801651, + "num_tokens": 7524852473.0, + "step": 14720 + }, + { + "epoch": 3.9808004326663062, + "grad_norm": 0.3457161784172058, + "learning_rate": 3.892133122793352e-06, + "loss": 1.2027, + "mean_token_accuracy": 0.6734994649887085, + "num_tokens": 7525376646.0, + "step": 14721 + }, + { + "epoch": 3.981070849107626, + "grad_norm": 0.9592200517654419, + "learning_rate": 3.891166194262954e-06, + "loss": 1.8637, + "mean_token_accuracy": 0.5986591577529907, + "num_tokens": 7525839260.0, + "step": 14722 + }, + { + "epoch": 3.9813412655489455, + "grad_norm": 0.879084050655365, + "learning_rate": 3.890199483852481e-06, + "loss": 1.7725, + "mean_token_accuracy": 0.592768669128418, + "num_tokens": 7526363398.0, + "step": 14723 + }, + { + "epoch": 3.981611681990265, + "grad_norm": 0.8406228423118591, + "learning_rate": 3.889232991591597e-06, + "loss": 1.8354, + "mean_token_accuracy": 0.5862728357315063, + "num_tokens": 7526887452.0, + "step": 14724 + }, + { + "epoch": 3.981882098431585, + "grad_norm": 0.8146155476570129, + "learning_rate": 3.888266717509958e-06, + "loss": 1.8193, + "mean_token_accuracy": 0.5733282566070557, + "num_tokens": 7527411643.0, + "step": 14725 + }, + { + "epoch": 3.9821525148729044, + "grad_norm": 0.8301706910133362, + "learning_rate": 3.8873006616372074e-06, + "loss": 1.7952, + "mean_token_accuracy": 0.5888005495071411, + "num_tokens": 7527935892.0, + "step": 14726 + }, + { + "epoch": 3.9824229313142236, + "grad_norm": 0.9136403799057007, + "learning_rate": 3.8863348240029905e-06, + "loss": 1.7553, + "mean_token_accuracy": 0.6027281880378723, + "num_tokens": 7528398248.0, + "step": 14727 + }, + { + "epoch": 3.9826933477555437, + "grad_norm": 0.8627230525016785, + "learning_rate": 3.885369204636943e-06, + "loss": 1.8781, + "mean_token_accuracy": 0.5457891821861267, + "num_tokens": 7528922496.0, + "step": 14728 + }, + { + "epoch": 3.982963764196863, + "grad_norm": 0.7426548004150391, + "learning_rate": 3.8844038035686886e-06, + "loss": 1.7794, + "mean_token_accuracy": 0.567954957485199, + "num_tokens": 7529446641.0, + "step": 14729 + }, + { + "epoch": 3.983234180638183, + "grad_norm": 0.821426272392273, + "learning_rate": 3.883438620827853e-06, + "loss": 1.8067, + "mean_token_accuracy": 0.5839545726776123, + "num_tokens": 7529967118.0, + "step": 14730 + }, + { + "epoch": 3.983504597079502, + "grad_norm": 0.8141363859176636, + "learning_rate": 3.882473656444049e-06, + "loss": 1.8352, + "mean_token_accuracy": 0.5821455717086792, + "num_tokens": 7530491403.0, + "step": 14731 + }, + { + "epoch": 3.9837750135208223, + "grad_norm": 0.8183072209358215, + "learning_rate": 3.881508910446883e-06, + "loss": 1.8045, + "mean_token_accuracy": 0.574579119682312, + "num_tokens": 7531012766.0, + "step": 14732 + }, + { + "epoch": 3.9840454299621415, + "grad_norm": 0.9451077580451965, + "learning_rate": 3.880544382865958e-06, + "loss": 1.7017, + "mean_token_accuracy": 0.5827184915542603, + "num_tokens": 7531536998.0, + "step": 14733 + }, + { + "epoch": 3.9843158464034616, + "grad_norm": 0.7837788462638855, + "learning_rate": 3.879580073730868e-06, + "loss": 1.7997, + "mean_token_accuracy": 0.5909048318862915, + "num_tokens": 7532061184.0, + "step": 14734 + }, + { + "epoch": 3.984586262844781, + "grad_norm": 0.8427572846412659, + "learning_rate": 3.878615983071203e-06, + "loss": 1.8324, + "mean_token_accuracy": 0.5754767656326294, + "num_tokens": 7532585338.0, + "step": 14735 + }, + { + "epoch": 3.9848566792861004, + "grad_norm": 0.9034582376480103, + "learning_rate": 3.877652110916539e-06, + "loss": 1.8342, + "mean_token_accuracy": 0.5619720816612244, + "num_tokens": 7533109622.0, + "step": 14736 + }, + { + "epoch": 3.98512709572742, + "grad_norm": 0.7582426071166992, + "learning_rate": 3.8766884572964535e-06, + "loss": 1.831, + "mean_token_accuracy": 0.5798317790031433, + "num_tokens": 7533593354.0, + "step": 14737 + }, + { + "epoch": 3.9853975121687397, + "grad_norm": 0.8280890583992004, + "learning_rate": 3.875725022240518e-06, + "loss": 1.7267, + "mean_token_accuracy": 0.5739866495132446, + "num_tokens": 7534117488.0, + "step": 14738 + }, + { + "epoch": 3.9856679286100594, + "grad_norm": 1.007086992263794, + "learning_rate": 3.87476180577829e-06, + "loss": 1.7668, + "mean_token_accuracy": 0.5802228450775146, + "num_tokens": 7534640594.0, + "step": 14739 + }, + { + "epoch": 3.985938345051379, + "grad_norm": 1.0204887390136719, + "learning_rate": 3.873798807939321e-06, + "loss": 1.8369, + "mean_token_accuracy": 0.5703935623168945, + "num_tokens": 7535164781.0, + "step": 14740 + }, + { + "epoch": 3.9862087614926986, + "grad_norm": 0.368456095457077, + "learning_rate": 3.872836028753163e-06, + "loss": 1.1852, + "mean_token_accuracy": 0.6927273273468018, + "num_tokens": 7535626936.0, + "step": 14741 + }, + { + "epoch": 3.9864791779340183, + "grad_norm": 0.9864118099212646, + "learning_rate": 3.871873468249356e-06, + "loss": 1.9019, + "mean_token_accuracy": 0.5610952377319336, + "num_tokens": 7536151163.0, + "step": 14742 + }, + { + "epoch": 3.986749594375338, + "grad_norm": 0.7777770757675171, + "learning_rate": 3.870911126457432e-06, + "loss": 1.4637, + "mean_token_accuracy": 0.6490217447280884, + "num_tokens": 7536657529.0, + "step": 14743 + }, + { + "epoch": 3.9870200108166576, + "grad_norm": 0.9894959330558777, + "learning_rate": 3.8699490034069205e-06, + "loss": 1.9068, + "mean_token_accuracy": 0.5674247741699219, + "num_tokens": 7537151571.0, + "step": 14744 + }, + { + "epoch": 3.987290427257977, + "grad_norm": 0.888797402381897, + "learning_rate": 3.8689870991273425e-06, + "loss": 1.8911, + "mean_token_accuracy": 0.5544127225875854, + "num_tokens": 7537675808.0, + "step": 14745 + }, + { + "epoch": 3.987560843699297, + "grad_norm": 0.9885455965995789, + "learning_rate": 3.868025413648209e-06, + "loss": 1.8157, + "mean_token_accuracy": 0.5803817510604858, + "num_tokens": 7538200033.0, + "step": 14746 + }, + { + "epoch": 3.9878312601406165, + "grad_norm": 0.9039497375488281, + "learning_rate": 3.867063946999031e-06, + "loss": 1.8631, + "mean_token_accuracy": 0.565669059753418, + "num_tokens": 7538724134.0, + "step": 14747 + }, + { + "epoch": 3.988101676581936, + "grad_norm": 0.8625496625900269, + "learning_rate": 3.86610269920931e-06, + "loss": 1.8043, + "mean_token_accuracy": 0.5804003477096558, + "num_tokens": 7539248302.0, + "step": 14748 + }, + { + "epoch": 3.988372093023256, + "grad_norm": 1.0154401063919067, + "learning_rate": 3.865141670308533e-06, + "loss": 1.8872, + "mean_token_accuracy": 0.5971728563308716, + "num_tokens": 7539709367.0, + "step": 14749 + }, + { + "epoch": 3.9886425094645754, + "grad_norm": 0.9962251782417297, + "learning_rate": 3.864180860326192e-06, + "loss": 1.7276, + "mean_token_accuracy": 0.5810402631759644, + "num_tokens": 7540223201.0, + "step": 14750 + }, + { + "epoch": 3.988912925905895, + "grad_norm": 1.2538387775421143, + "learning_rate": 3.863220269291771e-06, + "loss": 1.928, + "mean_token_accuracy": 0.5677618980407715, + "num_tokens": 7540692448.0, + "step": 14751 + }, + { + "epoch": 3.9891833423472147, + "grad_norm": 1.0473854541778564, + "learning_rate": 3.862259897234736e-06, + "loss": 1.9531, + "mean_token_accuracy": 0.5654655694961548, + "num_tokens": 7541172662.0, + "step": 14752 + }, + { + "epoch": 3.9894537587885344, + "grad_norm": 0.9541749358177185, + "learning_rate": 3.861299744184562e-06, + "loss": 1.8517, + "mean_token_accuracy": 0.5649815797805786, + "num_tokens": 7541696861.0, + "step": 14753 + }, + { + "epoch": 3.989724175229854, + "grad_norm": 0.9054892659187317, + "learning_rate": 3.8603398101707046e-06, + "loss": 1.694, + "mean_token_accuracy": 0.594679057598114, + "num_tokens": 7542211197.0, + "step": 14754 + }, + { + "epoch": 3.9899945916711737, + "grad_norm": 1.101618766784668, + "learning_rate": 3.8593800952226165e-06, + "loss": 1.8922, + "mean_token_accuracy": 0.5666630268096924, + "num_tokens": 7542735335.0, + "step": 14755 + }, + { + "epoch": 3.9902650081124933, + "grad_norm": 1.107343316078186, + "learning_rate": 3.858420599369747e-06, + "loss": 1.7883, + "mean_token_accuracy": 0.6150148510932922, + "num_tokens": 7543195898.0, + "step": 14756 + }, + { + "epoch": 3.990535424553813, + "grad_norm": 1.1044665575027466, + "learning_rate": 3.857461322641537e-06, + "loss": 1.8909, + "mean_token_accuracy": 0.5637962818145752, + "num_tokens": 7543720059.0, + "step": 14757 + }, + { + "epoch": 3.9908058409951326, + "grad_norm": 0.9039793610572815, + "learning_rate": 3.856502265067416e-06, + "loss": 1.8374, + "mean_token_accuracy": 0.5644084811210632, + "num_tokens": 7544244171.0, + "step": 14758 + }, + { + "epoch": 3.9910762574364522, + "grad_norm": 0.9415552616119385, + "learning_rate": 3.8555434266768144e-06, + "loss": 1.782, + "mean_token_accuracy": 0.5861097574234009, + "num_tokens": 7544739999.0, + "step": 14759 + }, + { + "epoch": 3.991346673877772, + "grad_norm": 1.154558539390564, + "learning_rate": 3.85458480749915e-06, + "loss": 1.8955, + "mean_token_accuracy": 0.5577616691589355, + "num_tokens": 7545264155.0, + "step": 14760 + }, + { + "epoch": 3.9916170903190915, + "grad_norm": 0.3360750675201416, + "learning_rate": 3.853626407563839e-06, + "loss": 1.0338, + "mean_token_accuracy": 0.7234503626823425, + "num_tokens": 7545728913.0, + "step": 14761 + }, + { + "epoch": 3.991887506760411, + "grad_norm": 0.9777555465698242, + "learning_rate": 3.852668226900286e-06, + "loss": 1.8127, + "mean_token_accuracy": 0.5669019222259521, + "num_tokens": 7546253131.0, + "step": 14762 + }, + { + "epoch": 3.992157923201731, + "grad_norm": 0.9535797238349915, + "learning_rate": 3.851710265537888e-06, + "loss": 1.8174, + "mean_token_accuracy": 0.5841624736785889, + "num_tokens": 7546777329.0, + "step": 14763 + }, + { + "epoch": 3.9924283396430504, + "grad_norm": 0.9649894833564758, + "learning_rate": 3.85075252350604e-06, + "loss": 1.7503, + "mean_token_accuracy": 0.593694806098938, + "num_tokens": 7547301494.0, + "step": 14764 + }, + { + "epoch": 3.99269875608437, + "grad_norm": 0.8911652565002441, + "learning_rate": 3.849795000834132e-06, + "loss": 1.7905, + "mean_token_accuracy": 0.5718750357627869, + "num_tokens": 7547806925.0, + "step": 14765 + }, + { + "epoch": 3.9929691725256897, + "grad_norm": 0.940728485584259, + "learning_rate": 3.84883769755154e-06, + "loss": 1.78, + "mean_token_accuracy": 0.5734108686447144, + "num_tokens": 7548331198.0, + "step": 14766 + }, + { + "epoch": 3.9932395889670094, + "grad_norm": 1.0172322988510132, + "learning_rate": 3.847880613687639e-06, + "loss": 1.8291, + "mean_token_accuracy": 0.5661983489990234, + "num_tokens": 7548855383.0, + "step": 14767 + }, + { + "epoch": 3.9935100054083286, + "grad_norm": 0.9908467531204224, + "learning_rate": 3.846923749271793e-06, + "loss": 1.958, + "mean_token_accuracy": 0.5520155429840088, + "num_tokens": 7549349014.0, + "step": 14768 + }, + { + "epoch": 3.9937804218496487, + "grad_norm": 0.9942784309387207, + "learning_rate": 3.84596710433336e-06, + "loss": 1.7874, + "mean_token_accuracy": 0.5837844014167786, + "num_tokens": 7549810586.0, + "step": 14769 + }, + { + "epoch": 3.994050838290968, + "grad_norm": 0.9548399448394775, + "learning_rate": 3.845010678901698e-06, + "loss": 1.8183, + "mean_token_accuracy": 0.5788416862487793, + "num_tokens": 7550312518.0, + "step": 14770 + }, + { + "epoch": 3.994321254732288, + "grad_norm": 1.1686450242996216, + "learning_rate": 3.844054473006149e-06, + "loss": 1.8337, + "mean_token_accuracy": 0.5915398597717285, + "num_tokens": 7550767834.0, + "step": 14771 + }, + { + "epoch": 3.994591671173607, + "grad_norm": 0.9029080271720886, + "learning_rate": 3.843098486676051e-06, + "loss": 1.88, + "mean_token_accuracy": 0.5567842721939087, + "num_tokens": 7551291993.0, + "step": 14772 + }, + { + "epoch": 3.9948620876149272, + "grad_norm": 0.9059226512908936, + "learning_rate": 3.842142719940742e-06, + "loss": 1.7371, + "mean_token_accuracy": 0.5957275629043579, + "num_tokens": 7551774172.0, + "step": 14773 + }, + { + "epoch": 3.9951325040562464, + "grad_norm": 1.0090659856796265, + "learning_rate": 3.841187172829543e-06, + "loss": 1.8451, + "mean_token_accuracy": 0.5621147155761719, + "num_tokens": 7552298450.0, + "step": 14774 + }, + { + "epoch": 3.9954029204975665, + "grad_norm": 1.0876036882400513, + "learning_rate": 3.840231845371772e-06, + "loss": 1.9399, + "mean_token_accuracy": 0.5312661528587341, + "num_tokens": 7552822550.0, + "step": 14775 + }, + { + "epoch": 3.9956733369388857, + "grad_norm": 0.9354943037033081, + "learning_rate": 3.839276737596746e-06, + "loss": 1.7154, + "mean_token_accuracy": 0.5885032415390015, + "num_tokens": 7553346638.0, + "step": 14776 + }, + { + "epoch": 3.9959437533802054, + "grad_norm": 0.9735305309295654, + "learning_rate": 3.8383218495337656e-06, + "loss": 1.8867, + "mean_token_accuracy": 0.565670371055603, + "num_tokens": 7553870841.0, + "step": 14777 + }, + { + "epoch": 3.996214169821525, + "grad_norm": 1.0357223749160767, + "learning_rate": 3.837367181212134e-06, + "loss": 1.8931, + "mean_token_accuracy": 0.5818465948104858, + "num_tokens": 7554340092.0, + "step": 14778 + }, + { + "epoch": 3.9964845862628446, + "grad_norm": 1.037773847579956, + "learning_rate": 3.836412732661139e-06, + "loss": 1.8785, + "mean_token_accuracy": 0.5606476664543152, + "num_tokens": 7554864333.0, + "step": 14779 + }, + { + "epoch": 3.9967550027041643, + "grad_norm": 0.8201442956924438, + "learning_rate": 3.8354585039100716e-06, + "loss": 1.91, + "mean_token_accuracy": 0.5472253561019897, + "num_tokens": 7555388531.0, + "step": 14780 + }, + { + "epoch": 3.997025419145484, + "grad_norm": 0.31312665343284607, + "learning_rate": 3.834504494988203e-06, + "loss": 1.1208, + "mean_token_accuracy": 0.6951507925987244, + "num_tokens": 7555912725.0, + "step": 14781 + }, + { + "epoch": 3.9972958355868036, + "grad_norm": 1.0527799129486084, + "learning_rate": 3.833550705924812e-06, + "loss": 1.8152, + "mean_token_accuracy": 0.5795794725418091, + "num_tokens": 7556436921.0, + "step": 14782 + }, + { + "epoch": 3.997566252028123, + "grad_norm": 1.101432204246521, + "learning_rate": 3.832597136749158e-06, + "loss": 1.9901, + "mean_token_accuracy": 0.5421230792999268, + "num_tokens": 7556961192.0, + "step": 14783 + }, + { + "epoch": 3.997836668469443, + "grad_norm": 0.8847227692604065, + "learning_rate": 3.831643787490504e-06, + "loss": 1.7915, + "mean_token_accuracy": 0.5741401314735413, + "num_tokens": 7557485368.0, + "step": 14784 + }, + { + "epoch": 3.9981070849107625, + "grad_norm": 0.7880908846855164, + "learning_rate": 3.830690658178099e-06, + "loss": 1.8234, + "mean_token_accuracy": 0.5954388380050659, + "num_tokens": 7557966523.0, + "step": 14785 + }, + { + "epoch": 3.998377501352082, + "grad_norm": 0.8474429845809937, + "learning_rate": 3.829737748841187e-06, + "loss": 1.8246, + "mean_token_accuracy": 0.5904226899147034, + "num_tokens": 7558490716.0, + "step": 14786 + }, + { + "epoch": 3.998647917793402, + "grad_norm": 0.9524385333061218, + "learning_rate": 3.828785059509008e-06, + "loss": 1.8497, + "mean_token_accuracy": 0.5791015625, + "num_tokens": 7559014921.0, + "step": 14787 + }, + { + "epoch": 3.9989183342347214, + "grad_norm": 1.0187734365463257, + "learning_rate": 3.827832590210794e-06, + "loss": 1.7107, + "mean_token_accuracy": 0.5788374543190002, + "num_tokens": 7559539033.0, + "step": 14788 + }, + { + "epoch": 3.999188750676041, + "grad_norm": 1.0493863821029663, + "learning_rate": 3.826880340975765e-06, + "loss": 1.8242, + "mean_token_accuracy": 0.5770193338394165, + "num_tokens": 7560007864.0, + "step": 14789 + }, + { + "epoch": 3.9994591671173607, + "grad_norm": 0.8115931153297424, + "learning_rate": 3.825928311833143e-06, + "loss": 1.7385, + "mean_token_accuracy": 0.5790384411811829, + "num_tokens": 7560532057.0, + "step": 14790 + }, + { + "epoch": 3.9997295835586804, + "grad_norm": 0.9256179332733154, + "learning_rate": 3.824976502812139e-06, + "loss": 1.7665, + "mean_token_accuracy": 0.591363251209259, + "num_tokens": 7561035901.0, + "step": 14791 + }, + { + "epoch": 4.0, + "grad_norm": 0.9124437570571899, + "learning_rate": 3.824024913941956e-06, + "loss": 1.7927, + "mean_token_accuracy": 0.5829142928123474, + "num_tokens": 7561297916.0, + "step": 14792 + }, + { + "epoch": 4.000270416441319, + "grad_norm": 0.8841496109962463, + "learning_rate": 3.82307354525179e-06, + "loss": 1.7537, + "mean_token_accuracy": 0.5855730175971985, + "num_tokens": 7561822112.0, + "step": 14793 + }, + { + "epoch": 4.000540832882639, + "grad_norm": 0.9114842414855957, + "learning_rate": 3.822122396770836e-06, + "loss": 1.7678, + "mean_token_accuracy": 0.5883009433746338, + "num_tokens": 7562346308.0, + "step": 14794 + }, + { + "epoch": 4.0008112493239585, + "grad_norm": 0.7987900972366333, + "learning_rate": 3.821171468528273e-06, + "loss": 1.8357, + "mean_token_accuracy": 0.5659776329994202, + "num_tokens": 7562870558.0, + "step": 14795 + }, + { + "epoch": 4.001081665765279, + "grad_norm": 0.8420325517654419, + "learning_rate": 3.820220760553284e-06, + "loss": 1.7859, + "mean_token_accuracy": 0.5734621286392212, + "num_tokens": 7563394837.0, + "step": 14796 + }, + { + "epoch": 4.001352082206598, + "grad_norm": 1.1616159677505493, + "learning_rate": 3.8192702728750364e-06, + "loss": 1.8673, + "mean_token_accuracy": 0.5797274708747864, + "num_tokens": 7563858539.0, + "step": 14797 + }, + { + "epoch": 4.001622498647918, + "grad_norm": 0.8874658346176147, + "learning_rate": 3.8183200055226905e-06, + "loss": 1.7138, + "mean_token_accuracy": 0.6095274090766907, + "num_tokens": 7564382710.0, + "step": 14798 + }, + { + "epoch": 4.001892915089237, + "grad_norm": 0.941245436668396, + "learning_rate": 3.817369958525409e-06, + "loss": 1.8862, + "mean_token_accuracy": 0.5451473593711853, + "num_tokens": 7564906843.0, + "step": 14799 + }, + { + "epoch": 4.002163331530557, + "grad_norm": 0.8697585463523865, + "learning_rate": 3.816420131912342e-06, + "loss": 1.8877, + "mean_token_accuracy": 0.5633574724197388, + "num_tokens": 7565394751.0, + "step": 14800 + }, + { + "epoch": 4.002433747971876, + "grad_norm": 0.3799545168876648, + "learning_rate": 3.8154705257126265e-06, + "loss": 1.1332, + "mean_token_accuracy": 0.6897742748260498, + "num_tokens": 7565919025.0, + "step": 14801 + }, + { + "epoch": 4.002704164413196, + "grad_norm": 1.0632025003433228, + "learning_rate": 3.814521139955407e-06, + "loss": 1.8368, + "mean_token_accuracy": 0.5586837530136108, + "num_tokens": 7566443243.0, + "step": 14802 + }, + { + "epoch": 4.002974580854516, + "grad_norm": 0.9229710698127747, + "learning_rate": 3.8135719746698095e-06, + "loss": 1.8811, + "mean_token_accuracy": 0.5621488094329834, + "num_tokens": 7566967510.0, + "step": 14803 + }, + { + "epoch": 4.003244997295836, + "grad_norm": 0.9287973046302795, + "learning_rate": 3.812623029884955e-06, + "loss": 1.8408, + "mean_token_accuracy": 0.5611333250999451, + "num_tokens": 7567491636.0, + "step": 14804 + }, + { + "epoch": 4.003515413737155, + "grad_norm": 0.8091159462928772, + "learning_rate": 3.8116743056299665e-06, + "loss": 1.7128, + "mean_token_accuracy": 0.6141000986099243, + "num_tokens": 7568015825.0, + "step": 14805 + }, + { + "epoch": 4.003785830178475, + "grad_norm": 0.8131193518638611, + "learning_rate": 3.8107258019339467e-06, + "loss": 1.8297, + "mean_token_accuracy": 0.578331708908081, + "num_tokens": 7568540087.0, + "step": 14806 + }, + { + "epoch": 4.004056246619794, + "grad_norm": 0.8895211815834045, + "learning_rate": 3.809777518826001e-06, + "loss": 1.8986, + "mean_token_accuracy": 0.5706372857093811, + "num_tokens": 7569019794.0, + "step": 14807 + }, + { + "epoch": 4.004326663061114, + "grad_norm": 0.860489010810852, + "learning_rate": 3.808829456335229e-06, + "loss": 1.7979, + "mean_token_accuracy": 0.5921801328659058, + "num_tokens": 7569543947.0, + "step": 14808 + }, + { + "epoch": 4.0045970795024335, + "grad_norm": 1.0035032033920288, + "learning_rate": 3.807881614490715e-06, + "loss": 1.8161, + "mean_token_accuracy": 0.5849613547325134, + "num_tokens": 7570023389.0, + "step": 14809 + }, + { + "epoch": 4.004867495943754, + "grad_norm": 0.9601926803588867, + "learning_rate": 3.806933993321545e-06, + "loss": 1.7213, + "mean_token_accuracy": 0.5908823013305664, + "num_tokens": 7570500841.0, + "step": 14810 + }, + { + "epoch": 4.005137912385073, + "grad_norm": 0.9201894998550415, + "learning_rate": 3.805986592856795e-06, + "loss": 1.9296, + "mean_token_accuracy": 0.5448819994926453, + "num_tokens": 7571025041.0, + "step": 14811 + }, + { + "epoch": 4.005408328826393, + "grad_norm": 0.8685295581817627, + "learning_rate": 3.805039413125529e-06, + "loss": 1.6441, + "mean_token_accuracy": 0.6203261613845825, + "num_tokens": 7571549266.0, + "step": 14812 + }, + { + "epoch": 4.005678745267712, + "grad_norm": 0.8855488896369934, + "learning_rate": 3.804092454156816e-06, + "loss": 1.9249, + "mean_token_accuracy": 0.5670145153999329, + "num_tokens": 7572073396.0, + "step": 14813 + }, + { + "epoch": 4.005949161709032, + "grad_norm": 0.8523733019828796, + "learning_rate": 3.803145715979707e-06, + "loss": 1.8596, + "mean_token_accuracy": 0.5710268020629883, + "num_tokens": 7572597571.0, + "step": 14814 + }, + { + "epoch": 4.006219578150351, + "grad_norm": 0.8322278261184692, + "learning_rate": 3.802199198623251e-06, + "loss": 1.7915, + "mean_token_accuracy": 0.5847489833831787, + "num_tokens": 7573121790.0, + "step": 14815 + }, + { + "epoch": 4.006489994591671, + "grad_norm": 0.8479598760604858, + "learning_rate": 3.8012529021164924e-06, + "loss": 1.8661, + "mean_token_accuracy": 0.5887121558189392, + "num_tokens": 7573591285.0, + "step": 14816 + }, + { + "epoch": 4.006760411032991, + "grad_norm": 0.9780422449111938, + "learning_rate": 3.8003068264884655e-06, + "loss": 1.8014, + "mean_token_accuracy": 0.5793119668960571, + "num_tokens": 7574115314.0, + "step": 14817 + }, + { + "epoch": 4.007030827474311, + "grad_norm": 0.8606562614440918, + "learning_rate": 3.7993609717681958e-06, + "loss": 1.789, + "mean_token_accuracy": 0.597376823425293, + "num_tokens": 7574639526.0, + "step": 14818 + }, + { + "epoch": 4.00730124391563, + "grad_norm": 0.9348509907722473, + "learning_rate": 3.798415337984709e-06, + "loss": 1.9192, + "mean_token_accuracy": 0.574067234992981, + "num_tokens": 7575163720.0, + "step": 14819 + }, + { + "epoch": 4.00757166035695, + "grad_norm": 0.8758068084716797, + "learning_rate": 3.797469925167017e-06, + "loss": 1.7867, + "mean_token_accuracy": 0.568188488483429, + "num_tokens": 7575643468.0, + "step": 14820 + }, + { + "epoch": 4.007842076798269, + "grad_norm": 0.3538861572742462, + "learning_rate": 3.7965247333441302e-06, + "loss": 1.1132, + "mean_token_accuracy": 0.6995450258255005, + "num_tokens": 7576167726.0, + "step": 14821 + }, + { + "epoch": 4.008112493239589, + "grad_norm": 1.0959491729736328, + "learning_rate": 3.795579762545045e-06, + "loss": 1.8012, + "mean_token_accuracy": 0.5446892976760864, + "num_tokens": 7576691976.0, + "step": 14822 + }, + { + "epoch": 4.0083829096809085, + "grad_norm": 0.9818622469902039, + "learning_rate": 3.794635012798764e-06, + "loss": 1.7945, + "mean_token_accuracy": 0.5811810493469238, + "num_tokens": 7577216135.0, + "step": 14823 + }, + { + "epoch": 4.008653326122229, + "grad_norm": 0.9634556770324707, + "learning_rate": 3.793690484134267e-06, + "loss": 1.7154, + "mean_token_accuracy": 0.5829111933708191, + "num_tokens": 7577740237.0, + "step": 14824 + }, + { + "epoch": 4.008923742563548, + "grad_norm": 0.975943922996521, + "learning_rate": 3.792746176580542e-06, + "loss": 1.8369, + "mean_token_accuracy": 0.5770041942596436, + "num_tokens": 7578264483.0, + "step": 14825 + }, + { + "epoch": 4.009194159004868, + "grad_norm": 0.9275326132774353, + "learning_rate": 3.791802090166558e-06, + "loss": 1.6999, + "mean_token_accuracy": 0.5976506471633911, + "num_tokens": 7578788739.0, + "step": 14826 + }, + { + "epoch": 4.009464575446187, + "grad_norm": 1.019811749458313, + "learning_rate": 3.7908582249212822e-06, + "loss": 1.8233, + "mean_token_accuracy": 0.5557518005371094, + "num_tokens": 7579313019.0, + "step": 14827 + }, + { + "epoch": 4.009734991887507, + "grad_norm": 0.8398231863975525, + "learning_rate": 3.789914580873678e-06, + "loss": 1.8873, + "mean_token_accuracy": 0.5851339101791382, + "num_tokens": 7579835132.0, + "step": 14828 + }, + { + "epoch": 4.010005408328826, + "grad_norm": 0.8894381523132324, + "learning_rate": 3.7889711580526967e-06, + "loss": 1.8216, + "mean_token_accuracy": 0.5674422979354858, + "num_tokens": 7580326552.0, + "step": 14829 + }, + { + "epoch": 4.010275824770146, + "grad_norm": 0.8593464493751526, + "learning_rate": 3.7880279564872885e-06, + "loss": 1.7144, + "mean_token_accuracy": 0.5956694483757019, + "num_tokens": 7580850622.0, + "step": 14830 + }, + { + "epoch": 4.010546241211466, + "grad_norm": 0.9470400214195251, + "learning_rate": 3.787084976206392e-06, + "loss": 1.6772, + "mean_token_accuracy": 0.5842567086219788, + "num_tokens": 7581374888.0, + "step": 14831 + }, + { + "epoch": 4.010816657652786, + "grad_norm": 0.8742706179618835, + "learning_rate": 3.786142217238937e-06, + "loss": 1.8452, + "mean_token_accuracy": 0.5688941478729248, + "num_tokens": 7581898996.0, + "step": 14832 + }, + { + "epoch": 4.011087074094105, + "grad_norm": 1.031076192855835, + "learning_rate": 3.7851996796138566e-06, + "loss": 1.9253, + "mean_token_accuracy": 0.5820664763450623, + "num_tokens": 7582362665.0, + "step": 14833 + }, + { + "epoch": 4.011357490535424, + "grad_norm": 0.9010686278343201, + "learning_rate": 3.7842573633600643e-06, + "loss": 1.7901, + "mean_token_accuracy": 0.5837481617927551, + "num_tokens": 7582850142.0, + "step": 14834 + }, + { + "epoch": 4.011627906976744, + "grad_norm": 0.8402722477912903, + "learning_rate": 3.78331526850648e-06, + "loss": 1.908, + "mean_token_accuracy": 0.5581529140472412, + "num_tokens": 7583374299.0, + "step": 14835 + }, + { + "epoch": 4.011898323418063, + "grad_norm": 1.1217328310012817, + "learning_rate": 3.782373395082002e-06, + "loss": 1.6777, + "mean_token_accuracy": 0.599742591381073, + "num_tokens": 7583854212.0, + "step": 14836 + }, + { + "epoch": 4.0121687398593835, + "grad_norm": 0.9409195184707642, + "learning_rate": 3.7814317431155374e-06, + "loss": 1.8971, + "mean_token_accuracy": 0.5679407119750977, + "num_tokens": 7584378366.0, + "step": 14837 + }, + { + "epoch": 4.012439156300703, + "grad_norm": 0.8310165405273438, + "learning_rate": 3.7804903126359727e-06, + "loss": 1.8276, + "mean_token_accuracy": 0.5638518333435059, + "num_tokens": 7584902651.0, + "step": 14838 + }, + { + "epoch": 4.012709572742023, + "grad_norm": 0.8619785308837891, + "learning_rate": 3.779549103672199e-06, + "loss": 1.7735, + "mean_token_accuracy": 0.5894484519958496, + "num_tokens": 7585426684.0, + "step": 14839 + }, + { + "epoch": 4.012979989183342, + "grad_norm": 0.9478840231895447, + "learning_rate": 3.7786081162530925e-06, + "loss": 1.6428, + "mean_token_accuracy": 0.6264703869819641, + "num_tokens": 7585950873.0, + "step": 14840 + }, + { + "epoch": 4.013250405624662, + "grad_norm": 0.34558919072151184, + "learning_rate": 3.7776673504075234e-06, + "loss": 1.1244, + "mean_token_accuracy": 0.7028748393058777, + "num_tokens": 7586432262.0, + "step": 14841 + }, + { + "epoch": 4.013520822065981, + "grad_norm": 0.9176939129829407, + "learning_rate": 3.7767268061643615e-06, + "loss": 1.8359, + "mean_token_accuracy": 0.5601601600646973, + "num_tokens": 7586956369.0, + "step": 14842 + }, + { + "epoch": 4.013791238507301, + "grad_norm": 0.9236729741096497, + "learning_rate": 3.7757864835524634e-06, + "loss": 1.7608, + "mean_token_accuracy": 0.6008877754211426, + "num_tokens": 7587480639.0, + "step": 14843 + }, + { + "epoch": 4.0140616549486205, + "grad_norm": 0.9192430973052979, + "learning_rate": 3.7748463826006787e-06, + "loss": 1.8553, + "mean_token_accuracy": 0.574134886264801, + "num_tokens": 7588004859.0, + "step": 14844 + }, + { + "epoch": 4.014332071389941, + "grad_norm": 1.0433502197265625, + "learning_rate": 3.7739065033378573e-06, + "loss": 1.8639, + "mean_token_accuracy": 0.6026706695556641, + "num_tokens": 7588463818.0, + "step": 14845 + }, + { + "epoch": 4.01460248783126, + "grad_norm": 0.875743567943573, + "learning_rate": 3.772966845792835e-06, + "loss": 1.9658, + "mean_token_accuracy": 0.5722414255142212, + "num_tokens": 7588900135.0, + "step": 14846 + }, + { + "epoch": 4.01487290427258, + "grad_norm": 0.8532765507698059, + "learning_rate": 3.772027409994442e-06, + "loss": 1.828, + "mean_token_accuracy": 0.5681650042533875, + "num_tokens": 7589424389.0, + "step": 14847 + }, + { + "epoch": 4.015143320713899, + "grad_norm": 0.9415515661239624, + "learning_rate": 3.771088195971505e-06, + "loss": 1.8351, + "mean_token_accuracy": 0.6069419384002686, + "num_tokens": 7589948543.0, + "step": 14848 + }, + { + "epoch": 4.015413737155219, + "grad_norm": 0.9836796522140503, + "learning_rate": 3.7701492037528382e-06, + "loss": 1.8793, + "mean_token_accuracy": 0.5769377946853638, + "num_tokens": 7590472788.0, + "step": 14849 + }, + { + "epoch": 4.015684153596538, + "grad_norm": 0.819135844707489, + "learning_rate": 3.7692104333672564e-06, + "loss": 1.8777, + "mean_token_accuracy": 0.5654624700546265, + "num_tokens": 7590997051.0, + "step": 14850 + }, + { + "epoch": 4.0159545700378585, + "grad_norm": 0.9673253893852234, + "learning_rate": 3.7682718848435653e-06, + "loss": 1.6988, + "mean_token_accuracy": 0.5895029306411743, + "num_tokens": 7591461996.0, + "step": 14851 + }, + { + "epoch": 4.016224986479178, + "grad_norm": 0.8347160220146179, + "learning_rate": 3.7673335582105575e-06, + "loss": 1.8238, + "mean_token_accuracy": 0.5770128965377808, + "num_tokens": 7591986270.0, + "step": 14852 + }, + { + "epoch": 4.016495402920498, + "grad_norm": 1.020591139793396, + "learning_rate": 3.7663954534970275e-06, + "loss": 1.8457, + "mean_token_accuracy": 0.5815984010696411, + "num_tokens": 7592510498.0, + "step": 14853 + }, + { + "epoch": 4.016765819361817, + "grad_norm": 0.910916268825531, + "learning_rate": 3.7654575707317588e-06, + "loss": 1.9865, + "mean_token_accuracy": 0.5527986288070679, + "num_tokens": 7593034751.0, + "step": 14854 + }, + { + "epoch": 4.017036235803137, + "grad_norm": 1.075487494468689, + "learning_rate": 3.7645199099435242e-06, + "loss": 1.898, + "mean_token_accuracy": 0.5815747380256653, + "num_tokens": 7593516981.0, + "step": 14855 + }, + { + "epoch": 4.017306652244456, + "grad_norm": 0.8966935276985168, + "learning_rate": 3.763582471161099e-06, + "loss": 1.7697, + "mean_token_accuracy": 0.5962746143341064, + "num_tokens": 7594036756.0, + "step": 14856 + }, + { + "epoch": 4.017577068685776, + "grad_norm": 0.886327862739563, + "learning_rate": 3.762645254413245e-06, + "loss": 1.8451, + "mean_token_accuracy": 0.5554536581039429, + "num_tokens": 7594560941.0, + "step": 14857 + }, + { + "epoch": 4.0178474851270956, + "grad_norm": 0.9446104764938354, + "learning_rate": 3.761708259728716e-06, + "loss": 1.7554, + "mean_token_accuracy": 0.5969774723052979, + "num_tokens": 7595085074.0, + "step": 14858 + }, + { + "epoch": 4.018117901568416, + "grad_norm": 0.8811548948287964, + "learning_rate": 3.7607714871362665e-06, + "loss": 1.8165, + "mean_token_accuracy": 0.5776767134666443, + "num_tokens": 7595609347.0, + "step": 14859 + }, + { + "epoch": 4.018388318009735, + "grad_norm": 1.008363127708435, + "learning_rate": 3.7598349366646368e-06, + "loss": 1.96, + "mean_token_accuracy": 0.5635336637496948, + "num_tokens": 7596133596.0, + "step": 14860 + }, + { + "epoch": 4.018658734451055, + "grad_norm": 0.3283153176307678, + "learning_rate": 3.7588986083425606e-06, + "loss": 1.0959, + "mean_token_accuracy": 0.705880880355835, + "num_tokens": 7596657708.0, + "step": 14861 + }, + { + "epoch": 4.018929150892374, + "grad_norm": 0.9238629937171936, + "learning_rate": 3.7579625021987724e-06, + "loss": 1.85, + "mean_token_accuracy": 0.5618258714675903, + "num_tokens": 7597181980.0, + "step": 14862 + }, + { + "epoch": 4.019199567333694, + "grad_norm": 0.991515040397644, + "learning_rate": 3.75702661826199e-06, + "loss": 1.7752, + "mean_token_accuracy": 0.5914968252182007, + "num_tokens": 7597662735.0, + "step": 14863 + }, + { + "epoch": 4.019469983775013, + "grad_norm": 0.8119651675224304, + "learning_rate": 3.7560909565609304e-06, + "loss": 1.6747, + "mean_token_accuracy": 0.59783935546875, + "num_tokens": 7598160726.0, + "step": 14864 + }, + { + "epoch": 4.0197404002163335, + "grad_norm": 0.8163799047470093, + "learning_rate": 3.7551555171243054e-06, + "loss": 1.8034, + "mean_token_accuracy": 0.5742579102516174, + "num_tokens": 7598640307.0, + "step": 14865 + }, + { + "epoch": 4.020010816657653, + "grad_norm": 0.9418290257453918, + "learning_rate": 3.7542202999808163e-06, + "loss": 1.8097, + "mean_token_accuracy": 0.5993964672088623, + "num_tokens": 7599100687.0, + "step": 14866 + }, + { + "epoch": 4.020281233098973, + "grad_norm": 1.0331838130950928, + "learning_rate": 3.753285305159153e-06, + "loss": 1.8511, + "mean_token_accuracy": 0.5717531442642212, + "num_tokens": 7599580931.0, + "step": 14867 + }, + { + "epoch": 4.020551649540292, + "grad_norm": 0.911920964717865, + "learning_rate": 3.7523505326880105e-06, + "loss": 1.7119, + "mean_token_accuracy": 0.5790311098098755, + "num_tokens": 7600105077.0, + "step": 14868 + }, + { + "epoch": 4.020822065981612, + "grad_norm": 0.9192795157432556, + "learning_rate": 3.7514159825960685e-06, + "loss": 1.7731, + "mean_token_accuracy": 0.5849630236625671, + "num_tokens": 7600629344.0, + "step": 14869 + }, + { + "epoch": 4.021092482422931, + "grad_norm": 1.0445735454559326, + "learning_rate": 3.750481654911997e-06, + "loss": 1.7482, + "mean_token_accuracy": 0.5830217003822327, + "num_tokens": 7601142989.0, + "step": 14870 + }, + { + "epoch": 4.021362898864251, + "grad_norm": 0.9857206344604492, + "learning_rate": 3.7495475496644705e-06, + "loss": 1.9157, + "mean_token_accuracy": 0.572948694229126, + "num_tokens": 7601667090.0, + "step": 14871 + }, + { + "epoch": 4.0216333153055706, + "grad_norm": 0.9280788898468018, + "learning_rate": 3.748613666882148e-06, + "loss": 1.7971, + "mean_token_accuracy": 0.5681334137916565, + "num_tokens": 7602191343.0, + "step": 14872 + }, + { + "epoch": 4.021903731746891, + "grad_norm": 0.8730428218841553, + "learning_rate": 3.74768000659368e-06, + "loss": 1.7846, + "mean_token_accuracy": 0.5751803517341614, + "num_tokens": 7602715545.0, + "step": 14873 + }, + { + "epoch": 4.02217414818821, + "grad_norm": 1.0590206384658813, + "learning_rate": 3.7467465688277206e-06, + "loss": 1.8841, + "mean_token_accuracy": 0.5762248039245605, + "num_tokens": 7603239791.0, + "step": 14874 + }, + { + "epoch": 4.022444564629529, + "grad_norm": 0.9278325438499451, + "learning_rate": 3.745813353612904e-06, + "loss": 1.9173, + "mean_token_accuracy": 0.5501875877380371, + "num_tokens": 7603763961.0, + "step": 14875 + }, + { + "epoch": 4.022714981070849, + "grad_norm": 0.8927293419837952, + "learning_rate": 3.7448803609778693e-06, + "loss": 1.7307, + "mean_token_accuracy": 0.5758888721466064, + "num_tokens": 7604288152.0, + "step": 14876 + }, + { + "epoch": 4.022985397512168, + "grad_norm": 0.9460015296936035, + "learning_rate": 3.743947590951238e-06, + "loss": 1.8322, + "mean_token_accuracy": 0.5626657009124756, + "num_tokens": 7604812398.0, + "step": 14877 + }, + { + "epoch": 4.023255813953488, + "grad_norm": 1.0670387744903564, + "learning_rate": 3.743015043561636e-06, + "loss": 1.8409, + "mean_token_accuracy": 0.5678422451019287, + "num_tokens": 7605336552.0, + "step": 14878 + }, + { + "epoch": 4.023526230394808, + "grad_norm": 0.9904413223266602, + "learning_rate": 3.7420827188376723e-06, + "loss": 1.7633, + "mean_token_accuracy": 0.6169813871383667, + "num_tokens": 7605860701.0, + "step": 14879 + }, + { + "epoch": 4.023796646836128, + "grad_norm": 0.8567332029342651, + "learning_rate": 3.741150616807956e-06, + "loss": 1.7787, + "mean_token_accuracy": 0.5790310502052307, + "num_tokens": 7606384813.0, + "step": 14880 + }, + { + "epoch": 4.024067063277447, + "grad_norm": 0.3529689908027649, + "learning_rate": 3.7402187375010834e-06, + "loss": 1.0581, + "mean_token_accuracy": 0.710898220539093, + "num_tokens": 7606908975.0, + "step": 14881 + }, + { + "epoch": 4.024337479718767, + "grad_norm": 1.1829471588134766, + "learning_rate": 3.7392870809456526e-06, + "loss": 1.8783, + "mean_token_accuracy": 0.5810455083847046, + "num_tokens": 7607433087.0, + "step": 14882 + }, + { + "epoch": 4.024607896160086, + "grad_norm": 1.0694866180419922, + "learning_rate": 3.7383556471702475e-06, + "loss": 1.7387, + "mean_token_accuracy": 0.5937141180038452, + "num_tokens": 7607947911.0, + "step": 14883 + }, + { + "epoch": 4.024878312601406, + "grad_norm": 0.9512230753898621, + "learning_rate": 3.737424436203442e-06, + "loss": 1.8491, + "mean_token_accuracy": 0.5765260457992554, + "num_tokens": 7608424133.0, + "step": 14884 + }, + { + "epoch": 4.0251487290427255, + "grad_norm": 0.9799008965492249, + "learning_rate": 3.7364934480738156e-06, + "loss": 1.8656, + "mean_token_accuracy": 0.572163462638855, + "num_tokens": 7608925270.0, + "step": 14885 + }, + { + "epoch": 4.025419145484046, + "grad_norm": 0.7397030591964722, + "learning_rate": 3.735562682809931e-06, + "loss": 1.7683, + "mean_token_accuracy": 0.5880072116851807, + "num_tokens": 7609449277.0, + "step": 14886 + }, + { + "epoch": 4.025689561925365, + "grad_norm": 0.9415981769561768, + "learning_rate": 3.734632140440344e-06, + "loss": 1.9416, + "mean_token_accuracy": 0.5334948897361755, + "num_tokens": 7609973413.0, + "step": 14887 + }, + { + "epoch": 4.025959978366685, + "grad_norm": 0.9018350839614868, + "learning_rate": 3.7337018209936126e-06, + "loss": 1.9057, + "mean_token_accuracy": 0.5549361109733582, + "num_tokens": 7610497670.0, + "step": 14888 + }, + { + "epoch": 4.026230394808004, + "grad_norm": 0.9350273609161377, + "learning_rate": 3.7327717244982764e-06, + "loss": 1.8165, + "mean_token_accuracy": 0.5846436023712158, + "num_tokens": 7611000062.0, + "step": 14889 + }, + { + "epoch": 4.026500811249324, + "grad_norm": 0.8982101082801819, + "learning_rate": 3.7318418509828737e-06, + "loss": 1.8574, + "mean_token_accuracy": 0.5797721147537231, + "num_tokens": 7611472899.0, + "step": 14890 + }, + { + "epoch": 4.026771227690643, + "grad_norm": 0.9135673642158508, + "learning_rate": 3.7309122004759367e-06, + "loss": 1.8448, + "mean_token_accuracy": 0.5881301164627075, + "num_tokens": 7611997075.0, + "step": 14891 + }, + { + "epoch": 4.027041644131963, + "grad_norm": 0.8782705664634705, + "learning_rate": 3.7299827730059936e-06, + "loss": 1.8502, + "mean_token_accuracy": 0.5510729551315308, + "num_tokens": 7612521155.0, + "step": 14892 + }, + { + "epoch": 4.027312060573283, + "grad_norm": 0.9149389266967773, + "learning_rate": 3.729053568601556e-06, + "loss": 1.9316, + "mean_token_accuracy": 0.5554503798484802, + "num_tokens": 7613045378.0, + "step": 14893 + }, + { + "epoch": 4.027582477014603, + "grad_norm": 1.028359055519104, + "learning_rate": 3.72812458729114e-06, + "loss": 1.9679, + "mean_token_accuracy": 0.567774772644043, + "num_tokens": 7613518948.0, + "step": 14894 + }, + { + "epoch": 4.027852893455922, + "grad_norm": 1.0327860116958618, + "learning_rate": 3.727195829103246e-06, + "loss": 1.7967, + "mean_token_accuracy": 0.589145302772522, + "num_tokens": 7614032324.0, + "step": 14895 + }, + { + "epoch": 4.028123309897242, + "grad_norm": 0.8070108294487, + "learning_rate": 3.72626729406637e-06, + "loss": 1.8294, + "mean_token_accuracy": 0.5696851015090942, + "num_tokens": 7614556484.0, + "step": 14896 + }, + { + "epoch": 4.028393726338561, + "grad_norm": 0.8005263805389404, + "learning_rate": 3.7253389822090057e-06, + "loss": 1.7585, + "mean_token_accuracy": 0.5811965465545654, + "num_tokens": 7615080687.0, + "step": 14897 + }, + { + "epoch": 4.028664142779881, + "grad_norm": 1.0101443529129028, + "learning_rate": 3.7244108935596333e-06, + "loss": 1.8363, + "mean_token_accuracy": 0.5707651376724243, + "num_tokens": 7615604864.0, + "step": 14898 + }, + { + "epoch": 4.0289345592212005, + "grad_norm": 0.8939626812934875, + "learning_rate": 3.723483028146734e-06, + "loss": 1.8286, + "mean_token_accuracy": 0.5529855489730835, + "num_tokens": 7616129117.0, + "step": 14899 + }, + { + "epoch": 4.029204975662521, + "grad_norm": 0.8839665055274963, + "learning_rate": 3.722555385998773e-06, + "loss": 1.6923, + "mean_token_accuracy": 0.6083060503005981, + "num_tokens": 7616653380.0, + "step": 14900 + }, + { + "epoch": 4.02947539210384, + "grad_norm": 0.3540569245815277, + "learning_rate": 3.7216279671442124e-06, + "loss": 1.0219, + "mean_token_accuracy": 0.7318061590194702, + "num_tokens": 7617177623.0, + "step": 14901 + }, + { + "epoch": 4.02974580854516, + "grad_norm": 1.121214747428894, + "learning_rate": 3.720700771611513e-06, + "loss": 1.78, + "mean_token_accuracy": 0.5851101875305176, + "num_tokens": 7617664894.0, + "step": 14902 + }, + { + "epoch": 4.030016224986479, + "grad_norm": 1.2395949363708496, + "learning_rate": 3.7197737994291195e-06, + "loss": 1.8282, + "mean_token_accuracy": 0.5788165926933289, + "num_tokens": 7618189169.0, + "step": 14903 + }, + { + "epoch": 4.030286641427799, + "grad_norm": 1.0548021793365479, + "learning_rate": 3.7188470506254746e-06, + "loss": 1.8364, + "mean_token_accuracy": 0.5850969552993774, + "num_tokens": 7618684988.0, + "step": 14904 + }, + { + "epoch": 4.030557057869118, + "grad_norm": 0.7713566422462463, + "learning_rate": 3.7179205252290164e-06, + "loss": 1.8015, + "mean_token_accuracy": 0.5795046091079712, + "num_tokens": 7619154629.0, + "step": 14905 + }, + { + "epoch": 4.030827474310438, + "grad_norm": 0.8694214224815369, + "learning_rate": 3.716994223268169e-06, + "loss": 1.8203, + "mean_token_accuracy": 0.590462863445282, + "num_tokens": 7619678814.0, + "step": 14906 + }, + { + "epoch": 4.031097890751758, + "grad_norm": 1.238563895225525, + "learning_rate": 3.716068144771357e-06, + "loss": 1.7756, + "mean_token_accuracy": 0.5954707860946655, + "num_tokens": 7620203069.0, + "step": 14907 + }, + { + "epoch": 4.031368307193078, + "grad_norm": 0.969373881816864, + "learning_rate": 3.7151422897669974e-06, + "loss": 1.5554, + "mean_token_accuracy": 0.631787896156311, + "num_tokens": 7620727242.0, + "step": 14908 + }, + { + "epoch": 4.031638723634397, + "grad_norm": 1.141526460647583, + "learning_rate": 3.714216658283496e-06, + "loss": 1.889, + "mean_token_accuracy": 0.5670475959777832, + "num_tokens": 7621251264.0, + "step": 14909 + }, + { + "epoch": 4.031909140075717, + "grad_norm": 0.9314674139022827, + "learning_rate": 3.7132912503492506e-06, + "loss": 1.9838, + "mean_token_accuracy": 0.5451695919036865, + "num_tokens": 7621775343.0, + "step": 14910 + }, + { + "epoch": 4.032179556517036, + "grad_norm": 1.3738731145858765, + "learning_rate": 3.712366065992662e-06, + "loss": 2.0685, + "mean_token_accuracy": 0.5315189957618713, + "num_tokens": 7622275427.0, + "step": 14911 + }, + { + "epoch": 4.032449972958356, + "grad_norm": 0.8748582005500793, + "learning_rate": 3.711441105242112e-06, + "loss": 1.5542, + "mean_token_accuracy": 0.6276804804801941, + "num_tokens": 7622799655.0, + "step": 14912 + }, + { + "epoch": 4.0327203893996755, + "grad_norm": 1.1618413925170898, + "learning_rate": 3.7105163681259826e-06, + "loss": 1.8576, + "mean_token_accuracy": 0.5741295218467712, + "num_tokens": 7623323831.0, + "step": 14913 + }, + { + "epoch": 4.032990805840996, + "grad_norm": 0.9129734635353088, + "learning_rate": 3.7095918546726495e-06, + "loss": 1.8636, + "mean_token_accuracy": 0.5760625600814819, + "num_tokens": 7623848113.0, + "step": 14914 + }, + { + "epoch": 4.033261222282315, + "grad_norm": 0.7897258400917053, + "learning_rate": 3.7086675649104785e-06, + "loss": 1.8628, + "mean_token_accuracy": 0.5635303258895874, + "num_tokens": 7624372307.0, + "step": 14915 + }, + { + "epoch": 4.033531638723634, + "grad_norm": 1.2168489694595337, + "learning_rate": 3.7077434988678265e-06, + "loss": 1.85, + "mean_token_accuracy": 0.5708574056625366, + "num_tokens": 7624882415.0, + "step": 14916 + }, + { + "epoch": 4.033802055164954, + "grad_norm": 1.0203608274459839, + "learning_rate": 3.7068196565730512e-06, + "loss": 1.8711, + "mean_token_accuracy": 0.5741183757781982, + "num_tokens": 7625402427.0, + "step": 14917 + }, + { + "epoch": 4.034072471606273, + "grad_norm": 1.0196093320846558, + "learning_rate": 3.7058960380544975e-06, + "loss": 1.8359, + "mean_token_accuracy": 0.5852757692337036, + "num_tokens": 7625865823.0, + "step": 14918 + }, + { + "epoch": 4.034342888047593, + "grad_norm": 0.935547411441803, + "learning_rate": 3.704972643340501e-06, + "loss": 1.6776, + "mean_token_accuracy": 0.6152776479721069, + "num_tokens": 7626389827.0, + "step": 14919 + }, + { + "epoch": 4.0346133044889125, + "grad_norm": 1.0018057823181152, + "learning_rate": 3.7040494724593967e-06, + "loss": 1.8274, + "mean_token_accuracy": 0.5731360912322998, + "num_tokens": 7626913984.0, + "step": 14920 + }, + { + "epoch": 4.034883720930233, + "grad_norm": 0.3393457531929016, + "learning_rate": 3.7031265254395133e-06, + "loss": 1.1428, + "mean_token_accuracy": 0.69303959608078, + "num_tokens": 7627438164.0, + "step": 14921 + }, + { + "epoch": 4.035154137371552, + "grad_norm": 1.1773960590362549, + "learning_rate": 3.7022038023091644e-06, + "loss": 1.873, + "mean_token_accuracy": 0.543124258518219, + "num_tokens": 7627962413.0, + "step": 14922 + }, + { + "epoch": 4.035424553812872, + "grad_norm": 0.9116653800010681, + "learning_rate": 3.7012813030966664e-06, + "loss": 1.8404, + "mean_token_accuracy": 0.5762760639190674, + "num_tokens": 7628486600.0, + "step": 14923 + }, + { + "epoch": 4.035694970254191, + "grad_norm": 0.9294024705886841, + "learning_rate": 3.7003590278303205e-06, + "loss": 1.7899, + "mean_token_accuracy": 0.5831605195999146, + "num_tokens": 7629010815.0, + "step": 14924 + }, + { + "epoch": 4.035965386695511, + "grad_norm": 0.8647240996360779, + "learning_rate": 3.6994369765384275e-06, + "loss": 1.8268, + "mean_token_accuracy": 0.5797038078308105, + "num_tokens": 7629506416.0, + "step": 14925 + }, + { + "epoch": 4.03623580313683, + "grad_norm": 1.0299959182739258, + "learning_rate": 3.698515149249279e-06, + "loss": 1.7957, + "mean_token_accuracy": 0.5929615497589111, + "num_tokens": 7630030649.0, + "step": 14926 + }, + { + "epoch": 4.0365062195781505, + "grad_norm": 0.976788341999054, + "learning_rate": 3.697593545991155e-06, + "loss": 1.9495, + "mean_token_accuracy": 0.558861494064331, + "num_tokens": 7630520144.0, + "step": 14927 + }, + { + "epoch": 4.03677663601947, + "grad_norm": 0.9122642874717712, + "learning_rate": 3.6966721667923376e-06, + "loss": 1.8466, + "mean_token_accuracy": 0.5700855255126953, + "num_tokens": 7630997653.0, + "step": 14928 + }, + { + "epoch": 4.03704705246079, + "grad_norm": 1.083797812461853, + "learning_rate": 3.6957510116810976e-06, + "loss": 1.7648, + "mean_token_accuracy": 0.5982084274291992, + "num_tokens": 7631521890.0, + "step": 14929 + }, + { + "epoch": 4.037317468902109, + "grad_norm": 1.0541795492172241, + "learning_rate": 3.694830080685694e-06, + "loss": 1.8499, + "mean_token_accuracy": 0.5888627767562866, + "num_tokens": 7632015853.0, + "step": 14930 + }, + { + "epoch": 4.037587885343429, + "grad_norm": 0.9915549755096436, + "learning_rate": 3.693909373834388e-06, + "loss": 1.7813, + "mean_token_accuracy": 0.5892852544784546, + "num_tokens": 7632540099.0, + "step": 14931 + }, + { + "epoch": 4.037858301784748, + "grad_norm": 1.0107015371322632, + "learning_rate": 3.692988891155429e-06, + "loss": 1.8117, + "mean_token_accuracy": 0.5917382836341858, + "num_tokens": 7633064299.0, + "step": 14932 + }, + { + "epoch": 4.038128718226068, + "grad_norm": 1.1210672855377197, + "learning_rate": 3.6920686326770566e-06, + "loss": 1.8311, + "mean_token_accuracy": 0.5709081888198853, + "num_tokens": 7633588425.0, + "step": 14933 + }, + { + "epoch": 4.0383991346673875, + "grad_norm": 1.0166188478469849, + "learning_rate": 3.6911485984275102e-06, + "loss": 1.7982, + "mean_token_accuracy": 0.5818052887916565, + "num_tokens": 7634069496.0, + "step": 14934 + }, + { + "epoch": 4.038669551108708, + "grad_norm": 1.1167553663253784, + "learning_rate": 3.690228788435021e-06, + "loss": 1.8178, + "mean_token_accuracy": 0.5588177442550659, + "num_tokens": 7634593622.0, + "step": 14935 + }, + { + "epoch": 4.038939967550027, + "grad_norm": 0.819531261920929, + "learning_rate": 3.6893092027278072e-06, + "loss": 1.7589, + "mean_token_accuracy": 0.579052746295929, + "num_tokens": 7635117751.0, + "step": 14936 + }, + { + "epoch": 4.039210383991347, + "grad_norm": 0.9955248236656189, + "learning_rate": 3.688389841334088e-06, + "loss": 1.8479, + "mean_token_accuracy": 0.5714765787124634, + "num_tokens": 7635641972.0, + "step": 14937 + }, + { + "epoch": 4.039480800432666, + "grad_norm": 1.1392807960510254, + "learning_rate": 3.687470704282071e-06, + "loss": 1.8662, + "mean_token_accuracy": 0.5648195743560791, + "num_tokens": 7636166202.0, + "step": 14938 + }, + { + "epoch": 4.039751216873986, + "grad_norm": 0.8547882437705994, + "learning_rate": 3.686551791599955e-06, + "loss": 1.9285, + "mean_token_accuracy": 0.5659945011138916, + "num_tokens": 7636690423.0, + "step": 14939 + }, + { + "epoch": 4.040021633315305, + "grad_norm": 0.9049164056777954, + "learning_rate": 3.6856331033159397e-06, + "loss": 1.7147, + "mean_token_accuracy": 0.5988491177558899, + "num_tokens": 7637151504.0, + "step": 14940 + }, + { + "epoch": 4.0402920497566255, + "grad_norm": 0.3430812656879425, + "learning_rate": 3.6847146394582113e-06, + "loss": 1.1196, + "mean_token_accuracy": 0.6849342584609985, + "num_tokens": 7637675645.0, + "step": 14941 + }, + { + "epoch": 4.040562466197945, + "grad_norm": 1.1438156366348267, + "learning_rate": 3.683796400054948e-06, + "loss": 1.8509, + "mean_token_accuracy": 0.5699619650840759, + "num_tokens": 7638199912.0, + "step": 14942 + }, + { + "epoch": 4.040832882639265, + "grad_norm": 1.0149261951446533, + "learning_rate": 3.6828783851343286e-06, + "loss": 1.7423, + "mean_token_accuracy": 0.5904842615127563, + "num_tokens": 7638724152.0, + "step": 14943 + }, + { + "epoch": 4.041103299080584, + "grad_norm": 0.9519296288490295, + "learning_rate": 3.681960594724517e-06, + "loss": 1.9349, + "mean_token_accuracy": 0.5362993478775024, + "num_tokens": 7639248270.0, + "step": 14944 + }, + { + "epoch": 4.041373715521904, + "grad_norm": 0.8823251128196716, + "learning_rate": 3.681043028853677e-06, + "loss": 1.8859, + "mean_token_accuracy": 0.5706871747970581, + "num_tokens": 7639772439.0, + "step": 14945 + }, + { + "epoch": 4.041644131963223, + "grad_norm": 0.8116264939308167, + "learning_rate": 3.6801256875499614e-06, + "loss": 1.7869, + "mean_token_accuracy": 0.5824877619743347, + "num_tokens": 7640236784.0, + "step": 14946 + }, + { + "epoch": 4.041914548404543, + "grad_norm": 0.950204074382782, + "learning_rate": 3.679208570841514e-06, + "loss": 1.8164, + "mean_token_accuracy": 0.5740588903427124, + "num_tokens": 7640737804.0, + "step": 14947 + }, + { + "epoch": 4.0421849648458625, + "grad_norm": 0.9000364542007446, + "learning_rate": 3.678291678756478e-06, + "loss": 1.8257, + "mean_token_accuracy": 0.5880923271179199, + "num_tokens": 7641226424.0, + "step": 14948 + }, + { + "epoch": 4.042455381287183, + "grad_norm": 0.8972792625427246, + "learning_rate": 3.6773750113229824e-06, + "loss": 1.8513, + "mean_token_accuracy": 0.5732699632644653, + "num_tokens": 7641750638.0, + "step": 14949 + }, + { + "epoch": 4.042725797728502, + "grad_norm": 0.9304071068763733, + "learning_rate": 3.676458568569156e-06, + "loss": 1.7402, + "mean_token_accuracy": 0.5948487520217896, + "num_tokens": 7642274742.0, + "step": 14950 + }, + { + "epoch": 4.042996214169822, + "grad_norm": 1.0494238138198853, + "learning_rate": 3.6755423505231215e-06, + "loss": 1.754, + "mean_token_accuracy": 0.5857574939727783, + "num_tokens": 7642798930.0, + "step": 14951 + }, + { + "epoch": 4.043266630611141, + "grad_norm": 1.0319865942001343, + "learning_rate": 3.6746263572129857e-06, + "loss": 1.8662, + "mean_token_accuracy": 0.5591025948524475, + "num_tokens": 7643323194.0, + "step": 14952 + }, + { + "epoch": 4.043537047052461, + "grad_norm": 0.9298191070556641, + "learning_rate": 3.6737105886668554e-06, + "loss": 1.8062, + "mean_token_accuracy": 0.5852947235107422, + "num_tokens": 7643847432.0, + "step": 14953 + }, + { + "epoch": 4.04380746349378, + "grad_norm": 0.9823976755142212, + "learning_rate": 3.67279504491283e-06, + "loss": 1.8319, + "mean_token_accuracy": 0.5878437757492065, + "num_tokens": 7644371711.0, + "step": 14954 + }, + { + "epoch": 4.0440778799351005, + "grad_norm": 1.2452377080917358, + "learning_rate": 3.671879725979002e-06, + "loss": 1.8703, + "mean_token_accuracy": 0.5666825175285339, + "num_tokens": 7644895941.0, + "step": 14955 + }, + { + "epoch": 4.04434829637642, + "grad_norm": 1.0334148406982422, + "learning_rate": 3.670964631893451e-06, + "loss": 1.8312, + "mean_token_accuracy": 0.5655443668365479, + "num_tokens": 7645420140.0, + "step": 14956 + }, + { + "epoch": 4.044618712817739, + "grad_norm": 0.920178234577179, + "learning_rate": 3.6700497626842623e-06, + "loss": 1.7164, + "mean_token_accuracy": 0.5838356614112854, + "num_tokens": 7645892986.0, + "step": 14957 + }, + { + "epoch": 4.044889129259059, + "grad_norm": 1.2009952068328857, + "learning_rate": 3.669135118379501e-06, + "loss": 1.9711, + "mean_token_accuracy": 0.5507715940475464, + "num_tokens": 7646417129.0, + "step": 14958 + }, + { + "epoch": 4.045159545700378, + "grad_norm": 1.0724232196807861, + "learning_rate": 3.6682206990072312e-06, + "loss": 1.8562, + "mean_token_accuracy": 0.5763514637947083, + "num_tokens": 7646941300.0, + "step": 14959 + }, + { + "epoch": 4.045429962141698, + "grad_norm": 1.125203013420105, + "learning_rate": 3.6673065045955153e-06, + "loss": 1.8722, + "mean_token_accuracy": 0.5779516696929932, + "num_tokens": 7647465575.0, + "step": 14960 + }, + { + "epoch": 4.0457003785830175, + "grad_norm": 0.3489367961883545, + "learning_rate": 3.666392535172396e-06, + "loss": 1.0486, + "mean_token_accuracy": 0.721214234828949, + "num_tokens": 7647989786.0, + "step": 14961 + }, + { + "epoch": 4.0459707950243375, + "grad_norm": 1.279786229133606, + "learning_rate": 3.665478790765923e-06, + "loss": 1.8042, + "mean_token_accuracy": 0.5731068849563599, + "num_tokens": 7648490190.0, + "step": 14962 + }, + { + "epoch": 4.046241211465657, + "grad_norm": 1.2733932733535767, + "learning_rate": 3.664565271404128e-06, + "loss": 1.9025, + "mean_token_accuracy": 0.5711156129837036, + "num_tokens": 7649014367.0, + "step": 14963 + }, + { + "epoch": 4.046511627906977, + "grad_norm": 0.9583011269569397, + "learning_rate": 3.6636519771150456e-06, + "loss": 1.8619, + "mean_token_accuracy": 0.5748543739318848, + "num_tokens": 7649538449.0, + "step": 14964 + }, + { + "epoch": 4.046782044348296, + "grad_norm": 0.9507167935371399, + "learning_rate": 3.6627389079266925e-06, + "loss": 1.7796, + "mean_token_accuracy": 0.5942298173904419, + "num_tokens": 7649964186.0, + "step": 14965 + }, + { + "epoch": 4.047052460789616, + "grad_norm": 0.9287237524986267, + "learning_rate": 3.66182606386709e-06, + "loss": 1.8134, + "mean_token_accuracy": 0.5863823890686035, + "num_tokens": 7650488337.0, + "step": 14966 + }, + { + "epoch": 4.047322877230935, + "grad_norm": 0.9595422744750977, + "learning_rate": 3.6609134449642416e-06, + "loss": 1.7905, + "mean_token_accuracy": 0.6011461019515991, + "num_tokens": 7650913497.0, + "step": 14967 + }, + { + "epoch": 4.047593293672255, + "grad_norm": 1.0988103151321411, + "learning_rate": 3.6600010512461526e-06, + "loss": 1.8039, + "mean_token_accuracy": 0.5723820924758911, + "num_tokens": 7651437530.0, + "step": 14968 + }, + { + "epoch": 4.047863710113575, + "grad_norm": 0.951728105545044, + "learning_rate": 3.659088882740818e-06, + "loss": 1.8137, + "mean_token_accuracy": 0.5928729176521301, + "num_tokens": 7651961786.0, + "step": 14969 + }, + { + "epoch": 4.048134126554895, + "grad_norm": 0.9685467481613159, + "learning_rate": 3.6581769394762217e-06, + "loss": 1.8678, + "mean_token_accuracy": 0.5575748085975647, + "num_tokens": 7652485990.0, + "step": 14970 + }, + { + "epoch": 4.048404542996214, + "grad_norm": 0.8496072292327881, + "learning_rate": 3.6572652214803508e-06, + "loss": 1.7143, + "mean_token_accuracy": 0.59010910987854, + "num_tokens": 7653010215.0, + "step": 14971 + }, + { + "epoch": 4.048674959437534, + "grad_norm": 0.9607966542243958, + "learning_rate": 3.656353728781176e-06, + "loss": 1.7843, + "mean_token_accuracy": 0.5770403146743774, + "num_tokens": 7653529954.0, + "step": 14972 + }, + { + "epoch": 4.048945375878853, + "grad_norm": 1.0133373737335205, + "learning_rate": 3.655442461406663e-06, + "loss": 1.9229, + "mean_token_accuracy": 0.5801477432250977, + "num_tokens": 7653991610.0, + "step": 14973 + }, + { + "epoch": 4.049215792320173, + "grad_norm": 0.9197741746902466, + "learning_rate": 3.6545314193847758e-06, + "loss": 1.9407, + "mean_token_accuracy": 0.5539462566375732, + "num_tokens": 7654483939.0, + "step": 14974 + }, + { + "epoch": 4.0494862087614925, + "grad_norm": 0.866773247718811, + "learning_rate": 3.653620602743467e-06, + "loss": 1.9299, + "mean_token_accuracy": 0.565934419631958, + "num_tokens": 7654970031.0, + "step": 14975 + }, + { + "epoch": 4.0497566252028125, + "grad_norm": 1.0632596015930176, + "learning_rate": 3.6527100115106806e-06, + "loss": 1.7193, + "mean_token_accuracy": 0.5914297699928284, + "num_tokens": 7655467107.0, + "step": 14976 + }, + { + "epoch": 4.050027041644132, + "grad_norm": 1.048117756843567, + "learning_rate": 3.6517996457143577e-06, + "loss": 1.7365, + "mean_token_accuracy": 0.594388484954834, + "num_tokens": 7655991215.0, + "step": 14977 + }, + { + "epoch": 4.050297458085452, + "grad_norm": 0.978614866733551, + "learning_rate": 3.6508895053824324e-06, + "loss": 1.855, + "mean_token_accuracy": 0.5742182731628418, + "num_tokens": 7656515246.0, + "step": 14978 + }, + { + "epoch": 4.050567874526771, + "grad_norm": 1.0775071382522583, + "learning_rate": 3.6499795905428283e-06, + "loss": 1.8744, + "mean_token_accuracy": 0.5642759799957275, + "num_tokens": 7656986961.0, + "step": 14979 + }, + { + "epoch": 4.050838290968091, + "grad_norm": 0.8743506669998169, + "learning_rate": 3.6490699012234675e-06, + "loss": 1.7335, + "mean_token_accuracy": 0.5973930358886719, + "num_tokens": 7657511180.0, + "step": 14980 + }, + { + "epoch": 4.05110870740941, + "grad_norm": 0.37253278493881226, + "learning_rate": 3.64816043745226e-06, + "loss": 1.0573, + "mean_token_accuracy": 0.7075372338294983, + "num_tokens": 7658035406.0, + "step": 14981 + }, + { + "epoch": 4.05137912385073, + "grad_norm": 1.1164010763168335, + "learning_rate": 3.647251199257108e-06, + "loss": 1.878, + "mean_token_accuracy": 0.5707951784133911, + "num_tokens": 7658559670.0, + "step": 14982 + }, + { + "epoch": 4.05164954029205, + "grad_norm": 0.9818433523178101, + "learning_rate": 3.646342186665915e-06, + "loss": 1.6924, + "mean_token_accuracy": 0.6070165634155273, + "num_tokens": 7659083917.0, + "step": 14983 + }, + { + "epoch": 4.05191995673337, + "grad_norm": 0.9847273826599121, + "learning_rate": 3.645433399706569e-06, + "loss": 1.8617, + "mean_token_accuracy": 0.5600032806396484, + "num_tokens": 7659608043.0, + "step": 14984 + }, + { + "epoch": 4.052190373174689, + "grad_norm": 1.010146141052246, + "learning_rate": 3.644524838406952e-06, + "loss": 1.8263, + "mean_token_accuracy": 0.5888837575912476, + "num_tokens": 7660132208.0, + "step": 14985 + }, + { + "epoch": 4.052460789616009, + "grad_norm": 0.8282341957092285, + "learning_rate": 3.6436165027949473e-06, + "loss": 1.695, + "mean_token_accuracy": 0.6131424307823181, + "num_tokens": 7660656391.0, + "step": 14986 + }, + { + "epoch": 4.052731206057328, + "grad_norm": 0.8892847895622253, + "learning_rate": 3.642708392898421e-06, + "loss": 1.8923, + "mean_token_accuracy": 0.5804203748703003, + "num_tokens": 7661140922.0, + "step": 14987 + }, + { + "epoch": 4.053001622498648, + "grad_norm": 0.8889349699020386, + "learning_rate": 3.641800508745236e-06, + "loss": 1.8206, + "mean_token_accuracy": 0.5704748034477234, + "num_tokens": 7661665002.0, + "step": 14988 + }, + { + "epoch": 4.0532720389399675, + "grad_norm": 0.9118894934654236, + "learning_rate": 3.6408928503632525e-06, + "loss": 1.6715, + "mean_token_accuracy": 0.6079671382904053, + "num_tokens": 7662189238.0, + "step": 14989 + }, + { + "epoch": 4.0535424553812875, + "grad_norm": 1.0228188037872314, + "learning_rate": 3.639985417780315e-06, + "loss": 1.8037, + "mean_token_accuracy": 0.584895670413971, + "num_tokens": 7662713454.0, + "step": 14990 + }, + { + "epoch": 4.053812871822607, + "grad_norm": 0.7486757040023804, + "learning_rate": 3.639078211024273e-06, + "loss": 1.7848, + "mean_token_accuracy": 0.5721510648727417, + "num_tokens": 7663237648.0, + "step": 14991 + }, + { + "epoch": 4.054083288263927, + "grad_norm": 0.879987359046936, + "learning_rate": 3.6381712301229555e-06, + "loss": 1.7306, + "mean_token_accuracy": 0.5933722257614136, + "num_tokens": 7663749115.0, + "step": 14992 + }, + { + "epoch": 4.054353704705246, + "grad_norm": 0.9914458990097046, + "learning_rate": 3.6372644751041942e-06, + "loss": 1.8945, + "mean_token_accuracy": 0.5747633576393127, + "num_tokens": 7664273335.0, + "step": 14993 + }, + { + "epoch": 4.054624121146566, + "grad_norm": 0.8523914813995361, + "learning_rate": 3.6363579459958133e-06, + "loss": 1.8738, + "mean_token_accuracy": 0.5659875869750977, + "num_tokens": 7664797607.0, + "step": 14994 + }, + { + "epoch": 4.054894537587885, + "grad_norm": 0.9656583070755005, + "learning_rate": 3.635451642825627e-06, + "loss": 1.7054, + "mean_token_accuracy": 0.6113554835319519, + "num_tokens": 7665321823.0, + "step": 14995 + }, + { + "epoch": 4.055164954029205, + "grad_norm": 0.9012396335601807, + "learning_rate": 3.6345455656214377e-06, + "loss": 1.9055, + "mean_token_accuracy": 0.5635156631469727, + "num_tokens": 7665846102.0, + "step": 14996 + }, + { + "epoch": 4.055435370470525, + "grad_norm": 0.8097804188728333, + "learning_rate": 3.6336397144110546e-06, + "loss": 1.7707, + "mean_token_accuracy": 0.5835745334625244, + "num_tokens": 7666370362.0, + "step": 14997 + }, + { + "epoch": 4.055705786911844, + "grad_norm": 0.8716522455215454, + "learning_rate": 3.6327340892222685e-06, + "loss": 1.7816, + "mean_token_accuracy": 0.5765244364738464, + "num_tokens": 7666894548.0, + "step": 14998 + }, + { + "epoch": 4.055976203353164, + "grad_norm": 0.8722168207168579, + "learning_rate": 3.6318286900828625e-06, + "loss": 1.8322, + "mean_token_accuracy": 0.5843409299850464, + "num_tokens": 7667418754.0, + "step": 14999 + }, + { + "epoch": 4.056246619794483, + "grad_norm": 0.7873956561088562, + "learning_rate": 3.6309235170206243e-06, + "loss": 1.8731, + "mean_token_accuracy": 0.5630083680152893, + "num_tokens": 7667942853.0, + "step": 15000 + }, + { + "epoch": 4.056517036235803, + "grad_norm": 0.3647495210170746, + "learning_rate": 3.6300185700633227e-06, + "loss": 1.1604, + "mean_token_accuracy": 0.6770362257957458, + "num_tokens": 7668467132.0, + "step": 15001 + }, + { + "epoch": 4.056787452677122, + "grad_norm": 1.043007254600525, + "learning_rate": 3.6291138492387234e-06, + "loss": 1.8434, + "mean_token_accuracy": 0.5702135562896729, + "num_tokens": 7668991317.0, + "step": 15002 + }, + { + "epoch": 4.0570578691184425, + "grad_norm": 0.9807336926460266, + "learning_rate": 3.6282093545745887e-06, + "loss": 1.7138, + "mean_token_accuracy": 0.5882481336593628, + "num_tokens": 7669492686.0, + "step": 15003 + }, + { + "epoch": 4.057328285559762, + "grad_norm": 1.003253698348999, + "learning_rate": 3.627305086098669e-06, + "loss": 1.9386, + "mean_token_accuracy": 0.5599518418312073, + "num_tokens": 7670012917.0, + "step": 15004 + }, + { + "epoch": 4.057598702001082, + "grad_norm": 0.9559377431869507, + "learning_rate": 3.626401043838713e-06, + "loss": 1.7783, + "mean_token_accuracy": 0.5724716186523438, + "num_tokens": 7670536946.0, + "step": 15005 + }, + { + "epoch": 4.057869118442401, + "grad_norm": 1.0241776704788208, + "learning_rate": 3.6254972278224542e-06, + "loss": 1.8585, + "mean_token_accuracy": 0.5851625204086304, + "num_tokens": 7671061042.0, + "step": 15006 + }, + { + "epoch": 4.058139534883721, + "grad_norm": 1.1250132322311401, + "learning_rate": 3.6245936380776314e-06, + "loss": 1.7418, + "mean_token_accuracy": 0.6055902242660522, + "num_tokens": 7671517137.0, + "step": 15007 + }, + { + "epoch": 4.05840995132504, + "grad_norm": 0.9451587200164795, + "learning_rate": 3.6236902746319613e-06, + "loss": 1.8424, + "mean_token_accuracy": 0.5722498893737793, + "num_tokens": 7672041308.0, + "step": 15008 + }, + { + "epoch": 4.05868036776636, + "grad_norm": 0.8683164119720459, + "learning_rate": 3.6227871375131695e-06, + "loss": 1.748, + "mean_token_accuracy": 0.5948268175125122, + "num_tokens": 7672490628.0, + "step": 15009 + }, + { + "epoch": 4.0589507842076795, + "grad_norm": 0.8853003978729248, + "learning_rate": 3.621884226748963e-06, + "loss": 1.7454, + "mean_token_accuracy": 0.5939936637878418, + "num_tokens": 7672989909.0, + "step": 15010 + }, + { + "epoch": 4.059221200649, + "grad_norm": 1.1990704536437988, + "learning_rate": 3.6209815423670434e-06, + "loss": 1.9235, + "mean_token_accuracy": 0.5593540072441101, + "num_tokens": 7673514153.0, + "step": 15011 + }, + { + "epoch": 4.059491617090319, + "grad_norm": 1.0015841722488403, + "learning_rate": 3.6200790843951127e-06, + "loss": 1.8076, + "mean_token_accuracy": 0.5795586109161377, + "num_tokens": 7674038438.0, + "step": 15012 + }, + { + "epoch": 4.059762033531639, + "grad_norm": 1.0174229145050049, + "learning_rate": 3.619176852860856e-06, + "loss": 1.8518, + "mean_token_accuracy": 0.5658565759658813, + "num_tokens": 7674562582.0, + "step": 15013 + }, + { + "epoch": 4.060032449972958, + "grad_norm": 1.0772321224212646, + "learning_rate": 3.6182748477919627e-06, + "loss": 1.8043, + "mean_token_accuracy": 0.5932598114013672, + "num_tokens": 7675086731.0, + "step": 15014 + }, + { + "epoch": 4.060302866414278, + "grad_norm": 1.1148186922073364, + "learning_rate": 3.6173730692161034e-06, + "loss": 2.006, + "mean_token_accuracy": 0.5312246084213257, + "num_tokens": 7675610978.0, + "step": 15015 + }, + { + "epoch": 4.060573282855597, + "grad_norm": 0.9577503800392151, + "learning_rate": 3.616471517160949e-06, + "loss": 1.7696, + "mean_token_accuracy": 0.5908043384552002, + "num_tokens": 7676135260.0, + "step": 15016 + }, + { + "epoch": 4.0608436992969175, + "grad_norm": 1.0484920740127563, + "learning_rate": 3.6155701916541624e-06, + "loss": 1.7972, + "mean_token_accuracy": 0.5914353132247925, + "num_tokens": 7676555588.0, + "step": 15017 + }, + { + "epoch": 4.061114115738237, + "grad_norm": 0.8284161686897278, + "learning_rate": 3.6146690927234e-06, + "loss": 1.7, + "mean_token_accuracy": 0.605069637298584, + "num_tokens": 7677059930.0, + "step": 15018 + }, + { + "epoch": 4.061384532179557, + "grad_norm": 0.8696975111961365, + "learning_rate": 3.6137682203963055e-06, + "loss": 1.7086, + "mean_token_accuracy": 0.588287889957428, + "num_tokens": 7677584061.0, + "step": 15019 + }, + { + "epoch": 4.061654948620876, + "grad_norm": 0.8914931416511536, + "learning_rate": 3.612867574700524e-06, + "loss": 1.8617, + "mean_token_accuracy": 0.5792179107666016, + "num_tokens": 7678095622.0, + "step": 15020 + }, + { + "epoch": 4.061925365062196, + "grad_norm": 0.34959059953689575, + "learning_rate": 3.6119671556636904e-06, + "loss": 1.155, + "mean_token_accuracy": 0.6959884166717529, + "num_tokens": 7678619811.0, + "step": 15021 + }, + { + "epoch": 4.062195781503515, + "grad_norm": 0.9008282423019409, + "learning_rate": 3.611066963313431e-06, + "loss": 1.8602, + "mean_token_accuracy": 0.5793832540512085, + "num_tokens": 7679144089.0, + "step": 15022 + }, + { + "epoch": 4.062466197944835, + "grad_norm": 1.1774473190307617, + "learning_rate": 3.610166997677368e-06, + "loss": 1.7795, + "mean_token_accuracy": 0.5627721548080444, + "num_tokens": 7679668171.0, + "step": 15023 + }, + { + "epoch": 4.0627366143861545, + "grad_norm": 0.9976481795310974, + "learning_rate": 3.6092672587831136e-06, + "loss": 1.8001, + "mean_token_accuracy": 0.5865589380264282, + "num_tokens": 7680192416.0, + "step": 15024 + }, + { + "epoch": 4.063007030827475, + "grad_norm": 0.8974927663803101, + "learning_rate": 3.608367746658271e-06, + "loss": 1.7908, + "mean_token_accuracy": 0.6188712120056152, + "num_tokens": 7680578378.0, + "step": 15025 + }, + { + "epoch": 4.063277447268794, + "grad_norm": 1.0118253231048584, + "learning_rate": 3.6074684613304466e-06, + "loss": 1.8556, + "mean_token_accuracy": 0.5696899890899658, + "num_tokens": 7681102497.0, + "step": 15026 + }, + { + "epoch": 4.063547863710114, + "grad_norm": 0.913702130317688, + "learning_rate": 3.6065694028272303e-06, + "loss": 1.7169, + "mean_token_accuracy": 0.604148268699646, + "num_tokens": 7681590736.0, + "step": 15027 + }, + { + "epoch": 4.063818280151433, + "grad_norm": 0.9348350763320923, + "learning_rate": 3.6056705711762054e-06, + "loss": 1.8335, + "mean_token_accuracy": 0.5737074017524719, + "num_tokens": 7682114967.0, + "step": 15028 + }, + { + "epoch": 4.064088696592753, + "grad_norm": 0.8097690939903259, + "learning_rate": 3.604771966404954e-06, + "loss": 1.8132, + "mean_token_accuracy": 0.5865858197212219, + "num_tokens": 7682639195.0, + "step": 15029 + }, + { + "epoch": 4.064359113034072, + "grad_norm": 0.8469559550285339, + "learning_rate": 3.6038735885410464e-06, + "loss": 1.8459, + "mean_token_accuracy": 0.5594179630279541, + "num_tokens": 7683163364.0, + "step": 15030 + }, + { + "epoch": 4.0646295294753925, + "grad_norm": 0.9228309988975525, + "learning_rate": 3.6029754376120463e-06, + "loss": 1.8148, + "mean_token_accuracy": 0.573700487613678, + "num_tokens": 7683687494.0, + "step": 15031 + }, + { + "epoch": 4.064899945916712, + "grad_norm": 1.4039182662963867, + "learning_rate": 3.6020775136455156e-06, + "loss": 1.7948, + "mean_token_accuracy": 0.5947847366333008, + "num_tokens": 7684210976.0, + "step": 15032 + }, + { + "epoch": 4.065170362358032, + "grad_norm": 0.9597499966621399, + "learning_rate": 3.6011798166689993e-06, + "loss": 1.6834, + "mean_token_accuracy": 0.6180630922317505, + "num_tokens": 7684735201.0, + "step": 15033 + }, + { + "epoch": 4.065440778799351, + "grad_norm": 0.8576599359512329, + "learning_rate": 3.6002823467100446e-06, + "loss": 1.8695, + "mean_token_accuracy": 0.5699807405471802, + "num_tokens": 7685259305.0, + "step": 15034 + }, + { + "epoch": 4.065711195240671, + "grad_norm": 1.1558175086975098, + "learning_rate": 3.599385103796191e-06, + "loss": 1.8842, + "mean_token_accuracy": 0.5657044053077698, + "num_tokens": 7685783575.0, + "step": 15035 + }, + { + "epoch": 4.06598161168199, + "grad_norm": 0.9467992782592773, + "learning_rate": 3.5984880879549645e-06, + "loss": 1.9646, + "mean_token_accuracy": 0.5637539625167847, + "num_tokens": 7686270814.0, + "step": 15036 + }, + { + "epoch": 4.06625202812331, + "grad_norm": 0.8597660660743713, + "learning_rate": 3.5975912992138927e-06, + "loss": 1.7486, + "mean_token_accuracy": 0.6097654104232788, + "num_tokens": 7686731394.0, + "step": 15037 + }, + { + "epoch": 4.0665224445646295, + "grad_norm": 0.8758093118667603, + "learning_rate": 3.596694737600488e-06, + "loss": 1.79, + "mean_token_accuracy": 0.580654501914978, + "num_tokens": 7687255666.0, + "step": 15038 + }, + { + "epoch": 4.066792861005949, + "grad_norm": 0.8288407921791077, + "learning_rate": 3.595798403142258e-06, + "loss": 1.769, + "mean_token_accuracy": 0.5915738344192505, + "num_tokens": 7687779879.0, + "step": 15039 + }, + { + "epoch": 4.067063277447269, + "grad_norm": 0.8713370561599731, + "learning_rate": 3.59490229586671e-06, + "loss": 1.777, + "mean_token_accuracy": 0.5711934566497803, + "num_tokens": 7688299930.0, + "step": 15040 + }, + { + "epoch": 4.067333693888588, + "grad_norm": 0.3425079584121704, + "learning_rate": 3.5940064158013367e-06, + "loss": 1.0472, + "mean_token_accuracy": 0.7153157591819763, + "num_tokens": 7688772946.0, + "step": 15041 + }, + { + "epoch": 4.067604110329908, + "grad_norm": 0.9198288321495056, + "learning_rate": 3.593110762973624e-06, + "loss": 1.6629, + "mean_token_accuracy": 0.6185308694839478, + "num_tokens": 7689297187.0, + "step": 15042 + }, + { + "epoch": 4.067874526771227, + "grad_norm": 0.8299229145050049, + "learning_rate": 3.5922153374110568e-06, + "loss": 1.8069, + "mean_token_accuracy": 0.5882049202919006, + "num_tokens": 7689821355.0, + "step": 15043 + }, + { + "epoch": 4.068144943212547, + "grad_norm": 0.9389016628265381, + "learning_rate": 3.5913201391411086e-06, + "loss": 1.9322, + "mean_token_accuracy": 0.5557135343551636, + "num_tokens": 7690345538.0, + "step": 15044 + }, + { + "epoch": 4.068415359653867, + "grad_norm": 0.8140068650245667, + "learning_rate": 3.5904251681912437e-06, + "loss": 1.9168, + "mean_token_accuracy": 0.5690155625343323, + "num_tokens": 7690869693.0, + "step": 15045 + }, + { + "epoch": 4.068685776095187, + "grad_norm": 0.8791561126708984, + "learning_rate": 3.589530424588927e-06, + "loss": 1.7594, + "mean_token_accuracy": 0.5944125652313232, + "num_tokens": 7691393943.0, + "step": 15046 + }, + { + "epoch": 4.068956192536506, + "grad_norm": 0.8507316708564758, + "learning_rate": 3.5886359083616073e-06, + "loss": 1.8073, + "mean_token_accuracy": 0.5860979557037354, + "num_tokens": 7691910143.0, + "step": 15047 + }, + { + "epoch": 4.069226608977826, + "grad_norm": 0.9646099805831909, + "learning_rate": 3.5877416195367367e-06, + "loss": 1.7808, + "mean_token_accuracy": 0.5916042327880859, + "num_tokens": 7692434308.0, + "step": 15048 + }, + { + "epoch": 4.069497025419145, + "grad_norm": 0.8817590475082397, + "learning_rate": 3.586847558141747e-06, + "loss": 1.8098, + "mean_token_accuracy": 0.5965583324432373, + "num_tokens": 7692958565.0, + "step": 15049 + }, + { + "epoch": 4.069767441860465, + "grad_norm": 0.9359566569328308, + "learning_rate": 3.5859537242040777e-06, + "loss": 1.8657, + "mean_token_accuracy": 0.5722412467002869, + "num_tokens": 7693482787.0, + "step": 15050 + }, + { + "epoch": 4.070037858301784, + "grad_norm": 0.945206344127655, + "learning_rate": 3.5850601177511495e-06, + "loss": 1.9182, + "mean_token_accuracy": 0.5573128461837769, + "num_tokens": 7694007042.0, + "step": 15051 + }, + { + "epoch": 4.0703082747431045, + "grad_norm": 0.884492814540863, + "learning_rate": 3.584166738810386e-06, + "loss": 1.8465, + "mean_token_accuracy": 0.5885541439056396, + "num_tokens": 7694531126.0, + "step": 15052 + }, + { + "epoch": 4.070578691184424, + "grad_norm": 0.9630447030067444, + "learning_rate": 3.583273587409194e-06, + "loss": 1.7832, + "mean_token_accuracy": 0.5962187051773071, + "num_tokens": 7695055304.0, + "step": 15053 + }, + { + "epoch": 4.070849107625744, + "grad_norm": 0.8027512431144714, + "learning_rate": 3.582380663574978e-06, + "loss": 1.726, + "mean_token_accuracy": 0.5923619270324707, + "num_tokens": 7695579535.0, + "step": 15054 + }, + { + "epoch": 4.071119524067063, + "grad_norm": 1.1026856899261475, + "learning_rate": 3.5814879673351388e-06, + "loss": 1.9252, + "mean_token_accuracy": 0.5670551061630249, + "num_tokens": 7696068347.0, + "step": 15055 + }, + { + "epoch": 4.071389940508383, + "grad_norm": 1.1813362836837769, + "learning_rate": 3.5805954987170664e-06, + "loss": 1.8895, + "mean_token_accuracy": 0.5594921708106995, + "num_tokens": 7696592481.0, + "step": 15056 + }, + { + "epoch": 4.071660356949702, + "grad_norm": 0.9367049336433411, + "learning_rate": 3.579703257748139e-06, + "loss": 1.8391, + "mean_token_accuracy": 0.5902445316314697, + "num_tokens": 7697086731.0, + "step": 15057 + }, + { + "epoch": 4.071930773391022, + "grad_norm": 0.8118127584457397, + "learning_rate": 3.578811244455741e-06, + "loss": 1.7557, + "mean_token_accuracy": 0.5913118720054626, + "num_tokens": 7697610909.0, + "step": 15058 + }, + { + "epoch": 4.072201189832342, + "grad_norm": 0.8138901591300964, + "learning_rate": 3.577919458867235e-06, + "loss": 1.8131, + "mean_token_accuracy": 0.5737861394882202, + "num_tokens": 7698135088.0, + "step": 15059 + }, + { + "epoch": 4.072471606273662, + "grad_norm": 0.8171725869178772, + "learning_rate": 3.5770279010099904e-06, + "loss": 1.8081, + "mean_token_accuracy": 0.5806660652160645, + "num_tokens": 7698659280.0, + "step": 15060 + }, + { + "epoch": 4.072742022714981, + "grad_norm": 0.3327016830444336, + "learning_rate": 3.576136570911358e-06, + "loss": 1.0616, + "mean_token_accuracy": 0.7193617820739746, + "num_tokens": 7699123535.0, + "step": 15061 + }, + { + "epoch": 4.073012439156301, + "grad_norm": 0.8599199056625366, + "learning_rate": 3.575245468598687e-06, + "loss": 1.9653, + "mean_token_accuracy": 0.5608965158462524, + "num_tokens": 7699647785.0, + "step": 15062 + }, + { + "epoch": 4.07328285559762, + "grad_norm": 0.9651824235916138, + "learning_rate": 3.5743545940993196e-06, + "loss": 1.862, + "mean_token_accuracy": 0.5749580264091492, + "num_tokens": 7700171976.0, + "step": 15063 + }, + { + "epoch": 4.07355327203894, + "grad_norm": 0.8449071645736694, + "learning_rate": 3.5734639474405918e-06, + "loss": 1.7637, + "mean_token_accuracy": 0.5894004106521606, + "num_tokens": 7700696062.0, + "step": 15064 + }, + { + "epoch": 4.073823688480259, + "grad_norm": 0.9835463166236877, + "learning_rate": 3.5725735286498284e-06, + "loss": 1.7837, + "mean_token_accuracy": 0.5931787490844727, + "num_tokens": 7701161185.0, + "step": 15065 + }, + { + "epoch": 4.0740941049215795, + "grad_norm": 0.8461405634880066, + "learning_rate": 3.5716833377543536e-06, + "loss": 1.8343, + "mean_token_accuracy": 0.5559183359146118, + "num_tokens": 7701685441.0, + "step": 15066 + }, + { + "epoch": 4.074364521362899, + "grad_norm": 0.9258796572685242, + "learning_rate": 3.5707933747814803e-06, + "loss": 1.8162, + "mean_token_accuracy": 0.5859159231185913, + "num_tokens": 7702153287.0, + "step": 15067 + }, + { + "epoch": 4.074634937804219, + "grad_norm": 0.8489516973495483, + "learning_rate": 3.569903639758511e-06, + "loss": 1.9299, + "mean_token_accuracy": 0.5650784969329834, + "num_tokens": 7702677450.0, + "step": 15068 + }, + { + "epoch": 4.074905354245538, + "grad_norm": 0.8534603714942932, + "learning_rate": 3.5690141327127516e-06, + "loss": 1.7649, + "mean_token_accuracy": 0.5924384593963623, + "num_tokens": 7703160074.0, + "step": 15069 + }, + { + "epoch": 4.075175770686858, + "grad_norm": 0.8911504149436951, + "learning_rate": 3.5681248536714918e-06, + "loss": 1.8144, + "mean_token_accuracy": 0.5927107334136963, + "num_tokens": 7703684143.0, + "step": 15070 + }, + { + "epoch": 4.075446187128177, + "grad_norm": 1.0036239624023438, + "learning_rate": 3.5672358026620134e-06, + "loss": 1.9203, + "mean_token_accuracy": 0.5594546794891357, + "num_tokens": 7704186496.0, + "step": 15071 + }, + { + "epoch": 4.075716603569497, + "grad_norm": 0.8997259736061096, + "learning_rate": 3.5663469797116024e-06, + "loss": 1.7925, + "mean_token_accuracy": 0.5910021066665649, + "num_tokens": 7704710702.0, + "step": 15072 + }, + { + "epoch": 4.075987020010817, + "grad_norm": 0.7648183703422546, + "learning_rate": 3.5654583848475266e-06, + "loss": 1.7809, + "mean_token_accuracy": 0.5931304693222046, + "num_tokens": 7705234925.0, + "step": 15073 + }, + { + "epoch": 4.076257436452137, + "grad_norm": 0.9116880297660828, + "learning_rate": 3.5645700180970492e-06, + "loss": 1.8914, + "mean_token_accuracy": 0.5669729709625244, + "num_tokens": 7705758998.0, + "step": 15074 + }, + { + "epoch": 4.076527852893456, + "grad_norm": 0.8598408102989197, + "learning_rate": 3.5636818794874318e-06, + "loss": 1.8211, + "mean_token_accuracy": 0.6160711050033569, + "num_tokens": 7706195696.0, + "step": 15075 + }, + { + "epoch": 4.076798269334776, + "grad_norm": 0.9686496257781982, + "learning_rate": 3.5627939690459207e-06, + "loss": 1.8484, + "mean_token_accuracy": 0.579829216003418, + "num_tokens": 7706719868.0, + "step": 15076 + }, + { + "epoch": 4.077068685776095, + "grad_norm": 0.9721331596374512, + "learning_rate": 3.561906286799763e-06, + "loss": 1.7993, + "mean_token_accuracy": 0.576741099357605, + "num_tokens": 7707224073.0, + "step": 15077 + }, + { + "epoch": 4.077339102217415, + "grad_norm": 0.7881508469581604, + "learning_rate": 3.561018832776195e-06, + "loss": 1.8022, + "mean_token_accuracy": 0.5918204188346863, + "num_tokens": 7707725257.0, + "step": 15078 + }, + { + "epoch": 4.0776095186587344, + "grad_norm": 0.8768541812896729, + "learning_rate": 3.560131607002447e-06, + "loss": 1.7911, + "mean_token_accuracy": 0.5790886878967285, + "num_tokens": 7708249447.0, + "step": 15079 + }, + { + "epoch": 4.077879935100054, + "grad_norm": 0.8634119629859924, + "learning_rate": 3.559244609505739e-06, + "loss": 1.7126, + "mean_token_accuracy": 0.588691234588623, + "num_tokens": 7708773688.0, + "step": 15080 + }, + { + "epoch": 4.078150351541374, + "grad_norm": 0.34585294127464294, + "learning_rate": 3.558357840313289e-06, + "loss": 1.1245, + "mean_token_accuracy": 0.6982775330543518, + "num_tokens": 7709297877.0, + "step": 15081 + }, + { + "epoch": 4.078420767982693, + "grad_norm": 1.0707666873931885, + "learning_rate": 3.557471299452303e-06, + "loss": 1.8656, + "mean_token_accuracy": 0.5729833245277405, + "num_tokens": 7709822115.0, + "step": 15082 + }, + { + "epoch": 4.078691184424013, + "grad_norm": 0.9128759503364563, + "learning_rate": 3.5565849869499868e-06, + "loss": 1.8899, + "mean_token_accuracy": 0.5627016425132751, + "num_tokens": 7710346129.0, + "step": 15083 + }, + { + "epoch": 4.078961600865332, + "grad_norm": 0.8374271392822266, + "learning_rate": 3.5556989028335335e-06, + "loss": 1.8969, + "mean_token_accuracy": 0.5530269742012024, + "num_tokens": 7710870315.0, + "step": 15084 + }, + { + "epoch": 4.079232017306652, + "grad_norm": 1.1890811920166016, + "learning_rate": 3.554813047130127e-06, + "loss": 1.9003, + "mean_token_accuracy": 0.5606886148452759, + "num_tokens": 7711394496.0, + "step": 15085 + }, + { + "epoch": 4.0795024337479715, + "grad_norm": 0.9079720377922058, + "learning_rate": 3.553927419866954e-06, + "loss": 1.7731, + "mean_token_accuracy": 0.5681900978088379, + "num_tokens": 7711918674.0, + "step": 15086 + }, + { + "epoch": 4.079772850189292, + "grad_norm": 1.0121114253997803, + "learning_rate": 3.553042021071185e-06, + "loss": 1.8008, + "mean_token_accuracy": 0.56559818983078, + "num_tokens": 7712442929.0, + "step": 15087 + }, + { + "epoch": 4.080043266630611, + "grad_norm": 1.0201791524887085, + "learning_rate": 3.5521568507699843e-06, + "loss": 1.9644, + "mean_token_accuracy": 0.5617083311080933, + "num_tokens": 7712944065.0, + "step": 15088 + }, + { + "epoch": 4.080313683071931, + "grad_norm": 0.906738817691803, + "learning_rate": 3.551271908990517e-06, + "loss": 1.8119, + "mean_token_accuracy": 0.5812536478042603, + "num_tokens": 7713468098.0, + "step": 15089 + }, + { + "epoch": 4.08058409951325, + "grad_norm": 0.9749529361724854, + "learning_rate": 3.55038719575993e-06, + "loss": 1.8357, + "mean_token_accuracy": 0.5747681856155396, + "num_tokens": 7713972896.0, + "step": 15090 + }, + { + "epoch": 4.08085451595457, + "grad_norm": 0.9330214858055115, + "learning_rate": 3.549502711105375e-06, + "loss": 1.8477, + "mean_token_accuracy": 0.5766878128051758, + "num_tokens": 7714497111.0, + "step": 15091 + }, + { + "epoch": 4.081124932395889, + "grad_norm": 0.9878330230712891, + "learning_rate": 3.548618455053984e-06, + "loss": 1.837, + "mean_token_accuracy": 0.6159883737564087, + "num_tokens": 7714918589.0, + "step": 15092 + }, + { + "epoch": 4.0813953488372094, + "grad_norm": 0.9262992143630981, + "learning_rate": 3.547734427632894e-06, + "loss": 1.7439, + "mean_token_accuracy": 0.600353479385376, + "num_tokens": 7715336932.0, + "step": 15093 + }, + { + "epoch": 4.081665765278529, + "grad_norm": 1.0325816869735718, + "learning_rate": 3.546850628869226e-06, + "loss": 1.8619, + "mean_token_accuracy": 0.5842595100402832, + "num_tokens": 7715861182.0, + "step": 15094 + }, + { + "epoch": 4.081936181719849, + "grad_norm": 1.1234310865402222, + "learning_rate": 3.5459670587901006e-06, + "loss": 1.8444, + "mean_token_accuracy": 0.5730495452880859, + "num_tokens": 7716348241.0, + "step": 15095 + }, + { + "epoch": 4.082206598161168, + "grad_norm": 0.9415091276168823, + "learning_rate": 3.545083717422626e-06, + "loss": 1.8279, + "mean_token_accuracy": 0.5659443140029907, + "num_tokens": 7716872400.0, + "step": 15096 + }, + { + "epoch": 4.082477014602488, + "grad_norm": 0.947527289390564, + "learning_rate": 3.5442006047939057e-06, + "loss": 1.9544, + "mean_token_accuracy": 0.5596258640289307, + "num_tokens": 7717379916.0, + "step": 15097 + }, + { + "epoch": 4.082747431043807, + "grad_norm": 0.8016968369483948, + "learning_rate": 3.5433177209310386e-06, + "loss": 1.8377, + "mean_token_accuracy": 0.5627158880233765, + "num_tokens": 7717904103.0, + "step": 15098 + }, + { + "epoch": 4.083017847485127, + "grad_norm": 1.064301609992981, + "learning_rate": 3.542435065861113e-06, + "loss": 1.9463, + "mean_token_accuracy": 0.5763901472091675, + "num_tokens": 7718333551.0, + "step": 15099 + }, + { + "epoch": 4.0832882639264465, + "grad_norm": 0.8647804260253906, + "learning_rate": 3.5415526396112083e-06, + "loss": 1.8841, + "mean_token_accuracy": 0.5459555983543396, + "num_tokens": 7718857749.0, + "step": 15100 + }, + { + "epoch": 4.083558680367767, + "grad_norm": 0.34235844016075134, + "learning_rate": 3.5406704422084035e-06, + "loss": 1.0045, + "mean_token_accuracy": 0.7308639883995056, + "num_tokens": 7719343880.0, + "step": 15101 + }, + { + "epoch": 4.083829096809086, + "grad_norm": 0.9384922385215759, + "learning_rate": 3.5397884736797676e-06, + "loss": 1.7913, + "mean_token_accuracy": 0.5782474279403687, + "num_tokens": 7719868006.0, + "step": 15102 + }, + { + "epoch": 4.084099513250406, + "grad_norm": 0.8614959716796875, + "learning_rate": 3.5389067340523574e-06, + "loss": 1.7832, + "mean_token_accuracy": 0.5751165151596069, + "num_tokens": 7720392240.0, + "step": 15103 + }, + { + "epoch": 4.084369929691725, + "grad_norm": 0.9684810638427734, + "learning_rate": 3.5380252233532305e-06, + "loss": 1.8512, + "mean_token_accuracy": 0.5807225704193115, + "num_tokens": 7720916438.0, + "step": 15104 + }, + { + "epoch": 4.084640346133045, + "grad_norm": 0.9201785326004028, + "learning_rate": 3.537143941609437e-06, + "loss": 1.9329, + "mean_token_accuracy": 0.5690134167671204, + "num_tokens": 7721395904.0, + "step": 15105 + }, + { + "epoch": 4.084910762574364, + "grad_norm": 0.8617242574691772, + "learning_rate": 3.5362628888480112e-06, + "loss": 1.8442, + "mean_token_accuracy": 0.5666965246200562, + "num_tokens": 7721920185.0, + "step": 15106 + }, + { + "epoch": 4.0851811790156844, + "grad_norm": 0.825549304485321, + "learning_rate": 3.53538206509599e-06, + "loss": 1.7502, + "mean_token_accuracy": 0.5983473658561707, + "num_tokens": 7722444460.0, + "step": 15107 + }, + { + "epoch": 4.085451595457004, + "grad_norm": 0.894566535949707, + "learning_rate": 3.5345014703803985e-06, + "loss": 1.9304, + "mean_token_accuracy": 0.5518554449081421, + "num_tokens": 7722968666.0, + "step": 15108 + }, + { + "epoch": 4.085722011898324, + "grad_norm": 1.0129987001419067, + "learning_rate": 3.5336211047282597e-06, + "loss": 1.9327, + "mean_token_accuracy": 0.5713881850242615, + "num_tokens": 7723479469.0, + "step": 15109 + }, + { + "epoch": 4.085992428339643, + "grad_norm": 0.9771546125411987, + "learning_rate": 3.5327409681665805e-06, + "loss": 1.9006, + "mean_token_accuracy": 0.5784575939178467, + "num_tokens": 7724003749.0, + "step": 15110 + }, + { + "epoch": 4.086262844780963, + "grad_norm": 0.8862442374229431, + "learning_rate": 3.531861060722367e-06, + "loss": 1.8015, + "mean_token_accuracy": 0.5813186168670654, + "num_tokens": 7724527981.0, + "step": 15111 + }, + { + "epoch": 4.086533261222282, + "grad_norm": 0.8602731227874756, + "learning_rate": 3.53098138242262e-06, + "loss": 1.736, + "mean_token_accuracy": 0.6083482503890991, + "num_tokens": 7724972296.0, + "step": 15112 + }, + { + "epoch": 4.086803677663602, + "grad_norm": 0.9888436794281006, + "learning_rate": 3.530101933294331e-06, + "loss": 1.7642, + "mean_token_accuracy": 0.5945650339126587, + "num_tokens": 7725459460.0, + "step": 15113 + }, + { + "epoch": 4.0870740941049215, + "grad_norm": 0.9703734517097473, + "learning_rate": 3.5292227133644784e-06, + "loss": 1.7955, + "mean_token_accuracy": 0.5793132781982422, + "num_tokens": 7725983684.0, + "step": 15114 + }, + { + "epoch": 4.087344510546242, + "grad_norm": 0.8544926643371582, + "learning_rate": 3.528343722660046e-06, + "loss": 1.8476, + "mean_token_accuracy": 0.5636513829231262, + "num_tokens": 7726507793.0, + "step": 15115 + }, + { + "epoch": 4.087614926987561, + "grad_norm": 0.7583210468292236, + "learning_rate": 3.5274649612080003e-06, + "loss": 1.7688, + "mean_token_accuracy": 0.5947908759117126, + "num_tokens": 7727031975.0, + "step": 15116 + }, + { + "epoch": 4.087885343428881, + "grad_norm": 1.0700541734695435, + "learning_rate": 3.5265864290353023e-06, + "loss": 1.7351, + "mean_token_accuracy": 0.5880311131477356, + "num_tokens": 7727492762.0, + "step": 15117 + }, + { + "epoch": 4.0881557598702, + "grad_norm": 1.1083585023880005, + "learning_rate": 3.525708126168914e-06, + "loss": 1.8082, + "mean_token_accuracy": 0.5795998573303223, + "num_tokens": 7728017013.0, + "step": 15118 + }, + { + "epoch": 4.08842617631152, + "grad_norm": 0.8441053032875061, + "learning_rate": 3.5248300526357777e-06, + "loss": 1.8188, + "mean_token_accuracy": 0.5752874612808228, + "num_tokens": 7728541219.0, + "step": 15119 + }, + { + "epoch": 4.088696592752839, + "grad_norm": 0.9373461604118347, + "learning_rate": 3.5239522084628386e-06, + "loss": 1.9028, + "mean_token_accuracy": 0.554831862449646, + "num_tokens": 7729065473.0, + "step": 15120 + }, + { + "epoch": 4.088967009194159, + "grad_norm": 0.3435998857021332, + "learning_rate": 3.5230745936770337e-06, + "loss": 1.1349, + "mean_token_accuracy": 0.6936442852020264, + "num_tokens": 7729589558.0, + "step": 15121 + }, + { + "epoch": 4.089237425635479, + "grad_norm": 1.0097291469573975, + "learning_rate": 3.5221972083052893e-06, + "loss": 1.8665, + "mean_token_accuracy": 0.5537221431732178, + "num_tokens": 7730113838.0, + "step": 15122 + }, + { + "epoch": 4.089507842076798, + "grad_norm": 0.8626189827919006, + "learning_rate": 3.5213200523745228e-06, + "loss": 1.8065, + "mean_token_accuracy": 0.5860665440559387, + "num_tokens": 7730631745.0, + "step": 15123 + }, + { + "epoch": 4.089778258518118, + "grad_norm": 0.9299325346946716, + "learning_rate": 3.5204431259116517e-06, + "loss": 1.7671, + "mean_token_accuracy": 0.5927544832229614, + "num_tokens": 7731099761.0, + "step": 15124 + }, + { + "epoch": 4.090048674959437, + "grad_norm": 0.8984611630439758, + "learning_rate": 3.5195664289435815e-06, + "loss": 1.8066, + "mean_token_accuracy": 0.5934438705444336, + "num_tokens": 7731513572.0, + "step": 15125 + }, + { + "epoch": 4.090319091400757, + "grad_norm": 0.8431371450424194, + "learning_rate": 3.5186899614972102e-06, + "loss": 1.7478, + "mean_token_accuracy": 0.5977281332015991, + "num_tokens": 7732037731.0, + "step": 15126 + }, + { + "epoch": 4.090589507842076, + "grad_norm": 0.9077544212341309, + "learning_rate": 3.5178137235994346e-06, + "loss": 1.8237, + "mean_token_accuracy": 0.5823804140090942, + "num_tokens": 7732561916.0, + "step": 15127 + }, + { + "epoch": 4.0908599242833965, + "grad_norm": 0.8937615752220154, + "learning_rate": 3.5169377152771368e-06, + "loss": 1.815, + "mean_token_accuracy": 0.5809696912765503, + "num_tokens": 7733086057.0, + "step": 15128 + }, + { + "epoch": 4.091130340724716, + "grad_norm": 0.8079961538314819, + "learning_rate": 3.516061936557194e-06, + "loss": 1.8788, + "mean_token_accuracy": 0.56150221824646, + "num_tokens": 7733610232.0, + "step": 15129 + }, + { + "epoch": 4.091400757166036, + "grad_norm": 1.0874379873275757, + "learning_rate": 3.5151863874664827e-06, + "loss": 1.8949, + "mean_token_accuracy": 0.5680332183837891, + "num_tokens": 7734118538.0, + "step": 15130 + }, + { + "epoch": 4.091671173607355, + "grad_norm": 0.9378283023834229, + "learning_rate": 3.514311068031861e-06, + "loss": 1.8484, + "mean_token_accuracy": 0.5719028115272522, + "num_tokens": 7734642607.0, + "step": 15131 + }, + { + "epoch": 4.091941590048675, + "grad_norm": 0.8452032208442688, + "learning_rate": 3.513435978280193e-06, + "loss": 1.9669, + "mean_token_accuracy": 0.5545539855957031, + "num_tokens": 7735166881.0, + "step": 15132 + }, + { + "epoch": 4.092212006489994, + "grad_norm": 0.9937674403190613, + "learning_rate": 3.5125611182383236e-06, + "loss": 1.8047, + "mean_token_accuracy": 0.5804799199104309, + "num_tokens": 7735651546.0, + "step": 15133 + }, + { + "epoch": 4.092482422931314, + "grad_norm": 0.9895807504653931, + "learning_rate": 3.5116864879330974e-06, + "loss": 1.8002, + "mean_token_accuracy": 0.5925993919372559, + "num_tokens": 7736171921.0, + "step": 15134 + }, + { + "epoch": 4.092752839372634, + "grad_norm": 0.8556848764419556, + "learning_rate": 3.5108120873913536e-06, + "loss": 1.7729, + "mean_token_accuracy": 0.5948098301887512, + "num_tokens": 7736696148.0, + "step": 15135 + }, + { + "epoch": 4.093023255813954, + "grad_norm": 0.9783825278282166, + "learning_rate": 3.5099379166399194e-06, + "loss": 1.8755, + "mean_token_accuracy": 0.603927731513977, + "num_tokens": 7737201828.0, + "step": 15136 + }, + { + "epoch": 4.093293672255273, + "grad_norm": 0.9863330125808716, + "learning_rate": 3.5090639757056156e-06, + "loss": 1.8659, + "mean_token_accuracy": 0.5734524130821228, + "num_tokens": 7737685902.0, + "step": 15137 + }, + { + "epoch": 4.093564088696593, + "grad_norm": 0.759060800075531, + "learning_rate": 3.5081902646152597e-06, + "loss": 1.8715, + "mean_token_accuracy": 0.5692012310028076, + "num_tokens": 7738210162.0, + "step": 15138 + }, + { + "epoch": 4.093834505137912, + "grad_norm": 0.9070225954055786, + "learning_rate": 3.507316783395659e-06, + "loss": 1.8311, + "mean_token_accuracy": 0.5963304042816162, + "num_tokens": 7738659607.0, + "step": 15139 + }, + { + "epoch": 4.094104921579232, + "grad_norm": 0.9338918328285217, + "learning_rate": 3.5064435320736113e-06, + "loss": 1.7703, + "mean_token_accuracy": 0.5975042581558228, + "num_tokens": 7739157271.0, + "step": 15140 + }, + { + "epoch": 4.094375338020551, + "grad_norm": 0.3257714509963989, + "learning_rate": 3.5055705106759153e-06, + "loss": 1.0363, + "mean_token_accuracy": 0.7201471328735352, + "num_tokens": 7739642524.0, + "step": 15141 + }, + { + "epoch": 4.0946457544618715, + "grad_norm": 0.8975146412849426, + "learning_rate": 3.5046977192293553e-06, + "loss": 1.8639, + "mean_token_accuracy": 0.5719486474990845, + "num_tokens": 7740166791.0, + "step": 15142 + }, + { + "epoch": 4.094916170903191, + "grad_norm": 1.0031994581222534, + "learning_rate": 3.503825157760711e-06, + "loss": 1.7895, + "mean_token_accuracy": 0.5773193836212158, + "num_tokens": 7740691004.0, + "step": 15143 + }, + { + "epoch": 4.095186587344511, + "grad_norm": 1.0555486679077148, + "learning_rate": 3.5029528262967562e-06, + "loss": 1.8424, + "mean_token_accuracy": 0.5896706581115723, + "num_tokens": 7741213025.0, + "step": 15144 + }, + { + "epoch": 4.09545700378583, + "grad_norm": 0.8599016070365906, + "learning_rate": 3.5020807248642564e-06, + "loss": 1.9185, + "mean_token_accuracy": 0.5705379843711853, + "num_tokens": 7741737133.0, + "step": 15145 + }, + { + "epoch": 4.09572742022715, + "grad_norm": 0.93753981590271, + "learning_rate": 3.501208853489968e-06, + "loss": 1.8204, + "mean_token_accuracy": 0.5657646656036377, + "num_tokens": 7742261306.0, + "step": 15146 + }, + { + "epoch": 4.095997836668469, + "grad_norm": 0.7383777499198914, + "learning_rate": 3.500337212200644e-06, + "loss": 1.7658, + "mean_token_accuracy": 0.5758522748947144, + "num_tokens": 7742785549.0, + "step": 15147 + }, + { + "epoch": 4.096268253109789, + "grad_norm": 0.9978623986244202, + "learning_rate": 3.4994658010230326e-06, + "loss": 1.8922, + "mean_token_accuracy": 0.5798035860061646, + "num_tokens": 7743309789.0, + "step": 15148 + }, + { + "epoch": 4.096538669551109, + "grad_norm": 1.0110819339752197, + "learning_rate": 3.4985946199838643e-06, + "loss": 1.9348, + "mean_token_accuracy": 0.5616297125816345, + "num_tokens": 7743834021.0, + "step": 15149 + }, + { + "epoch": 4.096809085992429, + "grad_norm": 0.7824794054031372, + "learning_rate": 3.497723669109877e-06, + "loss": 1.821, + "mean_token_accuracy": 0.5642444491386414, + "num_tokens": 7744358237.0, + "step": 15150 + }, + { + "epoch": 4.097079502433748, + "grad_norm": 0.7961907386779785, + "learning_rate": 3.4968529484277893e-06, + "loss": 1.8087, + "mean_token_accuracy": 0.5837439298629761, + "num_tokens": 7744797002.0, + "step": 15151 + }, + { + "epoch": 4.097349918875068, + "grad_norm": 0.9703258872032166, + "learning_rate": 3.4959824579643164e-06, + "loss": 1.8772, + "mean_token_accuracy": 0.573907732963562, + "num_tokens": 7745276272.0, + "step": 15152 + }, + { + "epoch": 4.097620335316387, + "grad_norm": 0.9485150575637817, + "learning_rate": 3.495112197746172e-06, + "loss": 1.8315, + "mean_token_accuracy": 0.5763610601425171, + "num_tokens": 7745792663.0, + "step": 15153 + }, + { + "epoch": 4.097890751757707, + "grad_norm": 0.9805676937103271, + "learning_rate": 3.4942421678000536e-06, + "loss": 1.8556, + "mean_token_accuracy": 0.5686925053596497, + "num_tokens": 7746262738.0, + "step": 15154 + }, + { + "epoch": 4.098161168199026, + "grad_norm": 0.9102251529693604, + "learning_rate": 3.4933723681526616e-06, + "loss": 1.8731, + "mean_token_accuracy": 0.5580734014511108, + "num_tokens": 7746787008.0, + "step": 15155 + }, + { + "epoch": 4.0984315846403465, + "grad_norm": 1.0853506326675415, + "learning_rate": 3.492502798830679e-06, + "loss": 1.8707, + "mean_token_accuracy": 0.5875388979911804, + "num_tokens": 7747268692.0, + "step": 15156 + }, + { + "epoch": 4.098702001081666, + "grad_norm": 1.075305700302124, + "learning_rate": 3.491633459860788e-06, + "loss": 1.8242, + "mean_token_accuracy": 0.5779654383659363, + "num_tokens": 7747792872.0, + "step": 15157 + }, + { + "epoch": 4.098972417522986, + "grad_norm": 1.1554155349731445, + "learning_rate": 3.4907643512696653e-06, + "loss": 1.7856, + "mean_token_accuracy": 0.5747143030166626, + "num_tokens": 7748317138.0, + "step": 15158 + }, + { + "epoch": 4.099242833964305, + "grad_norm": 0.8648988008499146, + "learning_rate": 3.489895473083975e-06, + "loss": 1.8802, + "mean_token_accuracy": 0.5582795143127441, + "num_tokens": 7748841210.0, + "step": 15159 + }, + { + "epoch": 4.099513250405625, + "grad_norm": 0.8221662640571594, + "learning_rate": 3.489026825330375e-06, + "loss": 1.7938, + "mean_token_accuracy": 0.587700605392456, + "num_tokens": 7749365375.0, + "step": 15160 + }, + { + "epoch": 4.099783666846944, + "grad_norm": 0.3665138781070709, + "learning_rate": 3.4881584080355222e-06, + "loss": 1.094, + "mean_token_accuracy": 0.7054353952407837, + "num_tokens": 7749884075.0, + "step": 15161 + }, + { + "epoch": 4.1000540832882635, + "grad_norm": 1.2963024377822876, + "learning_rate": 3.4872902212260596e-06, + "loss": 1.788, + "mean_token_accuracy": 0.5893644094467163, + "num_tokens": 7750400866.0, + "step": 15162 + }, + { + "epoch": 4.100324499729584, + "grad_norm": 1.2328953742980957, + "learning_rate": 3.486422264928625e-06, + "loss": 1.8765, + "mean_token_accuracy": 0.5802146792411804, + "num_tokens": 7750925089.0, + "step": 15163 + }, + { + "epoch": 4.100594916170903, + "grad_norm": 0.919579029083252, + "learning_rate": 3.485554539169853e-06, + "loss": 1.7919, + "mean_token_accuracy": 0.5916194319725037, + "num_tokens": 7751449294.0, + "step": 15164 + }, + { + "epoch": 4.100865332612223, + "grad_norm": 0.9078643918037415, + "learning_rate": 3.484687043976368e-06, + "loss": 1.8109, + "mean_token_accuracy": 0.5637770295143127, + "num_tokens": 7751973365.0, + "step": 15165 + }, + { + "epoch": 4.101135749053542, + "grad_norm": 1.1049854755401611, + "learning_rate": 3.483819779374783e-06, + "loss": 1.7665, + "mean_token_accuracy": 0.5932208299636841, + "num_tokens": 7752497399.0, + "step": 15166 + }, + { + "epoch": 4.101406165494862, + "grad_norm": 1.1656601428985596, + "learning_rate": 3.4829527453917124e-06, + "loss": 1.9482, + "mean_token_accuracy": 0.5436123609542847, + "num_tokens": 7753021558.0, + "step": 15167 + }, + { + "epoch": 4.101676581936181, + "grad_norm": 0.9580396413803101, + "learning_rate": 3.482085942053758e-06, + "loss": 1.8482, + "mean_token_accuracy": 0.5819439888000488, + "num_tokens": 7753491192.0, + "step": 15168 + }, + { + "epoch": 4.101946998377501, + "grad_norm": 0.9559164643287659, + "learning_rate": 3.481219369387514e-06, + "loss": 1.7453, + "mean_token_accuracy": 0.5759106278419495, + "num_tokens": 7754015464.0, + "step": 15169 + }, + { + "epoch": 4.102217414818821, + "grad_norm": 0.9651455879211426, + "learning_rate": 3.4803530274195724e-06, + "loss": 1.7207, + "mean_token_accuracy": 0.6080465912818909, + "num_tokens": 7754539624.0, + "step": 15170 + }, + { + "epoch": 4.102487831260141, + "grad_norm": 1.0965324640274048, + "learning_rate": 3.4794869161765143e-06, + "loss": 1.8034, + "mean_token_accuracy": 0.6002388000488281, + "num_tokens": 7755063795.0, + "step": 15171 + }, + { + "epoch": 4.10275824770146, + "grad_norm": 1.0449073314666748, + "learning_rate": 3.47862103568491e-06, + "loss": 1.9329, + "mean_token_accuracy": 0.5451141595840454, + "num_tokens": 7755569185.0, + "step": 15172 + }, + { + "epoch": 4.10302866414278, + "grad_norm": 1.0052926540374756, + "learning_rate": 3.4777553859713365e-06, + "loss": 1.8489, + "mean_token_accuracy": 0.5693165063858032, + "num_tokens": 7756093405.0, + "step": 15173 + }, + { + "epoch": 4.103299080584099, + "grad_norm": 1.1753191947937012, + "learning_rate": 3.4768899670623444e-06, + "loss": 1.8324, + "mean_token_accuracy": 0.5735669136047363, + "num_tokens": 7756617579.0, + "step": 15174 + }, + { + "epoch": 4.103569497025419, + "grad_norm": 1.05899178981781, + "learning_rate": 3.476024778984496e-06, + "loss": 1.8154, + "mean_token_accuracy": 0.5707111954689026, + "num_tokens": 7757083275.0, + "step": 15175 + }, + { + "epoch": 4.1038399134667385, + "grad_norm": 0.9433067440986633, + "learning_rate": 3.4751598217643312e-06, + "loss": 1.8858, + "mean_token_accuracy": 0.5755312442779541, + "num_tokens": 7757607404.0, + "step": 15176 + }, + { + "epoch": 4.104110329908059, + "grad_norm": 0.8537237644195557, + "learning_rate": 3.474295095428393e-06, + "loss": 1.8439, + "mean_token_accuracy": 0.5642101168632507, + "num_tokens": 7758131674.0, + "step": 15177 + }, + { + "epoch": 4.104380746349378, + "grad_norm": 1.1563811302185059, + "learning_rate": 3.4734306000032137e-06, + "loss": 1.6051, + "mean_token_accuracy": 0.6269829273223877, + "num_tokens": 7758655834.0, + "step": 15178 + }, + { + "epoch": 4.104651162790698, + "grad_norm": 0.9905443787574768, + "learning_rate": 3.4725663355153183e-06, + "loss": 1.7345, + "mean_token_accuracy": 0.5852006673812866, + "num_tokens": 7759180072.0, + "step": 15179 + }, + { + "epoch": 4.104921579232017, + "grad_norm": 1.107513427734375, + "learning_rate": 3.471702301991222e-06, + "loss": 1.7992, + "mean_token_accuracy": 0.5870515704154968, + "num_tokens": 7759704264.0, + "step": 15180 + }, + { + "epoch": 4.105191995673337, + "grad_norm": 0.3639426529407501, + "learning_rate": 3.4708384994574417e-06, + "loss": 1.085, + "mean_token_accuracy": 0.704883337020874, + "num_tokens": 7760228456.0, + "step": 15181 + }, + { + "epoch": 4.105462412114656, + "grad_norm": 0.9864677786827087, + "learning_rate": 3.4699749279404772e-06, + "loss": 1.7958, + "mean_token_accuracy": 0.5947233438491821, + "num_tokens": 7760750831.0, + "step": 15182 + }, + { + "epoch": 4.105732828555976, + "grad_norm": 0.8717124462127686, + "learning_rate": 3.4691115874668246e-06, + "loss": 1.7624, + "mean_token_accuracy": 0.5952214002609253, + "num_tokens": 7761215087.0, + "step": 15183 + }, + { + "epoch": 4.106003244997296, + "grad_norm": 1.0703167915344238, + "learning_rate": 3.4682484780629765e-06, + "loss": 1.7934, + "mean_token_accuracy": 0.5932919383049011, + "num_tokens": 7761719976.0, + "step": 15184 + }, + { + "epoch": 4.106273661438616, + "grad_norm": 1.037161111831665, + "learning_rate": 3.4673855997554157e-06, + "loss": 1.8162, + "mean_token_accuracy": 0.5702955722808838, + "num_tokens": 7762244173.0, + "step": 15185 + }, + { + "epoch": 4.106544077879935, + "grad_norm": 0.9528591632843018, + "learning_rate": 3.4665229525706135e-06, + "loss": 1.8967, + "mean_token_accuracy": 0.5781210064888, + "num_tokens": 7762705110.0, + "step": 15186 + }, + { + "epoch": 4.106814494321255, + "grad_norm": 0.9346392154693604, + "learning_rate": 3.465660536535044e-06, + "loss": 1.7402, + "mean_token_accuracy": 0.6018807291984558, + "num_tokens": 7763229361.0, + "step": 15187 + }, + { + "epoch": 4.107084910762574, + "grad_norm": 0.9721655249595642, + "learning_rate": 3.4647983516751677e-06, + "loss": 1.8456, + "mean_token_accuracy": 0.5817934274673462, + "num_tokens": 7763705077.0, + "step": 15188 + }, + { + "epoch": 4.107355327203894, + "grad_norm": 0.7957157492637634, + "learning_rate": 3.4639363980174333e-06, + "loss": 1.8411, + "mean_token_accuracy": 0.5781383514404297, + "num_tokens": 7764229258.0, + "step": 15189 + }, + { + "epoch": 4.1076257436452135, + "grad_norm": 0.9546460509300232, + "learning_rate": 3.4630746755882937e-06, + "loss": 1.8093, + "mean_token_accuracy": 0.6065722703933716, + "num_tokens": 7764688028.0, + "step": 15190 + }, + { + "epoch": 4.107896160086534, + "grad_norm": 0.9413079023361206, + "learning_rate": 3.4622131844141894e-06, + "loss": 1.8781, + "mean_token_accuracy": 0.5635323524475098, + "num_tokens": 7765187799.0, + "step": 15191 + }, + { + "epoch": 4.108166576527853, + "grad_norm": 0.8709710836410522, + "learning_rate": 3.46135192452155e-06, + "loss": 1.6931, + "mean_token_accuracy": 0.5996565818786621, + "num_tokens": 7765711935.0, + "step": 15192 + }, + { + "epoch": 4.108436992969173, + "grad_norm": 0.8442668914794922, + "learning_rate": 3.4604908959368043e-06, + "loss": 1.8214, + "mean_token_accuracy": 0.5990394949913025, + "num_tokens": 7766172896.0, + "step": 15193 + }, + { + "epoch": 4.108707409410492, + "grad_norm": 0.8311616778373718, + "learning_rate": 3.459630098686372e-06, + "loss": 1.8462, + "mean_token_accuracy": 0.5635330677032471, + "num_tokens": 7766697113.0, + "step": 15194 + }, + { + "epoch": 4.108977825851812, + "grad_norm": 1.156220555305481, + "learning_rate": 3.4587695327966597e-06, + "loss": 1.8709, + "mean_token_accuracy": 0.5779292583465576, + "num_tokens": 7767193555.0, + "step": 15195 + }, + { + "epoch": 4.109248242293131, + "grad_norm": 0.9254406690597534, + "learning_rate": 3.4579091982940772e-06, + "loss": 1.6923, + "mean_token_accuracy": 0.6146214008331299, + "num_tokens": 7767717747.0, + "step": 15196 + }, + { + "epoch": 4.109518658734451, + "grad_norm": 0.9411171674728394, + "learning_rate": 3.4570490952050218e-06, + "loss": 1.687, + "mean_token_accuracy": 0.5961679816246033, + "num_tokens": 7768242003.0, + "step": 15197 + }, + { + "epoch": 4.109789075175771, + "grad_norm": 0.9137649536132812, + "learning_rate": 3.4561892235558794e-06, + "loss": 1.7951, + "mean_token_accuracy": 0.5695808529853821, + "num_tokens": 7768766088.0, + "step": 15198 + }, + { + "epoch": 4.110059491617091, + "grad_norm": 1.0247044563293457, + "learning_rate": 3.4553295833730384e-06, + "loss": 1.8319, + "mean_token_accuracy": 0.5763622522354126, + "num_tokens": 7769290323.0, + "step": 15199 + }, + { + "epoch": 4.11032990805841, + "grad_norm": 1.1315234899520874, + "learning_rate": 3.4544701746828718e-06, + "loss": 1.8409, + "mean_token_accuracy": 0.5788187980651855, + "num_tokens": 7769814281.0, + "step": 15200 + }, + { + "epoch": 4.11060032449973, + "grad_norm": 0.315361887216568, + "learning_rate": 3.453610997511752e-06, + "loss": 0.9686, + "mean_token_accuracy": 0.734750509262085, + "num_tokens": 7770338389.0, + "step": 15201 + }, + { + "epoch": 4.110870740941049, + "grad_norm": 1.0909461975097656, + "learning_rate": 3.4527520518860393e-06, + "loss": 1.863, + "mean_token_accuracy": 0.577238917350769, + "num_tokens": 7770862276.0, + "step": 15202 + }, + { + "epoch": 4.111141157382368, + "grad_norm": 0.9743217825889587, + "learning_rate": 3.4518933378320864e-06, + "loss": 1.7845, + "mean_token_accuracy": 0.5775469541549683, + "num_tokens": 7771386399.0, + "step": 15203 + }, + { + "epoch": 4.1114115738236885, + "grad_norm": 1.0493475198745728, + "learning_rate": 3.451034855376244e-06, + "loss": 1.8925, + "mean_token_accuracy": 0.5544126033782959, + "num_tokens": 7771910576.0, + "step": 15204 + }, + { + "epoch": 4.111681990265008, + "grad_norm": 0.806803822517395, + "learning_rate": 3.450176604544854e-06, + "loss": 1.7884, + "mean_token_accuracy": 0.5802297592163086, + "num_tokens": 7772434845.0, + "step": 15205 + }, + { + "epoch": 4.111952406706328, + "grad_norm": 0.9009171724319458, + "learning_rate": 3.4493185853642463e-06, + "loss": 1.8454, + "mean_token_accuracy": 0.5787747502326965, + "num_tokens": 7772958970.0, + "step": 15206 + }, + { + "epoch": 4.112222823147647, + "grad_norm": 0.9483808279037476, + "learning_rate": 3.4484607978607507e-06, + "loss": 1.8067, + "mean_token_accuracy": 0.5924209356307983, + "num_tokens": 7773458036.0, + "step": 15207 + }, + { + "epoch": 4.112493239588967, + "grad_norm": 1.067622184753418, + "learning_rate": 3.4476032420606865e-06, + "loss": 1.8398, + "mean_token_accuracy": 0.5782437920570374, + "num_tokens": 7773982156.0, + "step": 15208 + }, + { + "epoch": 4.112763656030286, + "grad_norm": 1.0984071493148804, + "learning_rate": 3.4467459179903628e-06, + "loss": 1.664, + "mean_token_accuracy": 0.6223440766334534, + "num_tokens": 7774417498.0, + "step": 15209 + }, + { + "epoch": 4.113034072471606, + "grad_norm": 1.0490339994430542, + "learning_rate": 3.4458888256760893e-06, + "loss": 2.0021, + "mean_token_accuracy": 0.5317488312721252, + "num_tokens": 7774941714.0, + "step": 15210 + }, + { + "epoch": 4.1133044889129255, + "grad_norm": 0.9372274279594421, + "learning_rate": 3.445031965144161e-06, + "loss": 1.901, + "mean_token_accuracy": 0.5687678456306458, + "num_tokens": 7775430769.0, + "step": 15211 + }, + { + "epoch": 4.113574905354246, + "grad_norm": 1.086349606513977, + "learning_rate": 3.4441753364208685e-06, + "loss": 1.7693, + "mean_token_accuracy": 0.6014828681945801, + "num_tokens": 7775870441.0, + "step": 15212 + }, + { + "epoch": 4.113845321795565, + "grad_norm": 0.860621452331543, + "learning_rate": 3.4433189395324986e-06, + "loss": 1.8545, + "mean_token_accuracy": 0.5718571543693542, + "num_tokens": 7776394663.0, + "step": 15213 + }, + { + "epoch": 4.114115738236885, + "grad_norm": 0.8253284096717834, + "learning_rate": 3.4424627745053262e-06, + "loss": 1.7986, + "mean_token_accuracy": 0.5852614641189575, + "num_tokens": 7776883574.0, + "step": 15214 + }, + { + "epoch": 4.114386154678204, + "grad_norm": 0.8463495969772339, + "learning_rate": 3.4416068413656187e-06, + "loss": 1.8267, + "mean_token_accuracy": 0.5672744512557983, + "num_tokens": 7777407699.0, + "step": 15215 + }, + { + "epoch": 4.114656571119524, + "grad_norm": 0.8074406385421753, + "learning_rate": 3.4407511401396443e-06, + "loss": 1.7267, + "mean_token_accuracy": 0.5895065069198608, + "num_tokens": 7777931967.0, + "step": 15216 + }, + { + "epoch": 4.114926987560843, + "grad_norm": 1.0226356983184814, + "learning_rate": 3.4398956708536526e-06, + "loss": 1.812, + "mean_token_accuracy": 0.5717073678970337, + "num_tokens": 7778424830.0, + "step": 15217 + }, + { + "epoch": 4.1151974040021635, + "grad_norm": 1.0276567935943604, + "learning_rate": 3.439040433533897e-06, + "loss": 1.8707, + "mean_token_accuracy": 0.5753988027572632, + "num_tokens": 7778949066.0, + "step": 15218 + }, + { + "epoch": 4.115467820443483, + "grad_norm": 0.8529340028762817, + "learning_rate": 3.4381854282066142e-06, + "loss": 1.8642, + "mean_token_accuracy": 0.5772920846939087, + "num_tokens": 7779473263.0, + "step": 15219 + }, + { + "epoch": 4.115738236884803, + "grad_norm": 0.9241218566894531, + "learning_rate": 3.437330654898041e-06, + "loss": 1.6967, + "mean_token_accuracy": 0.6024807691574097, + "num_tokens": 7779903839.0, + "step": 15220 + }, + { + "epoch": 4.116008653326122, + "grad_norm": 0.3570549190044403, + "learning_rate": 3.4364761136344032e-06, + "loss": 1.0449, + "mean_token_accuracy": 0.7235561609268188, + "num_tokens": 7780428075.0, + "step": 15221 + }, + { + "epoch": 4.116279069767442, + "grad_norm": 1.0513261556625366, + "learning_rate": 3.435621804441922e-06, + "loss": 1.6955, + "mean_token_accuracy": 0.6153910756111145, + "num_tokens": 7780952214.0, + "step": 15222 + }, + { + "epoch": 4.116549486208761, + "grad_norm": 1.0055088996887207, + "learning_rate": 3.434767727346808e-06, + "loss": 1.7167, + "mean_token_accuracy": 0.5966132283210754, + "num_tokens": 7781460407.0, + "step": 15223 + }, + { + "epoch": 4.116819902650081, + "grad_norm": 0.9277073740959167, + "learning_rate": 3.4339138823752703e-06, + "loss": 1.8252, + "mean_token_accuracy": 0.5670996308326721, + "num_tokens": 7781984512.0, + "step": 15224 + }, + { + "epoch": 4.1170903190914006, + "grad_norm": 0.916005551815033, + "learning_rate": 3.433060269553504e-06, + "loss": 1.8349, + "mean_token_accuracy": 0.5670696496963501, + "num_tokens": 7782508643.0, + "step": 15225 + }, + { + "epoch": 4.117360735532721, + "grad_norm": 0.8613690137863159, + "learning_rate": 3.4322068889077e-06, + "loss": 1.8348, + "mean_token_accuracy": 0.5720059871673584, + "num_tokens": 7783032767.0, + "step": 15226 + }, + { + "epoch": 4.11763115197404, + "grad_norm": 1.0507676601409912, + "learning_rate": 3.4313537404640464e-06, + "loss": 1.9537, + "mean_token_accuracy": 0.5783263444900513, + "num_tokens": 7783557044.0, + "step": 15227 + }, + { + "epoch": 4.11790156841536, + "grad_norm": 0.9009530544281006, + "learning_rate": 3.430500824248718e-06, + "loss": 1.8445, + "mean_token_accuracy": 0.5864189863204956, + "num_tokens": 7783994317.0, + "step": 15228 + }, + { + "epoch": 4.118171984856679, + "grad_norm": 0.864130437374115, + "learning_rate": 3.4296481402878822e-06, + "loss": 1.8131, + "mean_token_accuracy": 0.580726146697998, + "num_tokens": 7784461682.0, + "step": 15229 + }, + { + "epoch": 4.118442401297999, + "grad_norm": 0.9935486316680908, + "learning_rate": 3.428795688607707e-06, + "loss": 1.8118, + "mean_token_accuracy": 0.5635292530059814, + "num_tokens": 7784979274.0, + "step": 15230 + }, + { + "epoch": 4.118712817739318, + "grad_norm": 0.9831752777099609, + "learning_rate": 3.4279434692343455e-06, + "loss": 1.8898, + "mean_token_accuracy": 0.5508136749267578, + "num_tokens": 7785503366.0, + "step": 15231 + }, + { + "epoch": 4.1189832341806385, + "grad_norm": 1.0109119415283203, + "learning_rate": 3.427091482193944e-06, + "loss": 1.9049, + "mean_token_accuracy": 0.5931533575057983, + "num_tokens": 7785976114.0, + "step": 15232 + }, + { + "epoch": 4.119253650621958, + "grad_norm": 0.9979788661003113, + "learning_rate": 3.4262397275126456e-06, + "loss": 1.7857, + "mean_token_accuracy": 0.5803710222244263, + "num_tokens": 7786489574.0, + "step": 15233 + }, + { + "epoch": 4.119524067063278, + "grad_norm": 0.9054433703422546, + "learning_rate": 3.4253882052165878e-06, + "loss": 1.7554, + "mean_token_accuracy": 0.5995280742645264, + "num_tokens": 7786980013.0, + "step": 15234 + }, + { + "epoch": 4.119794483504597, + "grad_norm": 0.8914211988449097, + "learning_rate": 3.424536915331894e-06, + "loss": 1.7957, + "mean_token_accuracy": 0.586881160736084, + "num_tokens": 7787491704.0, + "step": 15235 + }, + { + "epoch": 4.120064899945917, + "grad_norm": 0.9558637142181396, + "learning_rate": 3.4236858578846876e-06, + "loss": 1.9213, + "mean_token_accuracy": 0.5659905672073364, + "num_tokens": 7788015986.0, + "step": 15236 + }, + { + "epoch": 4.120335316387236, + "grad_norm": 0.87385493516922, + "learning_rate": 3.422835032901079e-06, + "loss": 1.8529, + "mean_token_accuracy": 0.5605252981185913, + "num_tokens": 7788501659.0, + "step": 15237 + }, + { + "epoch": 4.120605732828556, + "grad_norm": 0.8444978594779968, + "learning_rate": 3.4219844404071733e-06, + "loss": 1.7151, + "mean_token_accuracy": 0.5931870937347412, + "num_tokens": 7789025929.0, + "step": 15238 + }, + { + "epoch": 4.1208761492698756, + "grad_norm": 1.1536146402359009, + "learning_rate": 3.4211340804290714e-06, + "loss": 1.8413, + "mean_token_accuracy": 0.5889779329299927, + "num_tokens": 7789550188.0, + "step": 15239 + }, + { + "epoch": 4.121146565711196, + "grad_norm": 1.0096008777618408, + "learning_rate": 3.4202839529928657e-06, + "loss": 1.9705, + "mean_token_accuracy": 0.5577055215835571, + "num_tokens": 7790025526.0, + "step": 15240 + }, + { + "epoch": 4.121416982152515, + "grad_norm": 0.4709605574607849, + "learning_rate": 3.4194340581246353e-06, + "loss": 1.2319, + "mean_token_accuracy": 0.6935710906982422, + "num_tokens": 7790549793.0, + "step": 15241 + }, + { + "epoch": 4.121687398593835, + "grad_norm": 0.9406017661094666, + "learning_rate": 3.4185843958504626e-06, + "loss": 1.8071, + "mean_token_accuracy": 0.5704092979431152, + "num_tokens": 7791073980.0, + "step": 15242 + }, + { + "epoch": 4.121957815035154, + "grad_norm": 1.043192982673645, + "learning_rate": 3.4177349661964167e-06, + "loss": 1.8533, + "mean_token_accuracy": 0.5770610570907593, + "num_tokens": 7791598018.0, + "step": 15243 + }, + { + "epoch": 4.122228231476473, + "grad_norm": 0.9220405220985413, + "learning_rate": 3.4168857691885595e-06, + "loss": 1.7715, + "mean_token_accuracy": 0.5875867605209351, + "num_tokens": 7792122294.0, + "step": 15244 + }, + { + "epoch": 4.122498647917793, + "grad_norm": 0.9520568251609802, + "learning_rate": 3.416036804852948e-06, + "loss": 1.7537, + "mean_token_accuracy": 0.6011033058166504, + "num_tokens": 7792606119.0, + "step": 15245 + }, + { + "epoch": 4.122769064359113, + "grad_norm": 0.8894757628440857, + "learning_rate": 3.4151880732156285e-06, + "loss": 1.9095, + "mean_token_accuracy": 0.5808704495429993, + "num_tokens": 7793116642.0, + "step": 15246 + }, + { + "epoch": 4.123039480800433, + "grad_norm": 0.9344538450241089, + "learning_rate": 3.414339574302643e-06, + "loss": 1.8472, + "mean_token_accuracy": 0.558184027671814, + "num_tokens": 7793640759.0, + "step": 15247 + }, + { + "epoch": 4.123309897241752, + "grad_norm": 1.1444432735443115, + "learning_rate": 3.4134913081400313e-06, + "loss": 1.68, + "mean_token_accuracy": 0.5933562517166138, + "num_tokens": 7794164982.0, + "step": 15248 + }, + { + "epoch": 4.123580313683072, + "grad_norm": 1.0657846927642822, + "learning_rate": 3.4126432747538134e-06, + "loss": 1.7291, + "mean_token_accuracy": 0.6032360792160034, + "num_tokens": 7794689261.0, + "step": 15249 + }, + { + "epoch": 4.123850730124391, + "grad_norm": 1.0441197156906128, + "learning_rate": 3.411795474170016e-06, + "loss": 1.8327, + "mean_token_accuracy": 0.5821539163589478, + "num_tokens": 7795213456.0, + "step": 15250 + }, + { + "epoch": 4.124121146565711, + "grad_norm": 0.9677911996841431, + "learning_rate": 3.4109479064146478e-06, + "loss": 1.7557, + "mean_token_accuracy": 0.5886998176574707, + "num_tokens": 7795737634.0, + "step": 15251 + }, + { + "epoch": 4.1243915630070305, + "grad_norm": 1.159632682800293, + "learning_rate": 3.410100571513714e-06, + "loss": 1.7799, + "mean_token_accuracy": 0.5808588862419128, + "num_tokens": 7796241957.0, + "step": 15252 + }, + { + "epoch": 4.124661979448351, + "grad_norm": 1.021376609802246, + "learning_rate": 3.4092534694932178e-06, + "loss": 1.7116, + "mean_token_accuracy": 0.600386917591095, + "num_tokens": 7796766049.0, + "step": 15253 + }, + { + "epoch": 4.12493239588967, + "grad_norm": 0.9047567844390869, + "learning_rate": 3.4084066003791477e-06, + "loss": 1.7017, + "mean_token_accuracy": 0.5974293351173401, + "num_tokens": 7797290112.0, + "step": 15254 + }, + { + "epoch": 4.12520281233099, + "grad_norm": 0.9794473648071289, + "learning_rate": 3.407559964197486e-06, + "loss": 1.8527, + "mean_token_accuracy": 0.5904983282089233, + "num_tokens": 7797814321.0, + "step": 15255 + }, + { + "epoch": 4.125473228772309, + "grad_norm": 1.0296279191970825, + "learning_rate": 3.406713560974216e-06, + "loss": 1.7455, + "mean_token_accuracy": 0.5800631046295166, + "num_tokens": 7798338423.0, + "step": 15256 + }, + { + "epoch": 4.125743645213629, + "grad_norm": 0.9252990484237671, + "learning_rate": 3.4058673907353025e-06, + "loss": 1.7701, + "mean_token_accuracy": 0.5967174768447876, + "num_tokens": 7798845980.0, + "step": 15257 + }, + { + "epoch": 4.126014061654948, + "grad_norm": 0.8648182153701782, + "learning_rate": 3.4050214535067093e-06, + "loss": 1.7275, + "mean_token_accuracy": 0.5866723656654358, + "num_tokens": 7799370187.0, + "step": 15258 + }, + { + "epoch": 4.126284478096268, + "grad_norm": 0.7430532574653625, + "learning_rate": 3.4041757493143957e-06, + "loss": 1.7938, + "mean_token_accuracy": 0.5872423648834229, + "num_tokens": 7799894448.0, + "step": 15259 + }, + { + "epoch": 4.126554894537588, + "grad_norm": 1.1062722206115723, + "learning_rate": 3.403330278184306e-06, + "loss": 1.8835, + "mean_token_accuracy": 0.5594817399978638, + "num_tokens": 7800418614.0, + "step": 15260 + }, + { + "epoch": 4.126825310978908, + "grad_norm": 0.33503028750419617, + "learning_rate": 3.402485040142384e-06, + "loss": 1.0046, + "mean_token_accuracy": 0.7283695936203003, + "num_tokens": 7800942851.0, + "step": 15261 + }, + { + "epoch": 4.127095727420227, + "grad_norm": 0.8557037115097046, + "learning_rate": 3.4016400352145634e-06, + "loss": 1.7821, + "mean_token_accuracy": 0.5932495594024658, + "num_tokens": 7801467130.0, + "step": 15262 + }, + { + "epoch": 4.127366143861547, + "grad_norm": 0.9975329637527466, + "learning_rate": 3.4007952634267737e-06, + "loss": 1.7546, + "mean_token_accuracy": 0.58554607629776, + "num_tokens": 7801991405.0, + "step": 15263 + }, + { + "epoch": 4.127636560302866, + "grad_norm": 0.8217745423316956, + "learning_rate": 3.399950724804931e-06, + "loss": 1.826, + "mean_token_accuracy": 0.5793427228927612, + "num_tokens": 7802515435.0, + "step": 15264 + }, + { + "epoch": 4.127906976744186, + "grad_norm": 0.8467743396759033, + "learning_rate": 3.399106419374952e-06, + "loss": 1.82, + "mean_token_accuracy": 0.5902256965637207, + "num_tokens": 7803039633.0, + "step": 15265 + }, + { + "epoch": 4.1281773931855055, + "grad_norm": 0.7856627702713013, + "learning_rate": 3.3982623471627413e-06, + "loss": 1.7346, + "mean_token_accuracy": 0.5862028002738953, + "num_tokens": 7803563893.0, + "step": 15266 + }, + { + "epoch": 4.128447809626826, + "grad_norm": 1.0014126300811768, + "learning_rate": 3.3974185081941947e-06, + "loss": 1.8065, + "mean_token_accuracy": 0.5752918720245361, + "num_tokens": 7804088112.0, + "step": 15267 + }, + { + "epoch": 4.128718226068145, + "grad_norm": 1.0024216175079346, + "learning_rate": 3.3965749024952077e-06, + "loss": 1.9024, + "mean_token_accuracy": 0.5750539302825928, + "num_tokens": 7804553471.0, + "step": 15268 + }, + { + "epoch": 4.128988642509465, + "grad_norm": 0.9112911224365234, + "learning_rate": 3.3957315300916614e-06, + "loss": 1.8115, + "mean_token_accuracy": 0.593368649482727, + "num_tokens": 7805012897.0, + "step": 15269 + }, + { + "epoch": 4.129259058950784, + "grad_norm": 1.025862693786621, + "learning_rate": 3.3948883910094366e-06, + "loss": 1.6206, + "mean_token_accuracy": 0.6507400870323181, + "num_tokens": 7805474777.0, + "step": 15270 + }, + { + "epoch": 4.129529475392104, + "grad_norm": 0.8293136358261108, + "learning_rate": 3.3940454852744007e-06, + "loss": 1.827, + "mean_token_accuracy": 0.5664198994636536, + "num_tokens": 7805989307.0, + "step": 15271 + }, + { + "epoch": 4.129799891833423, + "grad_norm": 0.7716777324676514, + "learning_rate": 3.393202812912415e-06, + "loss": 1.8149, + "mean_token_accuracy": 0.5720760822296143, + "num_tokens": 7806513577.0, + "step": 15272 + }, + { + "epoch": 4.130070308274743, + "grad_norm": 0.9639027118682861, + "learning_rate": 3.3923603739493398e-06, + "loss": 1.9293, + "mean_token_accuracy": 0.5757579803466797, + "num_tokens": 7806985992.0, + "step": 15273 + }, + { + "epoch": 4.130340724716063, + "grad_norm": 0.8472999930381775, + "learning_rate": 3.391518168411019e-06, + "loss": 1.8155, + "mean_token_accuracy": 0.5733922123908997, + "num_tokens": 7807509993.0, + "step": 15274 + }, + { + "epoch": 4.130611141157383, + "grad_norm": 0.731754720211029, + "learning_rate": 3.390676196323297e-06, + "loss": 1.6331, + "mean_token_accuracy": 0.6174474954605103, + "num_tokens": 7808034180.0, + "step": 15275 + }, + { + "epoch": 4.130881557598702, + "grad_norm": 0.8429182767868042, + "learning_rate": 3.3898344577120045e-06, + "loss": 1.7703, + "mean_token_accuracy": 0.5761010646820068, + "num_tokens": 7808558340.0, + "step": 15276 + }, + { + "epoch": 4.131151974040022, + "grad_norm": 0.9088166952133179, + "learning_rate": 3.388992952602973e-06, + "loss": 1.8028, + "mean_token_accuracy": 0.5778899192810059, + "num_tokens": 7809082580.0, + "step": 15277 + }, + { + "epoch": 4.131422390481341, + "grad_norm": 1.0126667022705078, + "learning_rate": 3.3881516810220194e-06, + "loss": 1.9121, + "mean_token_accuracy": 0.5768747329711914, + "num_tokens": 7809606840.0, + "step": 15278 + }, + { + "epoch": 4.131692806922661, + "grad_norm": 0.8077027797698975, + "learning_rate": 3.387310642994959e-06, + "loss": 1.7875, + "mean_token_accuracy": 0.5797387361526489, + "num_tokens": 7810131018.0, + "step": 15279 + }, + { + "epoch": 4.1319632233639805, + "grad_norm": 0.8282563090324402, + "learning_rate": 3.3864698385475947e-06, + "loss": 1.8482, + "mean_token_accuracy": 0.565865159034729, + "num_tokens": 7810655220.0, + "step": 15280 + }, + { + "epoch": 4.132233639805301, + "grad_norm": 0.3822958171367645, + "learning_rate": 3.385629267705724e-06, + "loss": 1.094, + "mean_token_accuracy": 0.7059898376464844, + "num_tokens": 7811141276.0, + "step": 15281 + }, + { + "epoch": 4.13250405624662, + "grad_norm": 1.1326603889465332, + "learning_rate": 3.384788930495142e-06, + "loss": 1.8281, + "mean_token_accuracy": 0.5827805995941162, + "num_tokens": 7811652058.0, + "step": 15282 + }, + { + "epoch": 4.13277447268794, + "grad_norm": 1.042653203010559, + "learning_rate": 3.383948826941631e-06, + "loss": 1.8514, + "mean_token_accuracy": 0.5602715611457825, + "num_tokens": 7812176171.0, + "step": 15283 + }, + { + "epoch": 4.133044889129259, + "grad_norm": 0.8727304339408875, + "learning_rate": 3.3831089570709645e-06, + "loss": 1.8251, + "mean_token_accuracy": 0.5612015724182129, + "num_tokens": 7812700346.0, + "step": 15284 + }, + { + "epoch": 4.133315305570578, + "grad_norm": 0.9385483264923096, + "learning_rate": 3.382269320908917e-06, + "loss": 1.646, + "mean_token_accuracy": 0.5975239276885986, + "num_tokens": 7813170273.0, + "step": 15285 + }, + { + "epoch": 4.133585722011898, + "grad_norm": 0.8654086589813232, + "learning_rate": 3.3814299184812492e-06, + "loss": 1.6924, + "mean_token_accuracy": 0.6140270829200745, + "num_tokens": 7813648494.0, + "step": 15286 + }, + { + "epoch": 4.1338561384532175, + "grad_norm": 0.9925684928894043, + "learning_rate": 3.3805907498137146e-06, + "loss": 1.9197, + "mean_token_accuracy": 0.5536980628967285, + "num_tokens": 7814172770.0, + "step": 15287 + }, + { + "epoch": 4.134126554894538, + "grad_norm": 0.8295678496360779, + "learning_rate": 3.379751814932064e-06, + "loss": 1.6552, + "mean_token_accuracy": 0.60820472240448, + "num_tokens": 7814696915.0, + "step": 15288 + }, + { + "epoch": 4.134396971335857, + "grad_norm": 0.9084540605545044, + "learning_rate": 3.378913113862037e-06, + "loss": 1.9853, + "mean_token_accuracy": 0.5479307174682617, + "num_tokens": 7815221131.0, + "step": 15289 + }, + { + "epoch": 4.134667387777177, + "grad_norm": 0.8401504755020142, + "learning_rate": 3.378074646629366e-06, + "loss": 1.8118, + "mean_token_accuracy": 0.5901294350624084, + "num_tokens": 7815713189.0, + "step": 15290 + }, + { + "epoch": 4.134937804218496, + "grad_norm": 0.8487094640731812, + "learning_rate": 3.3772364132597823e-06, + "loss": 1.7843, + "mean_token_accuracy": 0.5704679489135742, + "num_tokens": 7816237314.0, + "step": 15291 + }, + { + "epoch": 4.135208220659816, + "grad_norm": 0.9061310291290283, + "learning_rate": 3.3763984137790014e-06, + "loss": 1.7015, + "mean_token_accuracy": 0.5888723731040955, + "num_tokens": 7816761545.0, + "step": 15292 + }, + { + "epoch": 4.135478637101135, + "grad_norm": 0.8552652597427368, + "learning_rate": 3.3755606482127377e-06, + "loss": 1.8954, + "mean_token_accuracy": 0.5424787402153015, + "num_tokens": 7817285820.0, + "step": 15293 + }, + { + "epoch": 4.1357490535424555, + "grad_norm": 0.9102340340614319, + "learning_rate": 3.3747231165866957e-06, + "loss": 1.9041, + "mean_token_accuracy": 0.5692936182022095, + "num_tokens": 7817810067.0, + "step": 15294 + }, + { + "epoch": 4.136019469983775, + "grad_norm": 0.7425841093063354, + "learning_rate": 3.373885818926571e-06, + "loss": 1.5113, + "mean_token_accuracy": 0.665105938911438, + "num_tokens": 7818334250.0, + "step": 15295 + }, + { + "epoch": 4.136289886425095, + "grad_norm": 0.985947847366333, + "learning_rate": 3.373048755258058e-06, + "loss": 1.8806, + "mean_token_accuracy": 0.5704533457756042, + "num_tokens": 7818858476.0, + "step": 15296 + }, + { + "epoch": 4.136560302866414, + "grad_norm": 0.7942850589752197, + "learning_rate": 3.3722119256068386e-06, + "loss": 1.8739, + "mean_token_accuracy": 0.5661438703536987, + "num_tokens": 7819382579.0, + "step": 15297 + }, + { + "epoch": 4.136830719307734, + "grad_norm": 1.0097428560256958, + "learning_rate": 3.3713753299985873e-06, + "loss": 1.7958, + "mean_token_accuracy": 0.5911820530891418, + "num_tokens": 7819857905.0, + "step": 15298 + }, + { + "epoch": 4.137101135749053, + "grad_norm": 0.9400476217269897, + "learning_rate": 3.370538968458977e-06, + "loss": 1.733, + "mean_token_accuracy": 0.5990878939628601, + "num_tokens": 7820295813.0, + "step": 15299 + }, + { + "epoch": 4.137371552190373, + "grad_norm": 0.8803190588951111, + "learning_rate": 3.369702841013668e-06, + "loss": 1.8591, + "mean_token_accuracy": 0.5639090538024902, + "num_tokens": 7820820054.0, + "step": 15300 + }, + { + "epoch": 4.1376419686316925, + "grad_norm": 0.37710580229759216, + "learning_rate": 3.368866947688314e-06, + "loss": 1.1062, + "mean_token_accuracy": 0.7195442318916321, + "num_tokens": 7821279584.0, + "step": 15301 + }, + { + "epoch": 4.137912385073013, + "grad_norm": 0.8988038897514343, + "learning_rate": 3.3680312885085647e-06, + "loss": 1.7065, + "mean_token_accuracy": 0.5962650775909424, + "num_tokens": 7821803504.0, + "step": 15302 + }, + { + "epoch": 4.138182801514332, + "grad_norm": 0.8076335191726685, + "learning_rate": 3.367195863500058e-06, + "loss": 1.913, + "mean_token_accuracy": 0.5486672520637512, + "num_tokens": 7822327747.0, + "step": 15303 + }, + { + "epoch": 4.138453217955652, + "grad_norm": 0.8888456225395203, + "learning_rate": 3.366360672688429e-06, + "loss": 1.7111, + "mean_token_accuracy": 0.6006191968917847, + "num_tokens": 7822851980.0, + "step": 15304 + }, + { + "epoch": 4.138723634396971, + "grad_norm": 1.0319974422454834, + "learning_rate": 3.3655257160993048e-06, + "loss": 1.8589, + "mean_token_accuracy": 0.5805599689483643, + "num_tokens": 7823361908.0, + "step": 15305 + }, + { + "epoch": 4.138994050838291, + "grad_norm": 0.8564968705177307, + "learning_rate": 3.3646909937583037e-06, + "loss": 1.7692, + "mean_token_accuracy": 0.5851491689682007, + "num_tokens": 7823877280.0, + "step": 15306 + }, + { + "epoch": 4.13926446727961, + "grad_norm": 0.973816990852356, + "learning_rate": 3.3638565056910355e-06, + "loss": 1.7408, + "mean_token_accuracy": 0.5898675918579102, + "num_tokens": 7824344832.0, + "step": 15307 + }, + { + "epoch": 4.1395348837209305, + "grad_norm": 0.8171083331108093, + "learning_rate": 3.363022251923107e-06, + "loss": 1.8253, + "mean_token_accuracy": 0.5818513631820679, + "num_tokens": 7824868996.0, + "step": 15308 + }, + { + "epoch": 4.13980530016225, + "grad_norm": 0.9218170046806335, + "learning_rate": 3.3621882324801146e-06, + "loss": 1.7361, + "mean_token_accuracy": 0.5866883993148804, + "num_tokens": 7825393095.0, + "step": 15309 + }, + { + "epoch": 4.14007571660357, + "grad_norm": 0.9228069186210632, + "learning_rate": 3.361354447387648e-06, + "loss": 1.8431, + "mean_token_accuracy": 0.5812966823577881, + "num_tokens": 7825917332.0, + "step": 15310 + }, + { + "epoch": 4.140346133044889, + "grad_norm": 0.92884761095047, + "learning_rate": 3.360520896671292e-06, + "loss": 1.821, + "mean_token_accuracy": 0.5677995681762695, + "num_tokens": 7826441520.0, + "step": 15311 + }, + { + "epoch": 4.140616549486209, + "grad_norm": 0.9828780293464661, + "learning_rate": 3.3596875803566217e-06, + "loss": 1.9123, + "mean_token_accuracy": 0.5547366738319397, + "num_tokens": 7826965642.0, + "step": 15312 + }, + { + "epoch": 4.140886965927528, + "grad_norm": 0.9988670349121094, + "learning_rate": 3.3588544984692017e-06, + "loss": 1.8966, + "mean_token_accuracy": 0.5844129920005798, + "num_tokens": 7827474309.0, + "step": 15313 + }, + { + "epoch": 4.141157382368848, + "grad_norm": 0.9396955370903015, + "learning_rate": 3.3580216510346e-06, + "loss": 1.9418, + "mean_token_accuracy": 0.5502640604972839, + "num_tokens": 7827960801.0, + "step": 15314 + }, + { + "epoch": 4.1414277988101675, + "grad_norm": 0.8796919584274292, + "learning_rate": 3.3571890380783654e-06, + "loss": 1.8497, + "mean_token_accuracy": 0.5780167579650879, + "num_tokens": 7828484974.0, + "step": 15315 + }, + { + "epoch": 4.141698215251488, + "grad_norm": 0.8673965930938721, + "learning_rate": 3.356356659626049e-06, + "loss": 1.7854, + "mean_token_accuracy": 0.5826512575149536, + "num_tokens": 7829009123.0, + "step": 15316 + }, + { + "epoch": 4.141968631692807, + "grad_norm": 1.0921717882156372, + "learning_rate": 3.3555245157031864e-06, + "loss": 1.7523, + "mean_token_accuracy": 0.5808503031730652, + "num_tokens": 7829509789.0, + "step": 15317 + }, + { + "epoch": 4.142239048134127, + "grad_norm": 0.9389763474464417, + "learning_rate": 3.354692606335315e-06, + "loss": 1.7503, + "mean_token_accuracy": 0.586620569229126, + "num_tokens": 7829995023.0, + "step": 15318 + }, + { + "epoch": 4.142509464575446, + "grad_norm": 0.8393735289573669, + "learning_rate": 3.3538609315479554e-06, + "loss": 1.713, + "mean_token_accuracy": 0.5915151238441467, + "num_tokens": 7830519203.0, + "step": 15319 + }, + { + "epoch": 4.142779881016766, + "grad_norm": 1.0666391849517822, + "learning_rate": 3.3530294913666313e-06, + "loss": 1.7321, + "mean_token_accuracy": 0.5797346830368042, + "num_tokens": 7830981495.0, + "step": 15320 + }, + { + "epoch": 4.143050297458085, + "grad_norm": 0.3666529655456543, + "learning_rate": 3.352198285816849e-06, + "loss": 1.0705, + "mean_token_accuracy": 0.7096803188323975, + "num_tokens": 7831499540.0, + "step": 15321 + }, + { + "epoch": 4.1433207138994055, + "grad_norm": 0.9637007117271423, + "learning_rate": 3.3513673149241142e-06, + "loss": 1.7699, + "mean_token_accuracy": 0.5938873291015625, + "num_tokens": 7832023744.0, + "step": 15322 + }, + { + "epoch": 4.143591130340725, + "grad_norm": 0.9827960729598999, + "learning_rate": 3.3505365787139245e-06, + "loss": 1.8787, + "mean_token_accuracy": 0.5673911571502686, + "num_tokens": 7832548025.0, + "step": 15323 + }, + { + "epoch": 4.143861546782045, + "grad_norm": 0.8713919520378113, + "learning_rate": 3.3497060772117652e-06, + "loss": 1.7772, + "mean_token_accuracy": 0.5814307928085327, + "num_tokens": 7833072227.0, + "step": 15324 + }, + { + "epoch": 4.144131963223364, + "grad_norm": 1.1028286218643188, + "learning_rate": 3.3488758104431252e-06, + "loss": 1.7935, + "mean_token_accuracy": 0.5637179613113403, + "num_tokens": 7833596441.0, + "step": 15325 + }, + { + "epoch": 4.144402379664683, + "grad_norm": 0.8653306365013123, + "learning_rate": 3.348045778433474e-06, + "loss": 1.8897, + "mean_token_accuracy": 0.5625890493392944, + "num_tokens": 7834112104.0, + "step": 15326 + }, + { + "epoch": 4.144672796106003, + "grad_norm": 1.1436926126480103, + "learning_rate": 3.347215981208281e-06, + "loss": 1.815, + "mean_token_accuracy": 0.589064359664917, + "num_tokens": 7834636258.0, + "step": 15327 + }, + { + "epoch": 4.1449432125473225, + "grad_norm": 0.950189471244812, + "learning_rate": 3.3463864187930083e-06, + "loss": 1.7203, + "mean_token_accuracy": 0.5923868417739868, + "num_tokens": 7835160435.0, + "step": 15328 + }, + { + "epoch": 4.1452136289886425, + "grad_norm": 0.8526005744934082, + "learning_rate": 3.3455570912131077e-06, + "loss": 1.8695, + "mean_token_accuracy": 0.571807324886322, + "num_tokens": 7835660506.0, + "step": 15329 + }, + { + "epoch": 4.145484045429962, + "grad_norm": 0.9522124528884888, + "learning_rate": 3.344727998494024e-06, + "loss": 1.6929, + "mean_token_accuracy": 0.5763218402862549, + "num_tokens": 7836184753.0, + "step": 15330 + }, + { + "epoch": 4.145754461871282, + "grad_norm": 0.9710083603858948, + "learning_rate": 3.3438991406612e-06, + "loss": 1.874, + "mean_token_accuracy": 0.5616135001182556, + "num_tokens": 7836708933.0, + "step": 15331 + }, + { + "epoch": 4.146024878312601, + "grad_norm": 0.9130182266235352, + "learning_rate": 3.343070517740063e-06, + "loss": 1.8288, + "mean_token_accuracy": 0.5795096158981323, + "num_tokens": 7837199492.0, + "step": 15332 + }, + { + "epoch": 4.146295294753921, + "grad_norm": 1.0207934379577637, + "learning_rate": 3.34224212975604e-06, + "loss": 1.8456, + "mean_token_accuracy": 0.5687105059623718, + "num_tokens": 7837723520.0, + "step": 15333 + }, + { + "epoch": 4.14656571119524, + "grad_norm": 0.9144989848136902, + "learning_rate": 3.3414139767345522e-06, + "loss": 1.7867, + "mean_token_accuracy": 0.5900967121124268, + "num_tokens": 7838247800.0, + "step": 15334 + }, + { + "epoch": 4.14683612763656, + "grad_norm": 0.9142890572547913, + "learning_rate": 3.3405860587010048e-06, + "loss": 1.703, + "mean_token_accuracy": 0.6137166023254395, + "num_tokens": 7838771989.0, + "step": 15335 + }, + { + "epoch": 4.14710654407788, + "grad_norm": 0.862173318862915, + "learning_rate": 3.3397583756807994e-06, + "loss": 1.8584, + "mean_token_accuracy": 0.568964421749115, + "num_tokens": 7839296150.0, + "step": 15336 + }, + { + "epoch": 4.1473769605192, + "grad_norm": 1.1370717287063599, + "learning_rate": 3.3389309276993375e-06, + "loss": 1.8982, + "mean_token_accuracy": 0.5612195730209351, + "num_tokens": 7839773564.0, + "step": 15337 + }, + { + "epoch": 4.147647376960519, + "grad_norm": 0.9371160268783569, + "learning_rate": 3.3381037147820016e-06, + "loss": 1.7459, + "mean_token_accuracy": 0.5670652389526367, + "num_tokens": 7840297678.0, + "step": 15338 + }, + { + "epoch": 4.147917793401839, + "grad_norm": 0.9305548071861267, + "learning_rate": 3.3372767369541785e-06, + "loss": 1.7553, + "mean_token_accuracy": 0.5799156427383423, + "num_tokens": 7840821901.0, + "step": 15339 + }, + { + "epoch": 4.148188209843158, + "grad_norm": 1.0121461153030396, + "learning_rate": 3.3364499942412395e-06, + "loss": 1.8189, + "mean_token_accuracy": 0.580382764339447, + "num_tokens": 7841346060.0, + "step": 15340 + }, + { + "epoch": 4.148458626284478, + "grad_norm": 0.41111016273498535, + "learning_rate": 3.335623486668549e-06, + "loss": 1.0422, + "mean_token_accuracy": 0.7152745127677917, + "num_tokens": 7841770871.0, + "step": 15341 + }, + { + "epoch": 4.1487290427257975, + "grad_norm": 0.811601459980011, + "learning_rate": 3.3347972142614732e-06, + "loss": 1.8512, + "mean_token_accuracy": 0.5820677280426025, + "num_tokens": 7842295046.0, + "step": 15342 + }, + { + "epoch": 4.1489994591671175, + "grad_norm": 0.9422959089279175, + "learning_rate": 3.3339711770453596e-06, + "loss": 1.7934, + "mean_token_accuracy": 0.5814619064331055, + "num_tokens": 7842819304.0, + "step": 15343 + }, + { + "epoch": 4.149269875608437, + "grad_norm": 0.8795710802078247, + "learning_rate": 3.333145375045553e-06, + "loss": 1.8771, + "mean_token_accuracy": 0.552320122718811, + "num_tokens": 7843343523.0, + "step": 15344 + }, + { + "epoch": 4.149540292049757, + "grad_norm": 0.8445996046066284, + "learning_rate": 3.332319808287394e-06, + "loss": 1.7527, + "mean_token_accuracy": 0.5894695520401001, + "num_tokens": 7843826410.0, + "step": 15345 + }, + { + "epoch": 4.149810708491076, + "grad_norm": 0.7923411726951599, + "learning_rate": 3.331494476796211e-06, + "loss": 1.7879, + "mean_token_accuracy": 0.6078367233276367, + "num_tokens": 7844290547.0, + "step": 15346 + }, + { + "epoch": 4.150081124932396, + "grad_norm": 0.9070187211036682, + "learning_rate": 3.33066938059733e-06, + "loss": 1.893, + "mean_token_accuracy": 0.5491801500320435, + "num_tokens": 7844814742.0, + "step": 15347 + }, + { + "epoch": 4.150351541373715, + "grad_norm": 0.7848578095436096, + "learning_rate": 3.3298445197160665e-06, + "loss": 1.8351, + "mean_token_accuracy": 0.5884569883346558, + "num_tokens": 7845339003.0, + "step": 15348 + }, + { + "epoch": 4.150621957815035, + "grad_norm": 0.8544716238975525, + "learning_rate": 3.3290198941777306e-06, + "loss": 1.7783, + "mean_token_accuracy": 0.5742676258087158, + "num_tokens": 7845863240.0, + "step": 15349 + }, + { + "epoch": 4.150892374256355, + "grad_norm": 0.9438462257385254, + "learning_rate": 3.328195504007622e-06, + "loss": 1.8069, + "mean_token_accuracy": 0.5820273160934448, + "num_tokens": 7846387335.0, + "step": 15350 + }, + { + "epoch": 4.151162790697675, + "grad_norm": 0.7918485999107361, + "learning_rate": 3.327371349231038e-06, + "loss": 1.8057, + "mean_token_accuracy": 0.5868320465087891, + "num_tokens": 7846911581.0, + "step": 15351 + }, + { + "epoch": 4.151433207138994, + "grad_norm": 1.0258257389068604, + "learning_rate": 3.3265474298732645e-06, + "loss": 1.9193, + "mean_token_accuracy": 0.5597765445709229, + "num_tokens": 7847435702.0, + "step": 15352 + }, + { + "epoch": 4.151703623580314, + "grad_norm": 0.7477353811264038, + "learning_rate": 3.325723745959581e-06, + "loss": 1.8237, + "mean_token_accuracy": 0.5795550346374512, + "num_tokens": 7847959862.0, + "step": 15353 + }, + { + "epoch": 4.151974040021633, + "grad_norm": 0.8512294292449951, + "learning_rate": 3.3249002975152623e-06, + "loss": 1.7346, + "mean_token_accuracy": 0.5964325666427612, + "num_tokens": 7848484139.0, + "step": 15354 + }, + { + "epoch": 4.152244456462953, + "grad_norm": 0.9810311198234558, + "learning_rate": 3.3240770845655736e-06, + "loss": 1.8079, + "mean_token_accuracy": 0.5804851055145264, + "num_tokens": 7848957223.0, + "step": 15355 + }, + { + "epoch": 4.1525148729042725, + "grad_norm": 0.8138778209686279, + "learning_rate": 3.3232541071357714e-06, + "loss": 1.7493, + "mean_token_accuracy": 0.5716502070426941, + "num_tokens": 7849473724.0, + "step": 15356 + }, + { + "epoch": 4.1527852893455925, + "grad_norm": 0.834823489189148, + "learning_rate": 3.322431365251112e-06, + "loss": 1.8703, + "mean_token_accuracy": 0.5792288780212402, + "num_tokens": 7849997924.0, + "step": 15357 + }, + { + "epoch": 4.153055705786912, + "grad_norm": 0.8476166129112244, + "learning_rate": 3.321608858936837e-06, + "loss": 1.7094, + "mean_token_accuracy": 0.6030305624008179, + "num_tokens": 7850522177.0, + "step": 15358 + }, + { + "epoch": 4.153326122228232, + "grad_norm": 0.8678502440452576, + "learning_rate": 3.320786588218179e-06, + "loss": 1.8575, + "mean_token_accuracy": 0.5845955014228821, + "num_tokens": 7851007979.0, + "step": 15359 + }, + { + "epoch": 4.153596538669551, + "grad_norm": 0.8390682339668274, + "learning_rate": 3.3199645531203727e-06, + "loss": 1.5619, + "mean_token_accuracy": 0.651411771774292, + "num_tokens": 7851520333.0, + "step": 15360 + }, + { + "epoch": 4.153866955110871, + "grad_norm": 0.41634708642959595, + "learning_rate": 3.3191427536686403e-06, + "loss": 1.1096, + "mean_token_accuracy": 0.7090774774551392, + "num_tokens": 7851977197.0, + "step": 15361 + }, + { + "epoch": 4.15413737155219, + "grad_norm": 1.0827502012252808, + "learning_rate": 3.318321189888195e-06, + "loss": 1.789, + "mean_token_accuracy": 0.5992189645767212, + "num_tokens": 7852501437.0, + "step": 15362 + }, + { + "epoch": 4.15440778799351, + "grad_norm": 0.9745176434516907, + "learning_rate": 3.3174998618042487e-06, + "loss": 1.795, + "mean_token_accuracy": 0.5694012641906738, + "num_tokens": 7853025660.0, + "step": 15363 + }, + { + "epoch": 4.15467820443483, + "grad_norm": 0.9786089658737183, + "learning_rate": 3.3166787694419957e-06, + "loss": 1.9561, + "mean_token_accuracy": 0.5497641563415527, + "num_tokens": 7853549845.0, + "step": 15364 + }, + { + "epoch": 4.15494862087615, + "grad_norm": 1.0068327188491821, + "learning_rate": 3.3158579128266365e-06, + "loss": 1.9035, + "mean_token_accuracy": 0.5695241689682007, + "num_tokens": 7854073974.0, + "step": 15365 + }, + { + "epoch": 4.155219037317469, + "grad_norm": 0.9681010842323303, + "learning_rate": 3.3150372919833535e-06, + "loss": 1.8474, + "mean_token_accuracy": 0.5960631370544434, + "num_tokens": 7854533769.0, + "step": 15366 + }, + { + "epoch": 4.155489453758788, + "grad_norm": 0.7971224784851074, + "learning_rate": 3.3142169069373255e-06, + "loss": 1.7917, + "mean_token_accuracy": 0.5827842354774475, + "num_tokens": 7855057959.0, + "step": 15367 + }, + { + "epoch": 4.155759870200108, + "grad_norm": 0.8810192346572876, + "learning_rate": 3.313396757713726e-06, + "loss": 1.7009, + "mean_token_accuracy": 0.6275455951690674, + "num_tokens": 7855582195.0, + "step": 15368 + }, + { + "epoch": 4.156030286641427, + "grad_norm": 1.0576910972595215, + "learning_rate": 3.312576844337719e-06, + "loss": 1.7399, + "mean_token_accuracy": 0.5663648247718811, + "num_tokens": 7856106427.0, + "step": 15369 + }, + { + "epoch": 4.1563007030827475, + "grad_norm": 0.7481170892715454, + "learning_rate": 3.3117571668344596e-06, + "loss": 1.7606, + "mean_token_accuracy": 0.5846562385559082, + "num_tokens": 7856630641.0, + "step": 15370 + }, + { + "epoch": 4.156571119524067, + "grad_norm": 0.9184351563453674, + "learning_rate": 3.3109377252291032e-06, + "loss": 1.8299, + "mean_token_accuracy": 0.5755614042282104, + "num_tokens": 7857099437.0, + "step": 15371 + }, + { + "epoch": 4.156841535965387, + "grad_norm": 0.7578520178794861, + "learning_rate": 3.310118519546788e-06, + "loss": 1.8489, + "mean_token_accuracy": 0.5789941549301147, + "num_tokens": 7857618927.0, + "step": 15372 + }, + { + "epoch": 4.157111952406706, + "grad_norm": 1.047379493713379, + "learning_rate": 3.309299549812649e-06, + "loss": 1.9437, + "mean_token_accuracy": 0.5743008255958557, + "num_tokens": 7858143183.0, + "step": 15373 + }, + { + "epoch": 4.157382368848026, + "grad_norm": 0.8320180773735046, + "learning_rate": 3.3084808160518177e-06, + "loss": 1.9251, + "mean_token_accuracy": 0.5586693286895752, + "num_tokens": 7858667379.0, + "step": 15374 + }, + { + "epoch": 4.157652785289345, + "grad_norm": 0.9032442569732666, + "learning_rate": 3.307662318289415e-06, + "loss": 1.8601, + "mean_token_accuracy": 0.5778038501739502, + "num_tokens": 7859141591.0, + "step": 15375 + }, + { + "epoch": 4.157923201730665, + "grad_norm": 0.7282091379165649, + "learning_rate": 3.306844056550553e-06, + "loss": 1.7679, + "mean_token_accuracy": 0.5925830006599426, + "num_tokens": 7859665777.0, + "step": 15376 + }, + { + "epoch": 4.1581936181719845, + "grad_norm": 0.8379108309745789, + "learning_rate": 3.3060260308603415e-06, + "loss": 1.8165, + "mean_token_accuracy": 0.5714830160140991, + "num_tokens": 7860189954.0, + "step": 15377 + }, + { + "epoch": 4.158464034613305, + "grad_norm": 1.0023045539855957, + "learning_rate": 3.305208241243878e-06, + "loss": 1.9266, + "mean_token_accuracy": 0.5581899285316467, + "num_tokens": 7860664254.0, + "step": 15378 + }, + { + "epoch": 4.158734451054624, + "grad_norm": 0.848373293876648, + "learning_rate": 3.3043906877262514e-06, + "loss": 1.763, + "mean_token_accuracy": 0.5776682496070862, + "num_tokens": 7861188475.0, + "step": 15379 + }, + { + "epoch": 4.159004867495944, + "grad_norm": 1.0256980657577515, + "learning_rate": 3.3035733703325534e-06, + "loss": 1.9347, + "mean_token_accuracy": 0.5636966228485107, + "num_tokens": 7861712698.0, + "step": 15380 + }, + { + "epoch": 4.159275283937263, + "grad_norm": 0.3613283634185791, + "learning_rate": 3.3027562890878573e-06, + "loss": 1.0926, + "mean_token_accuracy": 0.7073954939842224, + "num_tokens": 7862236734.0, + "step": 15381 + }, + { + "epoch": 4.159545700378583, + "grad_norm": 0.9142195582389832, + "learning_rate": 3.3019394440172312e-06, + "loss": 1.7829, + "mean_token_accuracy": 0.5785365104675293, + "num_tokens": 7862761005.0, + "step": 15382 + }, + { + "epoch": 4.159816116819902, + "grad_norm": 0.9845888018608093, + "learning_rate": 3.301122835145745e-06, + "loss": 1.789, + "mean_token_accuracy": 0.6115376353263855, + "num_tokens": 7863222287.0, + "step": 15383 + }, + { + "epoch": 4.1600865332612225, + "grad_norm": 0.9532093405723572, + "learning_rate": 3.300306462498449e-06, + "loss": 1.7471, + "mean_token_accuracy": 0.5848808884620667, + "num_tokens": 7863742600.0, + "step": 15384 + }, + { + "epoch": 4.160356949702542, + "grad_norm": 0.9138814806938171, + "learning_rate": 3.2994903261003965e-06, + "loss": 1.8133, + "mean_token_accuracy": 0.5635602474212646, + "num_tokens": 7864266733.0, + "step": 15385 + }, + { + "epoch": 4.160627366143862, + "grad_norm": 0.9446961879730225, + "learning_rate": 3.298674425976626e-06, + "loss": 1.5061, + "mean_token_accuracy": 0.6373605728149414, + "num_tokens": 7864761353.0, + "step": 15386 + }, + { + "epoch": 4.160897782585181, + "grad_norm": 0.9696152806282043, + "learning_rate": 3.2978587621521694e-06, + "loss": 1.9145, + "mean_token_accuracy": 0.565498948097229, + "num_tokens": 7865278180.0, + "step": 15387 + }, + { + "epoch": 4.161168199026501, + "grad_norm": 0.8386918902397156, + "learning_rate": 3.29704333465206e-06, + "loss": 1.81, + "mean_token_accuracy": 0.5830748081207275, + "num_tokens": 7865802455.0, + "step": 15388 + }, + { + "epoch": 4.16143861546782, + "grad_norm": 1.0089969635009766, + "learning_rate": 3.296228143501311e-06, + "loss": 1.8638, + "mean_token_accuracy": 0.5838065147399902, + "num_tokens": 7866326701.0, + "step": 15389 + }, + { + "epoch": 4.16170903190914, + "grad_norm": 1.032450556755066, + "learning_rate": 3.2954131887249376e-06, + "loss": 1.8765, + "mean_token_accuracy": 0.5679923295974731, + "num_tokens": 7866850867.0, + "step": 15390 + }, + { + "epoch": 4.1619794483504595, + "grad_norm": 1.0947747230529785, + "learning_rate": 3.2945984703479484e-06, + "loss": 1.8602, + "mean_token_accuracy": 0.59600430727005, + "num_tokens": 7867283021.0, + "step": 15391 + }, + { + "epoch": 4.16224986479178, + "grad_norm": 0.9993550777435303, + "learning_rate": 3.2937839883953383e-06, + "loss": 1.9024, + "mean_token_accuracy": 0.5591437816619873, + "num_tokens": 7867807177.0, + "step": 15392 + }, + { + "epoch": 4.162520281233099, + "grad_norm": 0.9087620377540588, + "learning_rate": 3.292969742892095e-06, + "loss": 1.8732, + "mean_token_accuracy": 0.5833513736724854, + "num_tokens": 7868293606.0, + "step": 15393 + }, + { + "epoch": 4.162790697674419, + "grad_norm": 0.7555052042007446, + "learning_rate": 3.2921557338632065e-06, + "loss": 1.9501, + "mean_token_accuracy": 0.5395205020904541, + "num_tokens": 7868817841.0, + "step": 15394 + }, + { + "epoch": 4.163061114115738, + "grad_norm": 1.0663000345230103, + "learning_rate": 3.291341961333648e-06, + "loss": 1.8238, + "mean_token_accuracy": 0.588343620300293, + "num_tokens": 7869342020.0, + "step": 15395 + }, + { + "epoch": 4.163331530557058, + "grad_norm": 0.8721553087234497, + "learning_rate": 3.2905284253283855e-06, + "loss": 1.8126, + "mean_token_accuracy": 0.5894368886947632, + "num_tokens": 7869846614.0, + "step": 15396 + }, + { + "epoch": 4.163601946998377, + "grad_norm": 0.9495638608932495, + "learning_rate": 3.2897151258723856e-06, + "loss": 1.8684, + "mean_token_accuracy": 0.5657171010971069, + "num_tokens": 7870370775.0, + "step": 15397 + }, + { + "epoch": 4.1638723634396975, + "grad_norm": 0.8604031801223755, + "learning_rate": 3.2889020629906004e-06, + "loss": 1.7768, + "mean_token_accuracy": 0.6018232107162476, + "num_tokens": 7870848003.0, + "step": 15398 + }, + { + "epoch": 4.164142779881017, + "grad_norm": 0.8461236357688904, + "learning_rate": 3.288089236707974e-06, + "loss": 1.8958, + "mean_token_accuracy": 0.5554784536361694, + "num_tokens": 7871340886.0, + "step": 15399 + }, + { + "epoch": 4.164413196322337, + "grad_norm": 1.0220547914505005, + "learning_rate": 3.287276647049451e-06, + "loss": 1.6937, + "mean_token_accuracy": 0.6256127953529358, + "num_tokens": 7871808985.0, + "step": 15400 + }, + { + "epoch": 4.164683612763656, + "grad_norm": 0.38840076327323914, + "learning_rate": 3.2864642940399627e-06, + "loss": 1.2166, + "mean_token_accuracy": 0.6706056594848633, + "num_tokens": 7872322793.0, + "step": 15401 + }, + { + "epoch": 4.164954029204976, + "grad_norm": 1.0582133531570435, + "learning_rate": 3.2856521777044313e-06, + "loss": 1.7665, + "mean_token_accuracy": 0.5725160837173462, + "num_tokens": 7872847035.0, + "step": 15402 + }, + { + "epoch": 4.165224445646295, + "grad_norm": 0.9817250967025757, + "learning_rate": 3.284840298067778e-06, + "loss": 1.7103, + "mean_token_accuracy": 0.5917022228240967, + "num_tokens": 7873371302.0, + "step": 15403 + }, + { + "epoch": 4.165494862087615, + "grad_norm": 1.0580995082855225, + "learning_rate": 3.284028655154915e-06, + "loss": 1.9252, + "mean_token_accuracy": 0.5476750731468201, + "num_tokens": 7873895547.0, + "step": 15404 + }, + { + "epoch": 4.1657652785289345, + "grad_norm": 0.9259724020957947, + "learning_rate": 3.2832172489907423e-06, + "loss": 1.876, + "mean_token_accuracy": 0.5715039968490601, + "num_tokens": 7874407083.0, + "step": 15405 + }, + { + "epoch": 4.166035694970255, + "grad_norm": 0.8601303100585938, + "learning_rate": 3.28240607960016e-06, + "loss": 1.8134, + "mean_token_accuracy": 0.5787395238876343, + "num_tokens": 7874931357.0, + "step": 15406 + }, + { + "epoch": 4.166306111411574, + "grad_norm": 0.9272674322128296, + "learning_rate": 3.2815951470080522e-06, + "loss": 1.8494, + "mean_token_accuracy": 0.5694679021835327, + "num_tokens": 7875455411.0, + "step": 15407 + }, + { + "epoch": 4.166576527852893, + "grad_norm": 0.9180485010147095, + "learning_rate": 3.2807844512393063e-06, + "loss": 1.9458, + "mean_token_accuracy": 0.5525371432304382, + "num_tokens": 7875973756.0, + "step": 15408 + }, + { + "epoch": 4.166846944294213, + "grad_norm": 0.8852477669715881, + "learning_rate": 3.279973992318794e-06, + "loss": 1.7577, + "mean_token_accuracy": 0.5831507444381714, + "num_tokens": 7876498026.0, + "step": 15409 + }, + { + "epoch": 4.167117360735532, + "grad_norm": 0.9029285311698914, + "learning_rate": 3.2791637702713813e-06, + "loss": 1.839, + "mean_token_accuracy": 0.5848799347877502, + "num_tokens": 7877022058.0, + "step": 15410 + }, + { + "epoch": 4.167387777176852, + "grad_norm": 1.0816794633865356, + "learning_rate": 3.2783537851219306e-06, + "loss": 1.8472, + "mean_token_accuracy": 0.5578233003616333, + "num_tokens": 7877546307.0, + "step": 15411 + }, + { + "epoch": 4.167658193618172, + "grad_norm": 1.2011163234710693, + "learning_rate": 3.277544036895295e-06, + "loss": 1.854, + "mean_token_accuracy": 0.5916714072227478, + "num_tokens": 7878041777.0, + "step": 15412 + }, + { + "epoch": 4.167928610059492, + "grad_norm": 0.8654545545578003, + "learning_rate": 3.2767345256163165e-06, + "loss": 1.8733, + "mean_token_accuracy": 0.5779871940612793, + "num_tokens": 7878566052.0, + "step": 15413 + }, + { + "epoch": 4.168199026500811, + "grad_norm": 0.9480677247047424, + "learning_rate": 3.2759252513098362e-06, + "loss": 1.8218, + "mean_token_accuracy": 0.5874828100204468, + "num_tokens": 7879090208.0, + "step": 15414 + }, + { + "epoch": 4.168469442942131, + "grad_norm": 0.8154512643814087, + "learning_rate": 3.2751162140006843e-06, + "loss": 1.7873, + "mean_token_accuracy": 0.5815095901489258, + "num_tokens": 7879614444.0, + "step": 15415 + }, + { + "epoch": 4.16873985938345, + "grad_norm": 0.7763612866401672, + "learning_rate": 3.274307413713683e-06, + "loss": 1.8731, + "mean_token_accuracy": 0.557570219039917, + "num_tokens": 7880138600.0, + "step": 15416 + }, + { + "epoch": 4.16901027582477, + "grad_norm": 1.2174150943756104, + "learning_rate": 3.2734988504736497e-06, + "loss": 1.7628, + "mean_token_accuracy": 0.5856244564056396, + "num_tokens": 7880662842.0, + "step": 15417 + }, + { + "epoch": 4.169280692266089, + "grad_norm": 0.8948718309402466, + "learning_rate": 3.272690524305396e-06, + "loss": 1.6926, + "mean_token_accuracy": 0.596813440322876, + "num_tokens": 7881187071.0, + "step": 15418 + }, + { + "epoch": 4.1695511087074095, + "grad_norm": 1.0532652139663696, + "learning_rate": 3.2718824352337196e-06, + "loss": 1.8165, + "mean_token_accuracy": 0.5736314058303833, + "num_tokens": 7881696804.0, + "step": 15419 + }, + { + "epoch": 4.169821525148729, + "grad_norm": 0.9628549814224243, + "learning_rate": 3.271074583283419e-06, + "loss": 1.8834, + "mean_token_accuracy": 0.5750362873077393, + "num_tokens": 7882172754.0, + "step": 15420 + }, + { + "epoch": 4.170091941590049, + "grad_norm": 0.3374008536338806, + "learning_rate": 3.27026696847928e-06, + "loss": 1.0793, + "mean_token_accuracy": 0.7033119797706604, + "num_tokens": 7882696767.0, + "step": 15421 + }, + { + "epoch": 4.170362358031368, + "grad_norm": 1.023397445678711, + "learning_rate": 3.269459590846079e-06, + "loss": 1.9033, + "mean_token_accuracy": 0.5550107955932617, + "num_tokens": 7883221046.0, + "step": 15422 + }, + { + "epoch": 4.170632774472688, + "grad_norm": 0.9369773268699646, + "learning_rate": 3.268652450408594e-06, + "loss": 1.8547, + "mean_token_accuracy": 0.5808812379837036, + "num_tokens": 7883745313.0, + "step": 15423 + }, + { + "epoch": 4.170903190914007, + "grad_norm": 1.0130244493484497, + "learning_rate": 3.267845547191589e-06, + "loss": 2.0411, + "mean_token_accuracy": 0.535997211933136, + "num_tokens": 7884269448.0, + "step": 15424 + }, + { + "epoch": 4.171173607355327, + "grad_norm": 0.7848594188690186, + "learning_rate": 3.2670388812198196e-06, + "loss": 1.7085, + "mean_token_accuracy": 0.6028742790222168, + "num_tokens": 7884793726.0, + "step": 15425 + }, + { + "epoch": 4.171444023796647, + "grad_norm": 1.0127718448638916, + "learning_rate": 3.2662324525180387e-06, + "loss": 1.8442, + "mean_token_accuracy": 0.5638731122016907, + "num_tokens": 7885317943.0, + "step": 15426 + }, + { + "epoch": 4.171714440237967, + "grad_norm": 0.926027238368988, + "learning_rate": 3.265426261110991e-06, + "loss": 1.7145, + "mean_token_accuracy": 0.5961524844169617, + "num_tokens": 7885842070.0, + "step": 15427 + }, + { + "epoch": 4.171984856679286, + "grad_norm": 0.8827148079872131, + "learning_rate": 3.2646203070234096e-06, + "loss": 1.8822, + "mean_token_accuracy": 0.571006178855896, + "num_tokens": 7886318888.0, + "step": 15428 + }, + { + "epoch": 4.172255273120606, + "grad_norm": 0.7718625664710999, + "learning_rate": 3.2638145902800267e-06, + "loss": 1.7977, + "mean_token_accuracy": 0.5702837705612183, + "num_tokens": 7886843145.0, + "step": 15429 + }, + { + "epoch": 4.172525689561925, + "grad_norm": 0.978289008140564, + "learning_rate": 3.263009110905561e-06, + "loss": 1.9424, + "mean_token_accuracy": 0.5643525719642639, + "num_tokens": 7887367383.0, + "step": 15430 + }, + { + "epoch": 4.172796106003245, + "grad_norm": 0.9936044216156006, + "learning_rate": 3.2622038689247316e-06, + "loss": 1.8494, + "mean_token_accuracy": 0.5751698017120361, + "num_tokens": 7887853776.0, + "step": 15431 + }, + { + "epoch": 4.173066522444564, + "grad_norm": 0.7989057302474976, + "learning_rate": 3.2613988643622395e-06, + "loss": 1.8189, + "mean_token_accuracy": 0.5831499099731445, + "num_tokens": 7888360166.0, + "step": 15432 + }, + { + "epoch": 4.1733369388858845, + "grad_norm": 0.8265247941017151, + "learning_rate": 3.2605940972427895e-06, + "loss": 1.7767, + "mean_token_accuracy": 0.5909698009490967, + "num_tokens": 7888829385.0, + "step": 15433 + }, + { + "epoch": 4.173607355327204, + "grad_norm": 0.9237574338912964, + "learning_rate": 3.2597895675910736e-06, + "loss": 1.8401, + "mean_token_accuracy": 0.5853113532066345, + "num_tokens": 7889324512.0, + "step": 15434 + }, + { + "epoch": 4.173877771768524, + "grad_norm": 0.8657374382019043, + "learning_rate": 3.2589852754317773e-06, + "loss": 1.7455, + "mean_token_accuracy": 0.5916717648506165, + "num_tokens": 7889848675.0, + "step": 15435 + }, + { + "epoch": 4.174148188209843, + "grad_norm": 0.7841292023658752, + "learning_rate": 3.2581812207895745e-06, + "loss": 1.7054, + "mean_token_accuracy": 0.5924205183982849, + "num_tokens": 7890372843.0, + "step": 15436 + }, + { + "epoch": 4.174418604651163, + "grad_norm": 0.818071186542511, + "learning_rate": 3.257377403689142e-06, + "loss": 1.7998, + "mean_token_accuracy": 0.5867190361022949, + "num_tokens": 7890897056.0, + "step": 15437 + }, + { + "epoch": 4.174689021092482, + "grad_norm": 0.7811592221260071, + "learning_rate": 3.2565738241551394e-06, + "loss": 1.7763, + "mean_token_accuracy": 0.5844959020614624, + "num_tokens": 7891421305.0, + "step": 15438 + }, + { + "epoch": 4.174959437533802, + "grad_norm": 0.8678499460220337, + "learning_rate": 3.2557704822122217e-06, + "loss": 1.7563, + "mean_token_accuracy": 0.59299635887146, + "num_tokens": 7891944439.0, + "step": 15439 + }, + { + "epoch": 4.175229853975122, + "grad_norm": 0.9149412512779236, + "learning_rate": 3.2549673778850434e-06, + "loss": 1.7199, + "mean_token_accuracy": 0.5797684192657471, + "num_tokens": 7892468619.0, + "step": 15440 + }, + { + "epoch": 4.175500270416442, + "grad_norm": 0.3316202461719513, + "learning_rate": 3.254164511198241e-06, + "loss": 1.0906, + "mean_token_accuracy": 0.6981416940689087, + "num_tokens": 7892992834.0, + "step": 15441 + }, + { + "epoch": 4.175770686857761, + "grad_norm": 1.1079604625701904, + "learning_rate": 3.25336188217645e-06, + "loss": 1.9191, + "mean_token_accuracy": 0.5414513349533081, + "num_tokens": 7893517054.0, + "step": 15442 + }, + { + "epoch": 4.176041103299081, + "grad_norm": 0.8571144938468933, + "learning_rate": 3.252559490844298e-06, + "loss": 1.7016, + "mean_token_accuracy": 0.5978642702102661, + "num_tokens": 7894041196.0, + "step": 15443 + }, + { + "epoch": 4.1763115197404, + "grad_norm": 0.8629007339477539, + "learning_rate": 3.251757337226404e-06, + "loss": 1.7789, + "mean_token_accuracy": 0.5849003195762634, + "num_tokens": 7894565359.0, + "step": 15444 + }, + { + "epoch": 4.17658193618172, + "grad_norm": 0.8732332587242126, + "learning_rate": 3.2509554213473825e-06, + "loss": 1.8447, + "mean_token_accuracy": 0.5799955725669861, + "num_tokens": 7895075762.0, + "step": 15445 + }, + { + "epoch": 4.176852352623039, + "grad_norm": 0.7832382321357727, + "learning_rate": 3.250153743231834e-06, + "loss": 1.7279, + "mean_token_accuracy": 0.6237876415252686, + "num_tokens": 7895535742.0, + "step": 15446 + }, + { + "epoch": 4.1771227690643595, + "grad_norm": 0.8445717692375183, + "learning_rate": 3.249352302904363e-06, + "loss": 1.8416, + "mean_token_accuracy": 0.5667555332183838, + "num_tokens": 7896059847.0, + "step": 15447 + }, + { + "epoch": 4.177393185505679, + "grad_norm": 0.759824275970459, + "learning_rate": 3.248551100389554e-06, + "loss": 1.7599, + "mean_token_accuracy": 0.5746523141860962, + "num_tokens": 7896584079.0, + "step": 15448 + }, + { + "epoch": 4.177663601946998, + "grad_norm": 0.7932407259941101, + "learning_rate": 3.247750135711995e-06, + "loss": 1.7828, + "mean_token_accuracy": 0.5909993648529053, + "num_tokens": 7897108343.0, + "step": 15449 + }, + { + "epoch": 4.177934018388318, + "grad_norm": 0.8380760550498962, + "learning_rate": 3.246949408896259e-06, + "loss": 1.7032, + "mean_token_accuracy": 0.5998022556304932, + "num_tokens": 7897579101.0, + "step": 15450 + }, + { + "epoch": 4.178204434829637, + "grad_norm": 0.815356969833374, + "learning_rate": 3.246148919966915e-06, + "loss": 1.8587, + "mean_token_accuracy": 0.5758835077285767, + "num_tokens": 7898100035.0, + "step": 15451 + }, + { + "epoch": 4.178474851270957, + "grad_norm": 0.7992878556251526, + "learning_rate": 3.2453486689485253e-06, + "loss": 1.7801, + "mean_token_accuracy": 0.5946030616760254, + "num_tokens": 7898624256.0, + "step": 15452 + }, + { + "epoch": 4.1787452677122765, + "grad_norm": 0.8488854169845581, + "learning_rate": 3.2445486558656414e-06, + "loss": 1.6977, + "mean_token_accuracy": 0.5926635265350342, + "num_tokens": 7899139642.0, + "step": 15453 + }, + { + "epoch": 4.179015684153597, + "grad_norm": 0.8302096128463745, + "learning_rate": 3.2437488807428143e-06, + "loss": 1.8257, + "mean_token_accuracy": 0.5701747536659241, + "num_tokens": 7899663842.0, + "step": 15454 + }, + { + "epoch": 4.179286100594916, + "grad_norm": 0.8937949538230896, + "learning_rate": 3.242949343604582e-06, + "loss": 1.7806, + "mean_token_accuracy": 0.5662689805030823, + "num_tokens": 7900142080.0, + "step": 15455 + }, + { + "epoch": 4.179556517036236, + "grad_norm": 0.925533652305603, + "learning_rate": 3.2421500444754727e-06, + "loss": 1.8919, + "mean_token_accuracy": 0.5744227170944214, + "num_tokens": 7900666198.0, + "step": 15456 + }, + { + "epoch": 4.179826933477555, + "grad_norm": 0.9961247444152832, + "learning_rate": 3.2413509833800164e-06, + "loss": 1.6281, + "mean_token_accuracy": 0.6294629573822021, + "num_tokens": 7901190249.0, + "step": 15457 + }, + { + "epoch": 4.180097349918875, + "grad_norm": 0.9319881200790405, + "learning_rate": 3.2405521603427286e-06, + "loss": 1.8214, + "mean_token_accuracy": 0.5841413736343384, + "num_tokens": 7901714423.0, + "step": 15458 + }, + { + "epoch": 4.180367766360194, + "grad_norm": 0.9991909265518188, + "learning_rate": 3.2397535753881183e-06, + "loss": 1.8594, + "mean_token_accuracy": 0.5731071829795837, + "num_tokens": 7902227413.0, + "step": 15459 + }, + { + "epoch": 4.1806381828015144, + "grad_norm": 0.8605247139930725, + "learning_rate": 3.2389552285406887e-06, + "loss": 1.8204, + "mean_token_accuracy": 0.5897256731987, + "num_tokens": 7902751670.0, + "step": 15460 + }, + { + "epoch": 4.180908599242834, + "grad_norm": 0.34596341848373413, + "learning_rate": 3.23815711982494e-06, + "loss": 1.1739, + "mean_token_accuracy": 0.6699411869049072, + "num_tokens": 7903275852.0, + "step": 15461 + }, + { + "epoch": 4.181179015684154, + "grad_norm": 0.8943729996681213, + "learning_rate": 3.237359249265354e-06, + "loss": 1.8909, + "mean_token_accuracy": 0.5566758513450623, + "num_tokens": 7903800000.0, + "step": 15462 + }, + { + "epoch": 4.181449432125473, + "grad_norm": 0.8969029784202576, + "learning_rate": 3.2365616168864168e-06, + "loss": 1.7612, + "mean_token_accuracy": 0.5753195285797119, + "num_tokens": 7904324272.0, + "step": 15463 + }, + { + "epoch": 4.181719848566793, + "grad_norm": 0.864512026309967, + "learning_rate": 3.235764222712599e-06, + "loss": 1.8601, + "mean_token_accuracy": 0.5864346027374268, + "num_tokens": 7904848396.0, + "step": 15464 + }, + { + "epoch": 4.181990265008112, + "grad_norm": 0.8959755897521973, + "learning_rate": 3.234967066768368e-06, + "loss": 1.6823, + "mean_token_accuracy": 0.5910057425498962, + "num_tokens": 7905372633.0, + "step": 15465 + }, + { + "epoch": 4.182260681449432, + "grad_norm": 0.8985204100608826, + "learning_rate": 3.2341701490781827e-06, + "loss": 1.8499, + "mean_token_accuracy": 0.5647569894790649, + "num_tokens": 7905874884.0, + "step": 15466 + }, + { + "epoch": 4.1825310978907515, + "grad_norm": 0.8405618071556091, + "learning_rate": 3.233373469666497e-06, + "loss": 1.7803, + "mean_token_accuracy": 0.5817154049873352, + "num_tokens": 7906398884.0, + "step": 15467 + }, + { + "epoch": 4.182801514332072, + "grad_norm": 0.9138949513435364, + "learning_rate": 3.2325770285577507e-06, + "loss": 1.8906, + "mean_token_accuracy": 0.5584987998008728, + "num_tokens": 7906923078.0, + "step": 15468 + }, + { + "epoch": 4.183071930773391, + "grad_norm": 0.908869206905365, + "learning_rate": 3.231780825776385e-06, + "loss": 1.9427, + "mean_token_accuracy": 0.5629251003265381, + "num_tokens": 7907400858.0, + "step": 15469 + }, + { + "epoch": 4.183342347214711, + "grad_norm": 0.8743758797645569, + "learning_rate": 3.2309848613468288e-06, + "loss": 1.8332, + "mean_token_accuracy": 0.5728316307067871, + "num_tokens": 7907925128.0, + "step": 15470 + }, + { + "epoch": 4.18361276365603, + "grad_norm": 1.067466378211975, + "learning_rate": 3.230189135293503e-06, + "loss": 1.6695, + "mean_token_accuracy": 0.6109667420387268, + "num_tokens": 7908449310.0, + "step": 15471 + }, + { + "epoch": 4.18388318009735, + "grad_norm": 0.7956824898719788, + "learning_rate": 3.2293936476408243e-06, + "loss": 1.5285, + "mean_token_accuracy": 0.6213274002075195, + "num_tokens": 7908948528.0, + "step": 15472 + }, + { + "epoch": 4.184153596538669, + "grad_norm": 0.7965124249458313, + "learning_rate": 3.2285983984132e-06, + "loss": 1.8426, + "mean_token_accuracy": 0.5758122205734253, + "num_tokens": 7909472769.0, + "step": 15473 + }, + { + "epoch": 4.1844240129799894, + "grad_norm": 0.941768229007721, + "learning_rate": 3.2278033876350303e-06, + "loss": 1.8935, + "mean_token_accuracy": 0.5547268986701965, + "num_tokens": 7909996922.0, + "step": 15474 + }, + { + "epoch": 4.184694429421309, + "grad_norm": 1.0515060424804688, + "learning_rate": 3.227008615330711e-06, + "loss": 1.7906, + "mean_token_accuracy": 0.5855216979980469, + "num_tokens": 7910492860.0, + "step": 15475 + }, + { + "epoch": 4.184964845862629, + "grad_norm": 0.9381214380264282, + "learning_rate": 3.226214081524625e-06, + "loss": 1.8846, + "mean_token_accuracy": 0.5813643336296082, + "num_tokens": 7910980423.0, + "step": 15476 + }, + { + "epoch": 4.185235262303948, + "grad_norm": 1.061229944229126, + "learning_rate": 3.225419786241153e-06, + "loss": 1.8935, + "mean_token_accuracy": 0.5606712102890015, + "num_tokens": 7911504671.0, + "step": 15477 + }, + { + "epoch": 4.185505678745268, + "grad_norm": 0.800725519657135, + "learning_rate": 3.2246257295046674e-06, + "loss": 1.8301, + "mean_token_accuracy": 0.5778616666793823, + "num_tokens": 7912028774.0, + "step": 15478 + }, + { + "epoch": 4.185776095186587, + "grad_norm": 0.9024913907051086, + "learning_rate": 3.2238319113395274e-06, + "loss": 1.8051, + "mean_token_accuracy": 0.5818620920181274, + "num_tokens": 7912493741.0, + "step": 15479 + }, + { + "epoch": 4.186046511627907, + "grad_norm": 0.9587734341621399, + "learning_rate": 3.223038331770094e-06, + "loss": 1.8784, + "mean_token_accuracy": 0.5862579345703125, + "num_tokens": 7912968019.0, + "step": 15480 + }, + { + "epoch": 4.1863169280692265, + "grad_norm": 0.3911513686180115, + "learning_rate": 3.2222449908207154e-06, + "loss": 1.0337, + "mean_token_accuracy": 0.7329698801040649, + "num_tokens": 7913421817.0, + "step": 15481 + }, + { + "epoch": 4.186587344510547, + "grad_norm": 1.0842097997665405, + "learning_rate": 3.2214518885157316e-06, + "loss": 1.8805, + "mean_token_accuracy": 0.578348696231842, + "num_tokens": 7913919394.0, + "step": 15482 + }, + { + "epoch": 4.186857760951866, + "grad_norm": 1.1196742057800293, + "learning_rate": 3.220659024879481e-06, + "loss": 1.8022, + "mean_token_accuracy": 0.5845979452133179, + "num_tokens": 7914443627.0, + "step": 15483 + }, + { + "epoch": 4.187128177393186, + "grad_norm": 0.9047033786773682, + "learning_rate": 3.2198663999362895e-06, + "loss": 1.9294, + "mean_token_accuracy": 0.5786373615264893, + "num_tokens": 7914904471.0, + "step": 15484 + }, + { + "epoch": 4.187398593834505, + "grad_norm": 0.8240403532981873, + "learning_rate": 3.219074013710474e-06, + "loss": 1.806, + "mean_token_accuracy": 0.5590859651565552, + "num_tokens": 7915428585.0, + "step": 15485 + }, + { + "epoch": 4.187669010275825, + "grad_norm": 0.9454014301300049, + "learning_rate": 3.2182818662263524e-06, + "loss": 1.8997, + "mean_token_accuracy": 0.5593971014022827, + "num_tokens": 7915939885.0, + "step": 15486 + }, + { + "epoch": 4.187939426717144, + "grad_norm": 1.1702157258987427, + "learning_rate": 3.217489957508225e-06, + "loss": 1.8069, + "mean_token_accuracy": 0.5816754102706909, + "num_tokens": 7916464026.0, + "step": 15487 + }, + { + "epoch": 4.1882098431584645, + "grad_norm": 1.1162102222442627, + "learning_rate": 3.2166982875803953e-06, + "loss": 1.8336, + "mean_token_accuracy": 0.5889888405799866, + "num_tokens": 7916943682.0, + "step": 15488 + }, + { + "epoch": 4.188480259599784, + "grad_norm": 0.9370297789573669, + "learning_rate": 3.215906856467149e-06, + "loss": 1.9152, + "mean_token_accuracy": 0.5651674270629883, + "num_tokens": 7917467897.0, + "step": 15489 + }, + { + "epoch": 4.188750676041103, + "grad_norm": 1.139618992805481, + "learning_rate": 3.2151156641927737e-06, + "loss": 1.7096, + "mean_token_accuracy": 0.599333643913269, + "num_tokens": 7917992101.0, + "step": 15490 + }, + { + "epoch": 4.189021092482423, + "grad_norm": 0.9484391808509827, + "learning_rate": 3.214324710781543e-06, + "loss": 1.8186, + "mean_token_accuracy": 0.5984280705451965, + "num_tokens": 7918516369.0, + "step": 15491 + }, + { + "epoch": 4.189291508923742, + "grad_norm": 0.9384584426879883, + "learning_rate": 3.2135339962577266e-06, + "loss": 1.7412, + "mean_token_accuracy": 0.5917701125144958, + "num_tokens": 7919040397.0, + "step": 15492 + }, + { + "epoch": 4.189561925365062, + "grad_norm": 1.029439926147461, + "learning_rate": 3.212743520645587e-06, + "loss": 1.8789, + "mean_token_accuracy": 0.5547306537628174, + "num_tokens": 7919564557.0, + "step": 15493 + }, + { + "epoch": 4.189832341806381, + "grad_norm": 1.0512362718582153, + "learning_rate": 3.211953283969374e-06, + "loss": 1.7969, + "mean_token_accuracy": 0.6035717129707336, + "num_tokens": 7920088585.0, + "step": 15494 + }, + { + "epoch": 4.1901027582477015, + "grad_norm": 0.9317050576210022, + "learning_rate": 3.21116328625334e-06, + "loss": 1.8221, + "mean_token_accuracy": 0.5672165751457214, + "num_tokens": 7920553174.0, + "step": 15495 + }, + { + "epoch": 4.190373174689021, + "grad_norm": 0.8779667615890503, + "learning_rate": 3.210373527521722e-06, + "loss": 1.8847, + "mean_token_accuracy": 0.5664210319519043, + "num_tokens": 7921077394.0, + "step": 15496 + }, + { + "epoch": 4.190643591130341, + "grad_norm": 0.8965930342674255, + "learning_rate": 3.2095840077987502e-06, + "loss": 1.8104, + "mean_token_accuracy": 0.5724270343780518, + "num_tokens": 7921601533.0, + "step": 15497 + }, + { + "epoch": 4.19091400757166, + "grad_norm": 0.9887931942939758, + "learning_rate": 3.2087947271086524e-06, + "loss": 1.7798, + "mean_token_accuracy": 0.5833746194839478, + "num_tokens": 7922125723.0, + "step": 15498 + }, + { + "epoch": 4.19118442401298, + "grad_norm": 1.02891206741333, + "learning_rate": 3.208005685475643e-06, + "loss": 1.7751, + "mean_token_accuracy": 0.5938624143600464, + "num_tokens": 7922649990.0, + "step": 15499 + }, + { + "epoch": 4.191454840454299, + "grad_norm": 1.0446492433547974, + "learning_rate": 3.207216882923935e-06, + "loss": 1.8767, + "mean_token_accuracy": 0.5828876495361328, + "num_tokens": 7923174169.0, + "step": 15500 + }, + { + "epoch": 4.191725256895619, + "grad_norm": 0.33367639780044556, + "learning_rate": 3.206428319477732e-06, + "loss": 1.097, + "mean_token_accuracy": 0.7088121175765991, + "num_tokens": 7923698443.0, + "step": 15501 + }, + { + "epoch": 4.191995673336939, + "grad_norm": 0.933326005935669, + "learning_rate": 3.205639995161225e-06, + "loss": 1.8948, + "mean_token_accuracy": 0.5808738470077515, + "num_tokens": 7924183912.0, + "step": 15502 + }, + { + "epoch": 4.192266089778259, + "grad_norm": 0.9978241324424744, + "learning_rate": 3.2048519099986044e-06, + "loss": 1.7986, + "mean_token_accuracy": 0.5862921476364136, + "num_tokens": 7924708189.0, + "step": 15503 + }, + { + "epoch": 4.192536506219578, + "grad_norm": 0.9289114475250244, + "learning_rate": 3.204064064014053e-06, + "loss": 1.794, + "mean_token_accuracy": 0.5828832387924194, + "num_tokens": 7925214235.0, + "step": 15504 + }, + { + "epoch": 4.192806922660898, + "grad_norm": 0.8490204215049744, + "learning_rate": 3.2032764572317397e-06, + "loss": 1.8378, + "mean_token_accuracy": 0.5751782059669495, + "num_tokens": 7925738329.0, + "step": 15505 + }, + { + "epoch": 4.193077339102217, + "grad_norm": 0.8615307807922363, + "learning_rate": 3.202489089675838e-06, + "loss": 1.8222, + "mean_token_accuracy": 0.5779592990875244, + "num_tokens": 7926262597.0, + "step": 15506 + }, + { + "epoch": 4.193347755543537, + "grad_norm": 0.8340576887130737, + "learning_rate": 3.201701961370499e-06, + "loss": 1.8388, + "mean_token_accuracy": 0.5792994499206543, + "num_tokens": 7926786774.0, + "step": 15507 + }, + { + "epoch": 4.193618171984856, + "grad_norm": 0.9094356894493103, + "learning_rate": 3.2009150723398772e-06, + "loss": 1.9086, + "mean_token_accuracy": 0.559180736541748, + "num_tokens": 7927269551.0, + "step": 15508 + }, + { + "epoch": 4.1938885884261765, + "grad_norm": 0.8244717717170715, + "learning_rate": 3.200128422608118e-06, + "loss": 1.8227, + "mean_token_accuracy": 0.5793864727020264, + "num_tokens": 7927793830.0, + "step": 15509 + }, + { + "epoch": 4.194159004867496, + "grad_norm": 0.8840101957321167, + "learning_rate": 3.1993420121993555e-06, + "loss": 1.5487, + "mean_token_accuracy": 0.6410004496574402, + "num_tokens": 7928276208.0, + "step": 15510 + }, + { + "epoch": 4.194429421308816, + "grad_norm": 0.8358547687530518, + "learning_rate": 3.1985558411377203e-06, + "loss": 1.8862, + "mean_token_accuracy": 0.5645429491996765, + "num_tokens": 7928800493.0, + "step": 15511 + }, + { + "epoch": 4.194699837750135, + "grad_norm": 0.8814243674278259, + "learning_rate": 3.197769909447335e-06, + "loss": 1.7836, + "mean_token_accuracy": 0.5711016058921814, + "num_tokens": 7929324664.0, + "step": 15512 + }, + { + "epoch": 4.194970254191455, + "grad_norm": 0.9693335890769958, + "learning_rate": 3.1969842171523146e-06, + "loss": 1.8429, + "mean_token_accuracy": 0.5626790523529053, + "num_tokens": 7929834817.0, + "step": 15513 + }, + { + "epoch": 4.195240670632774, + "grad_norm": 0.8588262796401978, + "learning_rate": 3.1961987642767635e-06, + "loss": 1.7143, + "mean_token_accuracy": 0.5999071598052979, + "num_tokens": 7930359026.0, + "step": 15514 + }, + { + "epoch": 4.195511087074094, + "grad_norm": 0.8350685834884644, + "learning_rate": 3.1954135508447855e-06, + "loss": 1.7514, + "mean_token_accuracy": 0.592553973197937, + "num_tokens": 7930883289.0, + "step": 15515 + }, + { + "epoch": 4.195781503515414, + "grad_norm": 0.8555416464805603, + "learning_rate": 3.1946285768804698e-06, + "loss": 1.8886, + "mean_token_accuracy": 0.5675160884857178, + "num_tokens": 7931407562.0, + "step": 15516 + }, + { + "epoch": 4.196051919956734, + "grad_norm": 0.7465729713439941, + "learning_rate": 3.1938438424079025e-06, + "loss": 1.7701, + "mean_token_accuracy": 0.590246856212616, + "num_tokens": 7931931745.0, + "step": 15517 + }, + { + "epoch": 4.196322336398053, + "grad_norm": 0.9103558659553528, + "learning_rate": 3.1930593474511657e-06, + "loss": 1.8797, + "mean_token_accuracy": 0.575128436088562, + "num_tokens": 7932455968.0, + "step": 15518 + }, + { + "epoch": 4.196592752839373, + "grad_norm": 0.8655238747596741, + "learning_rate": 3.1922750920343266e-06, + "loss": 1.7587, + "mean_token_accuracy": 0.5946163535118103, + "num_tokens": 7932937902.0, + "step": 15519 + }, + { + "epoch": 4.196863169280692, + "grad_norm": 0.8339836001396179, + "learning_rate": 3.1914910761814464e-06, + "loss": 1.651, + "mean_token_accuracy": 0.6356610059738159, + "num_tokens": 7933434729.0, + "step": 15520 + }, + { + "epoch": 4.197133585722012, + "grad_norm": 0.34053751826286316, + "learning_rate": 3.1907072999165855e-06, + "loss": 1.0215, + "mean_token_accuracy": 0.7193163633346558, + "num_tokens": 7933958946.0, + "step": 15521 + }, + { + "epoch": 4.197404002163331, + "grad_norm": 0.9229127764701843, + "learning_rate": 3.1899237632637873e-06, + "loss": 1.8748, + "mean_token_accuracy": 0.5555859208106995, + "num_tokens": 7934483099.0, + "step": 15522 + }, + { + "epoch": 4.1976744186046515, + "grad_norm": 0.8019940257072449, + "learning_rate": 3.1891404662471e-06, + "loss": 1.8148, + "mean_token_accuracy": 0.5671330094337463, + "num_tokens": 7935007367.0, + "step": 15523 + }, + { + "epoch": 4.197944835045971, + "grad_norm": 1.1576898097991943, + "learning_rate": 3.1883574088905517e-06, + "loss": 1.6613, + "mean_token_accuracy": 0.5868774056434631, + "num_tokens": 7935531641.0, + "step": 15524 + }, + { + "epoch": 4.198215251487291, + "grad_norm": 0.8229277729988098, + "learning_rate": 3.1875745912181695e-06, + "loss": 1.7727, + "mean_token_accuracy": 0.5836122035980225, + "num_tokens": 7936055796.0, + "step": 15525 + }, + { + "epoch": 4.19848566792861, + "grad_norm": 0.8444176316261292, + "learning_rate": 3.1867920132539755e-06, + "loss": 1.9164, + "mean_token_accuracy": 0.5758326053619385, + "num_tokens": 7936528126.0, + "step": 15526 + }, + { + "epoch": 4.19875608436993, + "grad_norm": 0.8289045095443726, + "learning_rate": 3.1860096750219798e-06, + "loss": 1.7408, + "mean_token_accuracy": 0.5891995429992676, + "num_tokens": 7937052401.0, + "step": 15527 + }, + { + "epoch": 4.199026500811249, + "grad_norm": 0.8793628215789795, + "learning_rate": 3.1852275765461847e-06, + "loss": 1.7202, + "mean_token_accuracy": 0.5943650603294373, + "num_tokens": 7937540454.0, + "step": 15528 + }, + { + "epoch": 4.199296917252569, + "grad_norm": 1.0048154592514038, + "learning_rate": 3.184445717850591e-06, + "loss": 1.821, + "mean_token_accuracy": 0.5810507535934448, + "num_tokens": 7938010664.0, + "step": 15529 + }, + { + "epoch": 4.199567333693889, + "grad_norm": 0.7840017676353455, + "learning_rate": 3.183664098959185e-06, + "loss": 1.6129, + "mean_token_accuracy": 0.6384332180023193, + "num_tokens": 7938481841.0, + "step": 15530 + }, + { + "epoch": 4.199837750135208, + "grad_norm": 0.8640189170837402, + "learning_rate": 3.1828827198959522e-06, + "loss": 1.863, + "mean_token_accuracy": 0.5700849294662476, + "num_tokens": 7939006073.0, + "step": 15531 + }, + { + "epoch": 4.200108166576528, + "grad_norm": 0.8710561394691467, + "learning_rate": 3.182101580684865e-06, + "loss": 1.885, + "mean_token_accuracy": 0.5602842569351196, + "num_tokens": 7939530320.0, + "step": 15532 + }, + { + "epoch": 4.200378583017847, + "grad_norm": 1.313667893409729, + "learning_rate": 3.1813206813498944e-06, + "loss": 1.903, + "mean_token_accuracy": 0.5473076105117798, + "num_tokens": 7940054541.0, + "step": 15533 + }, + { + "epoch": 4.200648999459167, + "grad_norm": 0.8014974594116211, + "learning_rate": 3.180540021914995e-06, + "loss": 1.7985, + "mean_token_accuracy": 0.577621340751648, + "num_tokens": 7940578720.0, + "step": 15534 + }, + { + "epoch": 4.200919415900486, + "grad_norm": 0.8788586854934692, + "learning_rate": 3.1797596024041273e-06, + "loss": 1.8022, + "mean_token_accuracy": 0.5822133421897888, + "num_tokens": 7941102990.0, + "step": 15535 + }, + { + "epoch": 4.201189832341806, + "grad_norm": 1.0276525020599365, + "learning_rate": 3.1789794228412317e-06, + "loss": 1.7782, + "mean_token_accuracy": 0.584927499294281, + "num_tokens": 7941627134.0, + "step": 15536 + }, + { + "epoch": 4.201460248783126, + "grad_norm": 0.9475459456443787, + "learning_rate": 3.178199483250245e-06, + "loss": 1.8851, + "mean_token_accuracy": 0.5616271495819092, + "num_tokens": 7942151336.0, + "step": 15537 + }, + { + "epoch": 4.201730665224446, + "grad_norm": 0.9401665329933167, + "learning_rate": 3.177419783655104e-06, + "loss": 1.7846, + "mean_token_accuracy": 0.5774518251419067, + "num_tokens": 7942675477.0, + "step": 15538 + }, + { + "epoch": 4.202001081665765, + "grad_norm": 0.8455514311790466, + "learning_rate": 3.176640324079728e-06, + "loss": 1.7247, + "mean_token_accuracy": 0.5869524478912354, + "num_tokens": 7943199666.0, + "step": 15539 + }, + { + "epoch": 4.202271498107085, + "grad_norm": 0.7835617065429688, + "learning_rate": 3.175861104548032e-06, + "loss": 1.7206, + "mean_token_accuracy": 0.6014971733093262, + "num_tokens": 7943723870.0, + "step": 15540 + }, + { + "epoch": 4.202541914548404, + "grad_norm": 0.34819674491882324, + "learning_rate": 3.1750821250839276e-06, + "loss": 1.0828, + "mean_token_accuracy": 0.705008864402771, + "num_tokens": 7944248055.0, + "step": 15541 + }, + { + "epoch": 4.202812330989724, + "grad_norm": 1.2966203689575195, + "learning_rate": 3.174303385711316e-06, + "loss": 1.8817, + "mean_token_accuracy": 0.5532292127609253, + "num_tokens": 7944772205.0, + "step": 15542 + }, + { + "epoch": 4.2030827474310435, + "grad_norm": 1.1349260807037354, + "learning_rate": 3.173524886454089e-06, + "loss": 1.7875, + "mean_token_accuracy": 0.5829565525054932, + "num_tokens": 7945296413.0, + "step": 15543 + }, + { + "epoch": 4.203353163872364, + "grad_norm": 0.860355794429779, + "learning_rate": 3.172746627336134e-06, + "loss": 1.6213, + "mean_token_accuracy": 0.6122173070907593, + "num_tokens": 7945769026.0, + "step": 15544 + }, + { + "epoch": 4.203623580313683, + "grad_norm": 0.8252685070037842, + "learning_rate": 3.171968608381333e-06, + "loss": 1.8826, + "mean_token_accuracy": 0.5678681135177612, + "num_tokens": 7946293268.0, + "step": 15545 + }, + { + "epoch": 4.203893996755003, + "grad_norm": 1.0437285900115967, + "learning_rate": 3.171190829613554e-06, + "loss": 1.7929, + "mean_token_accuracy": 0.5862149596214294, + "num_tokens": 7946723890.0, + "step": 15546 + }, + { + "epoch": 4.204164413196322, + "grad_norm": 0.8690840005874634, + "learning_rate": 3.1704132910566654e-06, + "loss": 1.7103, + "mean_token_accuracy": 0.6112524271011353, + "num_tokens": 7947248060.0, + "step": 15547 + }, + { + "epoch": 4.204434829637642, + "grad_norm": 1.0426180362701416, + "learning_rate": 3.16963599273452e-06, + "loss": 1.9019, + "mean_token_accuracy": 0.5748316645622253, + "num_tokens": 7947724042.0, + "step": 15548 + }, + { + "epoch": 4.204705246078961, + "grad_norm": 0.9788483381271362, + "learning_rate": 3.168858934670972e-06, + "loss": 1.8594, + "mean_token_accuracy": 0.593948245048523, + "num_tokens": 7948110369.0, + "step": 15549 + }, + { + "epoch": 4.204975662520281, + "grad_norm": 0.8439491987228394, + "learning_rate": 3.1680821168898625e-06, + "loss": 1.7407, + "mean_token_accuracy": 0.603736937046051, + "num_tokens": 7948634486.0, + "step": 15550 + }, + { + "epoch": 4.205246078961601, + "grad_norm": 0.9523175358772278, + "learning_rate": 3.1673055394150224e-06, + "loss": 1.8838, + "mean_token_accuracy": 0.5600578784942627, + "num_tokens": 7949143243.0, + "step": 15551 + }, + { + "epoch": 4.205516495402921, + "grad_norm": 0.9235263466835022, + "learning_rate": 3.166529202270285e-06, + "loss": 1.781, + "mean_token_accuracy": 0.5822243690490723, + "num_tokens": 7949667510.0, + "step": 15552 + }, + { + "epoch": 4.20578691184424, + "grad_norm": 0.8108153343200684, + "learning_rate": 3.16575310547947e-06, + "loss": 1.8766, + "mean_token_accuracy": 0.5697996616363525, + "num_tokens": 7950191569.0, + "step": 15553 + }, + { + "epoch": 4.20605732828556, + "grad_norm": 0.7944639325141907, + "learning_rate": 3.1649772490663855e-06, + "loss": 1.7618, + "mean_token_accuracy": 0.5865899324417114, + "num_tokens": 7950715847.0, + "step": 15554 + }, + { + "epoch": 4.206327744726879, + "grad_norm": 1.0545545816421509, + "learning_rate": 3.164201633054843e-06, + "loss": 1.8662, + "mean_token_accuracy": 0.5826809406280518, + "num_tokens": 7951126847.0, + "step": 15555 + }, + { + "epoch": 4.206598161168199, + "grad_norm": 1.059248924255371, + "learning_rate": 3.1634262574686374e-06, + "loss": 1.7768, + "mean_token_accuracy": 0.59361332654953, + "num_tokens": 7951650897.0, + "step": 15556 + }, + { + "epoch": 4.2068685776095185, + "grad_norm": 0.8653879761695862, + "learning_rate": 3.162651122331558e-06, + "loss": 1.786, + "mean_token_accuracy": 0.575811505317688, + "num_tokens": 7952175132.0, + "step": 15557 + }, + { + "epoch": 4.207138994050839, + "grad_norm": 0.8517701625823975, + "learning_rate": 3.161876227667393e-06, + "loss": 1.7199, + "mean_token_accuracy": 0.6068382263183594, + "num_tokens": 7952645763.0, + "step": 15558 + }, + { + "epoch": 4.207409410492158, + "grad_norm": 0.9190499186515808, + "learning_rate": 3.1611015734999123e-06, + "loss": 1.7488, + "mean_token_accuracy": 0.5905181169509888, + "num_tokens": 7953169994.0, + "step": 15559 + }, + { + "epoch": 4.207679826933478, + "grad_norm": 0.9371936917304993, + "learning_rate": 3.1603271598528893e-06, + "loss": 1.8498, + "mean_token_accuracy": 0.5818871259689331, + "num_tokens": 7953694173.0, + "step": 15560 + }, + { + "epoch": 4.207950243374797, + "grad_norm": 0.3690279424190521, + "learning_rate": 3.1595529867500863e-06, + "loss": 1.1383, + "mean_token_accuracy": 0.6906436681747437, + "num_tokens": 7954218448.0, + "step": 15561 + }, + { + "epoch": 4.208220659816117, + "grad_norm": 0.9819737076759338, + "learning_rate": 3.158779054215254e-06, + "loss": 1.8174, + "mean_token_accuracy": 0.6069852113723755, + "num_tokens": 7954662403.0, + "step": 15562 + }, + { + "epoch": 4.208491076257436, + "grad_norm": 0.9652633666992188, + "learning_rate": 3.158005362272138e-06, + "loss": 1.8311, + "mean_token_accuracy": 0.5801422595977783, + "num_tokens": 7955186533.0, + "step": 15563 + }, + { + "epoch": 4.208761492698756, + "grad_norm": 0.927842915058136, + "learning_rate": 3.1572319109444815e-06, + "loss": 1.8142, + "mean_token_accuracy": 0.620979905128479, + "num_tokens": 7955604278.0, + "step": 15564 + }, + { + "epoch": 4.209031909140076, + "grad_norm": 3.9670279026031494, + "learning_rate": 3.1564587002560143e-06, + "loss": 1.6451, + "mean_token_accuracy": 0.6303404569625854, + "num_tokens": 7956088450.0, + "step": 15565 + }, + { + "epoch": 4.209302325581396, + "grad_norm": 1.0964223146438599, + "learning_rate": 3.155685730230459e-06, + "loss": 1.9594, + "mean_token_accuracy": 0.5626185536384583, + "num_tokens": 7956596150.0, + "step": 15566 + }, + { + "epoch": 4.209572742022715, + "grad_norm": 1.0241889953613281, + "learning_rate": 3.1549130008915346e-06, + "loss": 1.8565, + "mean_token_accuracy": 0.5759811401367188, + "num_tokens": 7957120261.0, + "step": 15567 + }, + { + "epoch": 4.209843158464035, + "grad_norm": 1.0152361392974854, + "learning_rate": 3.15414051226295e-06, + "loss": 1.7808, + "mean_token_accuracy": 0.5851998329162598, + "num_tokens": 7957638743.0, + "step": 15568 + }, + { + "epoch": 4.210113574905354, + "grad_norm": 0.8168857097625732, + "learning_rate": 3.1533682643684083e-06, + "loss": 1.9234, + "mean_token_accuracy": 0.5419604778289795, + "num_tokens": 7958162739.0, + "step": 15569 + }, + { + "epoch": 4.210383991346674, + "grad_norm": 0.7848456501960754, + "learning_rate": 3.1525962572316033e-06, + "loss": 1.7768, + "mean_token_accuracy": 0.5745117664337158, + "num_tokens": 7958686929.0, + "step": 15570 + }, + { + "epoch": 4.2106544077879935, + "grad_norm": 0.8890524506568909, + "learning_rate": 3.151824490876222e-06, + "loss": 1.8822, + "mean_token_accuracy": 0.5710111856460571, + "num_tokens": 7959211094.0, + "step": 15571 + }, + { + "epoch": 4.210924824229313, + "grad_norm": 1.040418028831482, + "learning_rate": 3.151052965325947e-06, + "loss": 1.819, + "mean_token_accuracy": 0.5758394002914429, + "num_tokens": 7959735106.0, + "step": 15572 + }, + { + "epoch": 4.211195240670633, + "grad_norm": 0.86070317029953, + "learning_rate": 3.1502816806044467e-06, + "loss": 1.8725, + "mean_token_accuracy": 0.5662543177604675, + "num_tokens": 7960259322.0, + "step": 15573 + }, + { + "epoch": 4.211465657111952, + "grad_norm": 0.919019341468811, + "learning_rate": 3.1495106367353916e-06, + "loss": 1.7993, + "mean_token_accuracy": 0.5906800627708435, + "num_tokens": 7960783363.0, + "step": 15574 + }, + { + "epoch": 4.211736073553272, + "grad_norm": 0.904419481754303, + "learning_rate": 3.1487398337424354e-06, + "loss": 1.7782, + "mean_token_accuracy": 0.5803214311599731, + "num_tokens": 7961307558.0, + "step": 15575 + }, + { + "epoch": 4.212006489994591, + "grad_norm": 1.046125054359436, + "learning_rate": 3.1479692716492315e-06, + "loss": 1.7583, + "mean_token_accuracy": 0.5874662399291992, + "num_tokens": 7961798866.0, + "step": 15576 + }, + { + "epoch": 4.212276906435911, + "grad_norm": 0.8237900733947754, + "learning_rate": 3.1471989504794197e-06, + "loss": 1.827, + "mean_token_accuracy": 0.573654055595398, + "num_tokens": 7962323112.0, + "step": 15577 + }, + { + "epoch": 4.2125473228772305, + "grad_norm": 0.7837530970573425, + "learning_rate": 3.146428870256639e-06, + "loss": 1.8136, + "mean_token_accuracy": 0.5799816846847534, + "num_tokens": 7962847392.0, + "step": 15578 + }, + { + "epoch": 4.212817739318551, + "grad_norm": 0.8469160795211792, + "learning_rate": 3.145659031004517e-06, + "loss": 1.8662, + "mean_token_accuracy": 0.5785696506500244, + "num_tokens": 7963363212.0, + "step": 15579 + }, + { + "epoch": 4.21308815575987, + "grad_norm": 0.9305585026741028, + "learning_rate": 3.1448894327466726e-06, + "loss": 1.8207, + "mean_token_accuracy": 0.5914331078529358, + "num_tokens": 7963887338.0, + "step": 15580 + }, + { + "epoch": 4.21335857220119, + "grad_norm": 0.3441467881202698, + "learning_rate": 3.1441200755067216e-06, + "loss": 1.0472, + "mean_token_accuracy": 0.716853678226471, + "num_tokens": 7964411540.0, + "step": 15581 + }, + { + "epoch": 4.213628988642509, + "grad_norm": 1.1054738759994507, + "learning_rate": 3.143350959308271e-06, + "loss": 1.7568, + "mean_token_accuracy": 0.5982095003128052, + "num_tokens": 7964892303.0, + "step": 15582 + }, + { + "epoch": 4.213899405083829, + "grad_norm": 0.9318658113479614, + "learning_rate": 3.1425820841749144e-06, + "loss": 1.7781, + "mean_token_accuracy": 0.5855902433395386, + "num_tokens": 7965416530.0, + "step": 15583 + }, + { + "epoch": 4.214169821525148, + "grad_norm": 0.8474789261817932, + "learning_rate": 3.1418134501302487e-06, + "loss": 1.927, + "mean_token_accuracy": 0.5440248250961304, + "num_tokens": 7965940790.0, + "step": 15584 + }, + { + "epoch": 4.2144402379664685, + "grad_norm": 0.9429296851158142, + "learning_rate": 3.1410450571978567e-06, + "loss": 1.7582, + "mean_token_accuracy": 0.5876575112342834, + "num_tokens": 7966371628.0, + "step": 15585 + }, + { + "epoch": 4.214710654407788, + "grad_norm": 0.8273027539253235, + "learning_rate": 3.1402769054013117e-06, + "loss": 1.818, + "mean_token_accuracy": 0.5893976092338562, + "num_tokens": 7966895754.0, + "step": 15586 + }, + { + "epoch": 4.214981070849108, + "grad_norm": 1.0590171813964844, + "learning_rate": 3.139508994764185e-06, + "loss": 1.7856, + "mean_token_accuracy": 0.5897972583770752, + "num_tokens": 7967420027.0, + "step": 15587 + }, + { + "epoch": 4.215251487290427, + "grad_norm": 0.9006847739219666, + "learning_rate": 3.1387413253100403e-06, + "loss": 1.893, + "mean_token_accuracy": 0.5718735456466675, + "num_tokens": 7967944257.0, + "step": 15588 + }, + { + "epoch": 4.215521903731747, + "grad_norm": 0.9293177723884583, + "learning_rate": 3.1379738970624284e-06, + "loss": 1.8452, + "mean_token_accuracy": 0.561177670955658, + "num_tokens": 7968468446.0, + "step": 15589 + }, + { + "epoch": 4.215792320173066, + "grad_norm": 1.002463698387146, + "learning_rate": 3.1372067100449e-06, + "loss": 1.8752, + "mean_token_accuracy": 0.5738207101821899, + "num_tokens": 7968992599.0, + "step": 15590 + }, + { + "epoch": 4.216062736614386, + "grad_norm": 0.8219294548034668, + "learning_rate": 3.1364397642809913e-06, + "loss": 1.9393, + "mean_token_accuracy": 0.5563330054283142, + "num_tokens": 7969516796.0, + "step": 15591 + }, + { + "epoch": 4.2163331530557056, + "grad_norm": 1.0127025842666626, + "learning_rate": 3.1356730597942375e-06, + "loss": 1.7536, + "mean_token_accuracy": 0.6075044870376587, + "num_tokens": 7970041035.0, + "step": 15592 + }, + { + "epoch": 4.216603569497026, + "grad_norm": 0.9148758053779602, + "learning_rate": 3.134906596608161e-06, + "loss": 1.8885, + "mean_token_accuracy": 0.5627696514129639, + "num_tokens": 7970565218.0, + "step": 15593 + }, + { + "epoch": 4.216873985938345, + "grad_norm": 0.836257815361023, + "learning_rate": 3.1341403747462783e-06, + "loss": 1.8581, + "mean_token_accuracy": 0.5697891712188721, + "num_tokens": 7971089455.0, + "step": 15594 + }, + { + "epoch": 4.217144402379665, + "grad_norm": 0.7743946313858032, + "learning_rate": 3.133374394232104e-06, + "loss": 1.8651, + "mean_token_accuracy": 0.5731208920478821, + "num_tokens": 7971613652.0, + "step": 15595 + }, + { + "epoch": 4.217414818820984, + "grad_norm": 0.7904679179191589, + "learning_rate": 3.1326086550891354e-06, + "loss": 1.7635, + "mean_token_accuracy": 0.5890388488769531, + "num_tokens": 7972137913.0, + "step": 15596 + }, + { + "epoch": 4.217685235262304, + "grad_norm": 1.1022727489471436, + "learning_rate": 3.1318431573408693e-06, + "loss": 1.8022, + "mean_token_accuracy": 0.5864179134368896, + "num_tokens": 7972599406.0, + "step": 15597 + }, + { + "epoch": 4.217955651703623, + "grad_norm": 0.9705872535705566, + "learning_rate": 3.131077901010795e-06, + "loss": 1.7761, + "mean_token_accuracy": 0.5769186019897461, + "num_tokens": 7973123643.0, + "step": 15598 + }, + { + "epoch": 4.2182260681449435, + "grad_norm": 0.9758059978485107, + "learning_rate": 3.1303128861223923e-06, + "loss": 1.8928, + "mean_token_accuracy": 0.5668466091156006, + "num_tokens": 7973647915.0, + "step": 15599 + }, + { + "epoch": 4.218496484586263, + "grad_norm": 0.8821284770965576, + "learning_rate": 3.1295481126991316e-06, + "loss": 1.8295, + "mean_token_accuracy": 0.5873370170593262, + "num_tokens": 7974172069.0, + "step": 15600 + }, + { + "epoch": 4.218766901027583, + "grad_norm": 0.3183010220527649, + "learning_rate": 3.1287835807644813e-06, + "loss": 1.1343, + "mean_token_accuracy": 0.6975852251052856, + "num_tokens": 7974696250.0, + "step": 15601 + }, + { + "epoch": 4.219037317468902, + "grad_norm": 0.9049046635627747, + "learning_rate": 3.1280192903418983e-06, + "loss": 1.804, + "mean_token_accuracy": 0.5744576454162598, + "num_tokens": 7975212513.0, + "step": 15602 + }, + { + "epoch": 4.219307733910222, + "grad_norm": 1.0542614459991455, + "learning_rate": 3.127255241454831e-06, + "loss": 1.7542, + "mean_token_accuracy": 0.5910782814025879, + "num_tokens": 7975736783.0, + "step": 15603 + }, + { + "epoch": 4.219578150351541, + "grad_norm": 0.8456296324729919, + "learning_rate": 3.126491434126728e-06, + "loss": 1.8714, + "mean_token_accuracy": 0.5619821548461914, + "num_tokens": 7976260991.0, + "step": 15604 + }, + { + "epoch": 4.219848566792861, + "grad_norm": 0.7824822068214417, + "learning_rate": 3.1257278683810237e-06, + "loss": 1.8811, + "mean_token_accuracy": 0.5694794654846191, + "num_tokens": 7976785196.0, + "step": 15605 + }, + { + "epoch": 4.2201189832341806, + "grad_norm": 1.050758957862854, + "learning_rate": 3.1249645442411425e-06, + "loss": 1.9043, + "mean_token_accuracy": 0.5669729709625244, + "num_tokens": 7977309319.0, + "step": 15606 + }, + { + "epoch": 4.220389399675501, + "grad_norm": 0.7618679404258728, + "learning_rate": 3.12420146173051e-06, + "loss": 1.8983, + "mean_token_accuracy": 0.5542472004890442, + "num_tokens": 7977833442.0, + "step": 15607 + }, + { + "epoch": 4.22065981611682, + "grad_norm": 0.8859933614730835, + "learning_rate": 3.123438620872537e-06, + "loss": 1.8, + "mean_token_accuracy": 0.5926003456115723, + "num_tokens": 7978357727.0, + "step": 15608 + }, + { + "epoch": 4.22093023255814, + "grad_norm": 0.9317355155944824, + "learning_rate": 3.1226760216906295e-06, + "loss": 1.9127, + "mean_token_accuracy": 0.5472450256347656, + "num_tokens": 7978860770.0, + "step": 15609 + }, + { + "epoch": 4.221200648999459, + "grad_norm": 0.9140036106109619, + "learning_rate": 3.1219136642081894e-06, + "loss": 1.8485, + "mean_token_accuracy": 0.5827447772026062, + "num_tokens": 7979332393.0, + "step": 15610 + }, + { + "epoch": 4.221471065440779, + "grad_norm": 0.8443605303764343, + "learning_rate": 3.121151548448604e-06, + "loss": 1.9104, + "mean_token_accuracy": 0.5616886019706726, + "num_tokens": 7979856655.0, + "step": 15611 + }, + { + "epoch": 4.221741481882098, + "grad_norm": 1.026323676109314, + "learning_rate": 3.12038967443526e-06, + "loss": 1.8443, + "mean_token_accuracy": 0.5819472074508667, + "num_tokens": 7980347681.0, + "step": 15612 + }, + { + "epoch": 4.222011898323418, + "grad_norm": 0.798441469669342, + "learning_rate": 3.1196280421915337e-06, + "loss": 1.757, + "mean_token_accuracy": 0.594340980052948, + "num_tokens": 7980871929.0, + "step": 15613 + }, + { + "epoch": 4.222282314764738, + "grad_norm": 0.7911881804466248, + "learning_rate": 3.1188666517407923e-06, + "loss": 1.8285, + "mean_token_accuracy": 0.5694657564163208, + "num_tokens": 7981337469.0, + "step": 15614 + }, + { + "epoch": 4.222552731206058, + "grad_norm": 0.9321943521499634, + "learning_rate": 3.1181055031064005e-06, + "loss": 1.7295, + "mean_token_accuracy": 0.590925931930542, + "num_tokens": 7981861721.0, + "step": 15615 + }, + { + "epoch": 4.222823147647377, + "grad_norm": 0.8265132308006287, + "learning_rate": 3.1173445963117096e-06, + "loss": 1.7588, + "mean_token_accuracy": 0.5841714143753052, + "num_tokens": 7982385824.0, + "step": 15616 + }, + { + "epoch": 4.223093564088696, + "grad_norm": 0.8581583499908447, + "learning_rate": 3.1165839313800677e-06, + "loss": 1.74, + "mean_token_accuracy": 0.6031504273414612, + "num_tokens": 7982892116.0, + "step": 15617 + }, + { + "epoch": 4.223363980530016, + "grad_norm": 0.7248483896255493, + "learning_rate": 3.115823508334817e-06, + "loss": 1.6558, + "mean_token_accuracy": 0.5974358320236206, + "num_tokens": 7983386756.0, + "step": 15618 + }, + { + "epoch": 4.2236343969713355, + "grad_norm": 1.0495984554290771, + "learning_rate": 3.1150633271992854e-06, + "loss": 1.8313, + "mean_token_accuracy": 0.5849453210830688, + "num_tokens": 7983911028.0, + "step": 15619 + }, + { + "epoch": 4.223904813412656, + "grad_norm": 0.9755991101264954, + "learning_rate": 3.114303387996798e-06, + "loss": 1.8636, + "mean_token_accuracy": 0.5774750113487244, + "num_tokens": 7984435238.0, + "step": 15620 + }, + { + "epoch": 4.224175229853975, + "grad_norm": 0.3145398497581482, + "learning_rate": 3.1135436907506743e-06, + "loss": 1.0468, + "mean_token_accuracy": 0.7090571522712708, + "num_tokens": 7984959516.0, + "step": 15621 + }, + { + "epoch": 4.224445646295295, + "grad_norm": 0.9152554273605347, + "learning_rate": 3.1127842354842235e-06, + "loss": 1.7482, + "mean_token_accuracy": 0.5790623426437378, + "num_tokens": 7985448342.0, + "step": 15622 + }, + { + "epoch": 4.224716062736614, + "grad_norm": 0.8107939958572388, + "learning_rate": 3.1120250222207448e-06, + "loss": 1.903, + "mean_token_accuracy": 0.5742855072021484, + "num_tokens": 7985972609.0, + "step": 15623 + }, + { + "epoch": 4.224986479177934, + "grad_norm": 0.9896188974380493, + "learning_rate": 3.1112660509835374e-06, + "loss": 1.8268, + "mean_token_accuracy": 0.5796749591827393, + "num_tokens": 7986496775.0, + "step": 15624 + }, + { + "epoch": 4.225256895619253, + "grad_norm": 0.9253901839256287, + "learning_rate": 3.110507321795887e-06, + "loss": 1.8958, + "mean_token_accuracy": 0.5735190510749817, + "num_tokens": 7986961766.0, + "step": 15625 + }, + { + "epoch": 4.225527312060573, + "grad_norm": 0.8397118449211121, + "learning_rate": 3.109748834681071e-06, + "loss": 1.8838, + "mean_token_accuracy": 0.5683073997497559, + "num_tokens": 7987456688.0, + "step": 15626 + }, + { + "epoch": 4.225797728501893, + "grad_norm": 0.9718931317329407, + "learning_rate": 3.1089905896623673e-06, + "loss": 1.641, + "mean_token_accuracy": 0.6050736904144287, + "num_tokens": 7987980817.0, + "step": 15627 + }, + { + "epoch": 4.226068144943213, + "grad_norm": 0.8479400873184204, + "learning_rate": 3.108232586763038e-06, + "loss": 1.7594, + "mean_token_accuracy": 0.5815634727478027, + "num_tokens": 7988505094.0, + "step": 15628 + }, + { + "epoch": 4.226338561384532, + "grad_norm": 1.024376392364502, + "learning_rate": 3.107474826006339e-06, + "loss": 1.8335, + "mean_token_accuracy": 0.5779623985290527, + "num_tokens": 7988991866.0, + "step": 15629 + }, + { + "epoch": 4.226608977825852, + "grad_norm": 0.9749489426612854, + "learning_rate": 3.1067173074155222e-06, + "loss": 1.8671, + "mean_token_accuracy": 0.57383131980896, + "num_tokens": 7989516066.0, + "step": 15630 + }, + { + "epoch": 4.226879394267171, + "grad_norm": 0.8607714176177979, + "learning_rate": 3.1059600310138337e-06, + "loss": 1.8402, + "mean_token_accuracy": 0.5709785223007202, + "num_tokens": 7989997998.0, + "step": 15631 + }, + { + "epoch": 4.227149810708491, + "grad_norm": 0.9555068016052246, + "learning_rate": 3.105202996824504e-06, + "loss": 1.8978, + "mean_token_accuracy": 0.5663117170333862, + "num_tokens": 7990522212.0, + "step": 15632 + }, + { + "epoch": 4.2274202271498105, + "grad_norm": 0.8107074499130249, + "learning_rate": 3.1044462048707657e-06, + "loss": 1.8257, + "mean_token_accuracy": 0.5646358728408813, + "num_tokens": 7991046420.0, + "step": 15633 + }, + { + "epoch": 4.227690643591131, + "grad_norm": 0.8526707887649536, + "learning_rate": 3.1036896551758367e-06, + "loss": 1.8282, + "mean_token_accuracy": 0.5752675533294678, + "num_tokens": 7991512595.0, + "step": 15634 + }, + { + "epoch": 4.22796106003245, + "grad_norm": 0.7842535376548767, + "learning_rate": 3.102933347762929e-06, + "loss": 1.7661, + "mean_token_accuracy": 0.5907111167907715, + "num_tokens": 7992017526.0, + "step": 15635 + }, + { + "epoch": 4.22823147647377, + "grad_norm": 0.8159026503562927, + "learning_rate": 3.1021772826552517e-06, + "loss": 1.8609, + "mean_token_accuracy": 0.5698568820953369, + "num_tokens": 7992541794.0, + "step": 15636 + }, + { + "epoch": 4.228501892915089, + "grad_norm": 0.8956451416015625, + "learning_rate": 3.101421459876e-06, + "loss": 1.9653, + "mean_token_accuracy": 0.5482982397079468, + "num_tokens": 7993066001.0, + "step": 15637 + }, + { + "epoch": 4.228772309356409, + "grad_norm": 0.9442197680473328, + "learning_rate": 3.100665879448367e-06, + "loss": 1.758, + "mean_token_accuracy": 0.592889666557312, + "num_tokens": 7993540149.0, + "step": 15638 + }, + { + "epoch": 4.229042725797728, + "grad_norm": 0.9229522347450256, + "learning_rate": 3.0999105413955366e-06, + "loss": 1.8982, + "mean_token_accuracy": 0.5566539764404297, + "num_tokens": 7994064331.0, + "step": 15639 + }, + { + "epoch": 4.229313142239048, + "grad_norm": 0.8511927723884583, + "learning_rate": 3.0991554457406803e-06, + "loss": 1.7905, + "mean_token_accuracy": 0.5833820104598999, + "num_tokens": 7994588510.0, + "step": 15640 + }, + { + "epoch": 4.229583558680368, + "grad_norm": 0.3589933216571808, + "learning_rate": 3.0984005925069715e-06, + "loss": 1.0392, + "mean_token_accuracy": 0.7123029828071594, + "num_tokens": 7995112661.0, + "step": 15641 + }, + { + "epoch": 4.229853975121688, + "grad_norm": 0.9020017385482788, + "learning_rate": 3.0976459817175703e-06, + "loss": 1.8964, + "mean_token_accuracy": 0.5688208937644958, + "num_tokens": 7995636897.0, + "step": 15642 + }, + { + "epoch": 4.230124391563007, + "grad_norm": 0.8244271278381348, + "learning_rate": 3.0968916133956282e-06, + "loss": 1.6889, + "mean_token_accuracy": 0.6031795740127563, + "num_tokens": 7996161101.0, + "step": 15643 + }, + { + "epoch": 4.230394808004327, + "grad_norm": 0.8848704695701599, + "learning_rate": 3.0961374875642946e-06, + "loss": 1.8078, + "mean_token_accuracy": 0.5917903184890747, + "num_tokens": 7996659116.0, + "step": 15644 + }, + { + "epoch": 4.230665224445646, + "grad_norm": 1.0141122341156006, + "learning_rate": 3.0953836042467046e-06, + "loss": 1.8292, + "mean_token_accuracy": 0.5924822092056274, + "num_tokens": 7997183214.0, + "step": 15645 + }, + { + "epoch": 4.230935640886966, + "grad_norm": 0.7551764249801636, + "learning_rate": 3.0946299634659917e-06, + "loss": 1.7721, + "mean_token_accuracy": 0.5892511010169983, + "num_tokens": 7997707403.0, + "step": 15646 + }, + { + "epoch": 4.2312060573282855, + "grad_norm": 0.851440966129303, + "learning_rate": 3.0938765652452806e-06, + "loss": 1.7864, + "mean_token_accuracy": 0.5820269584655762, + "num_tokens": 7998224700.0, + "step": 15647 + }, + { + "epoch": 4.231476473769606, + "grad_norm": 1.1876251697540283, + "learning_rate": 3.093123409607688e-06, + "loss": 1.869, + "mean_token_accuracy": 0.5686709880828857, + "num_tokens": 7998748970.0, + "step": 15648 + }, + { + "epoch": 4.231746890210925, + "grad_norm": 0.8853554725646973, + "learning_rate": 3.0923704965763195e-06, + "loss": 1.7675, + "mean_token_accuracy": 0.598196804523468, + "num_tokens": 7999223889.0, + "step": 15649 + }, + { + "epoch": 4.232017306652245, + "grad_norm": 0.8953871726989746, + "learning_rate": 3.091617826174281e-06, + "loss": 1.8494, + "mean_token_accuracy": 0.580065906047821, + "num_tokens": 7999715646.0, + "step": 15650 + }, + { + "epoch": 4.232287723093564, + "grad_norm": 0.9076602458953857, + "learning_rate": 3.090865398424665e-06, + "loss": 1.7423, + "mean_token_accuracy": 0.6059865951538086, + "num_tokens": 8000153424.0, + "step": 15651 + }, + { + "epoch": 4.232558139534884, + "grad_norm": 0.8638784289360046, + "learning_rate": 3.090113213350555e-06, + "loss": 1.7946, + "mean_token_accuracy": 0.5844902992248535, + "num_tokens": 8000677635.0, + "step": 15652 + }, + { + "epoch": 4.232828555976203, + "grad_norm": 1.0202165842056274, + "learning_rate": 3.089361270975035e-06, + "loss": 1.8704, + "mean_token_accuracy": 0.5599807500839233, + "num_tokens": 8001201854.0, + "step": 15653 + }, + { + "epoch": 4.2330989724175225, + "grad_norm": 0.8814350366592407, + "learning_rate": 3.0886095713211755e-06, + "loss": 1.7963, + "mean_token_accuracy": 0.579261064529419, + "num_tokens": 8001720859.0, + "step": 15654 + }, + { + "epoch": 4.233369388858843, + "grad_norm": 0.9097314476966858, + "learning_rate": 3.087858114412038e-06, + "loss": 1.8145, + "mean_token_accuracy": 0.5812911987304688, + "num_tokens": 8002245081.0, + "step": 15655 + }, + { + "epoch": 4.233639805300163, + "grad_norm": 1.0240012407302856, + "learning_rate": 3.087106900270684e-06, + "loss": 1.8928, + "mean_token_accuracy": 0.5514829754829407, + "num_tokens": 8002769294.0, + "step": 15656 + }, + { + "epoch": 4.233910221741482, + "grad_norm": 0.955619215965271, + "learning_rate": 3.086355928920159e-06, + "loss": 1.9556, + "mean_token_accuracy": 0.5514675378799438, + "num_tokens": 8003293426.0, + "step": 15657 + }, + { + "epoch": 4.234180638182801, + "grad_norm": 0.8400932550430298, + "learning_rate": 3.085605200383508e-06, + "loss": 1.8776, + "mean_token_accuracy": 0.5730787515640259, + "num_tokens": 8003817674.0, + "step": 15658 + }, + { + "epoch": 4.234451054624121, + "grad_norm": 0.8309256434440613, + "learning_rate": 3.0848547146837616e-06, + "loss": 1.8526, + "mean_token_accuracy": 0.5819348096847534, + "num_tokens": 8004341942.0, + "step": 15659 + }, + { + "epoch": 4.23472147106544, + "grad_norm": 0.777683675289154, + "learning_rate": 3.0841044718439507e-06, + "loss": 1.8444, + "mean_token_accuracy": 0.5625879168510437, + "num_tokens": 8004866125.0, + "step": 15660 + }, + { + "epoch": 4.2349918875067605, + "grad_norm": 0.34329041838645935, + "learning_rate": 3.083354471887095e-06, + "loss": 1.015, + "mean_token_accuracy": 0.7280291318893433, + "num_tokens": 8005390160.0, + "step": 15661 + }, + { + "epoch": 4.23526230394808, + "grad_norm": 1.089249610900879, + "learning_rate": 3.0826047148362048e-06, + "loss": 1.7543, + "mean_token_accuracy": 0.5948090553283691, + "num_tokens": 8005914409.0, + "step": 15662 + }, + { + "epoch": 4.2355327203894, + "grad_norm": 1.1400656700134277, + "learning_rate": 3.081855200714285e-06, + "loss": 1.8355, + "mean_token_accuracy": 0.5624117255210876, + "num_tokens": 8006438583.0, + "step": 15663 + }, + { + "epoch": 4.235803136830719, + "grad_norm": 0.8832749128341675, + "learning_rate": 3.081105929544334e-06, + "loss": 1.7832, + "mean_token_accuracy": 0.5902206897735596, + "num_tokens": 8006921567.0, + "step": 15664 + }, + { + "epoch": 4.236073553272039, + "grad_norm": 0.7917910814285278, + "learning_rate": 3.080356901349342e-06, + "loss": 1.8456, + "mean_token_accuracy": 0.5687557458877563, + "num_tokens": 8007445815.0, + "step": 15665 + }, + { + "epoch": 4.236343969713358, + "grad_norm": 0.9763825535774231, + "learning_rate": 3.079608116152288e-06, + "loss": 1.8263, + "mean_token_accuracy": 0.5658968687057495, + "num_tokens": 8007970082.0, + "step": 15666 + }, + { + "epoch": 4.236614386154678, + "grad_norm": 0.9593749046325684, + "learning_rate": 3.0788595739761505e-06, + "loss": 1.8178, + "mean_token_accuracy": 0.5745890140533447, + "num_tokens": 8008436975.0, + "step": 15667 + }, + { + "epoch": 4.2368848025959975, + "grad_norm": 1.0802075862884521, + "learning_rate": 3.0781112748438975e-06, + "loss": 1.7512, + "mean_token_accuracy": 0.5971478223800659, + "num_tokens": 8008923770.0, + "step": 15668 + }, + { + "epoch": 4.237155219037318, + "grad_norm": 0.9180949926376343, + "learning_rate": 3.077363218778485e-06, + "loss": 1.8933, + "mean_token_accuracy": 0.5532320737838745, + "num_tokens": 8009447931.0, + "step": 15669 + }, + { + "epoch": 4.237425635478637, + "grad_norm": 0.8245334625244141, + "learning_rate": 3.0766154058028684e-06, + "loss": 1.8448, + "mean_token_accuracy": 0.5718621015548706, + "num_tokens": 8009972194.0, + "step": 15670 + }, + { + "epoch": 4.237696051919957, + "grad_norm": 0.8710187673568726, + "learning_rate": 3.075867835939994e-06, + "loss": 1.9885, + "mean_token_accuracy": 0.5412494540214539, + "num_tokens": 8010496353.0, + "step": 15671 + }, + { + "epoch": 4.237966468361276, + "grad_norm": 0.8802533745765686, + "learning_rate": 3.0751205092127956e-06, + "loss": 1.7669, + "mean_token_accuracy": 0.5927989482879639, + "num_tokens": 8011020546.0, + "step": 15672 + }, + { + "epoch": 4.238236884802596, + "grad_norm": 1.0137717723846436, + "learning_rate": 3.0743734256442048e-06, + "loss": 1.8083, + "mean_token_accuracy": 0.5814376473426819, + "num_tokens": 8011544747.0, + "step": 15673 + }, + { + "epoch": 4.238507301243915, + "grad_norm": 0.948236346244812, + "learning_rate": 3.0736265852571472e-06, + "loss": 1.7833, + "mean_token_accuracy": 0.586513102054596, + "num_tokens": 8012068977.0, + "step": 15674 + }, + { + "epoch": 4.2387777176852355, + "grad_norm": 0.9239481687545776, + "learning_rate": 3.0728799880745343e-06, + "loss": 1.8531, + "mean_token_accuracy": 0.5835803747177124, + "num_tokens": 8012593154.0, + "step": 15675 + }, + { + "epoch": 4.239048134126555, + "grad_norm": 0.8789611458778381, + "learning_rate": 3.0721336341192767e-06, + "loss": 1.8318, + "mean_token_accuracy": 0.5672273635864258, + "num_tokens": 8013117409.0, + "step": 15676 + }, + { + "epoch": 4.239318550567875, + "grad_norm": 1.039465069770813, + "learning_rate": 3.071387523414274e-06, + "loss": 1.7629, + "mean_token_accuracy": 0.6044057607650757, + "num_tokens": 8013580937.0, + "step": 15677 + }, + { + "epoch": 4.239588967009194, + "grad_norm": 0.993015706539154, + "learning_rate": 3.070641655982417e-06, + "loss": 1.653, + "mean_token_accuracy": 0.6401764154434204, + "num_tokens": 8014041620.0, + "step": 15678 + }, + { + "epoch": 4.239859383450514, + "grad_norm": 1.0157169103622437, + "learning_rate": 3.0698960318465943e-06, + "loss": 1.9241, + "mean_token_accuracy": 0.5563982129096985, + "num_tokens": 8014555127.0, + "step": 15679 + }, + { + "epoch": 4.240129799891833, + "grad_norm": 1.1638379096984863, + "learning_rate": 3.0691506510296826e-06, + "loss": 1.82, + "mean_token_accuracy": 0.5751856565475464, + "num_tokens": 8015079261.0, + "step": 15680 + }, + { + "epoch": 4.240400216333153, + "grad_norm": 0.335222452878952, + "learning_rate": 3.068405513554549e-06, + "loss": 1.0501, + "mean_token_accuracy": 0.721961259841919, + "num_tokens": 8015545756.0, + "step": 15681 + }, + { + "epoch": 4.2406706327744725, + "grad_norm": 1.0741057395935059, + "learning_rate": 3.067660619444063e-06, + "loss": 1.8972, + "mean_token_accuracy": 0.5704386234283447, + "num_tokens": 8016070021.0, + "step": 15682 + }, + { + "epoch": 4.240941049215793, + "grad_norm": 0.9759216904640198, + "learning_rate": 3.0669159687210738e-06, + "loss": 1.8212, + "mean_token_accuracy": 0.569317102432251, + "num_tokens": 8016594280.0, + "step": 15683 + }, + { + "epoch": 4.241211465657112, + "grad_norm": 0.898053765296936, + "learning_rate": 3.0661715614084342e-06, + "loss": 1.7292, + "mean_token_accuracy": 0.5941696763038635, + "num_tokens": 8017118485.0, + "step": 15684 + }, + { + "epoch": 4.241481882098432, + "grad_norm": 0.882863461971283, + "learning_rate": 3.065427397528983e-06, + "loss": 1.7469, + "mean_token_accuracy": 0.5774343609809875, + "num_tokens": 8017642763.0, + "step": 15685 + }, + { + "epoch": 4.241752298539751, + "grad_norm": 0.9016907811164856, + "learning_rate": 3.0646834771055523e-06, + "loss": 1.8512, + "mean_token_accuracy": 0.5778015851974487, + "num_tokens": 8018167041.0, + "step": 15686 + }, + { + "epoch": 4.242022714981071, + "grad_norm": 0.8395940661430359, + "learning_rate": 3.0639398001609676e-06, + "loss": 1.8475, + "mean_token_accuracy": 0.5777431130409241, + "num_tokens": 8018691256.0, + "step": 15687 + }, + { + "epoch": 4.24229313142239, + "grad_norm": 0.9070533514022827, + "learning_rate": 3.063196366718051e-06, + "loss": 1.7999, + "mean_token_accuracy": 0.5831247568130493, + "num_tokens": 8019215349.0, + "step": 15688 + }, + { + "epoch": 4.2425635478637105, + "grad_norm": 0.9966555833816528, + "learning_rate": 3.0624531767996087e-06, + "loss": 1.8475, + "mean_token_accuracy": 0.5814568996429443, + "num_tokens": 8019681880.0, + "step": 15689 + }, + { + "epoch": 4.24283396430503, + "grad_norm": 0.9369180798530579, + "learning_rate": 3.061710230428447e-06, + "loss": 1.7844, + "mean_token_accuracy": 0.5849399566650391, + "num_tokens": 8020206111.0, + "step": 15690 + }, + { + "epoch": 4.24310438074635, + "grad_norm": 1.048829197883606, + "learning_rate": 3.060967527627361e-06, + "loss": 1.7614, + "mean_token_accuracy": 0.623931884765625, + "num_tokens": 8020704417.0, + "step": 15691 + }, + { + "epoch": 4.243374797187669, + "grad_norm": 1.0819765329360962, + "learning_rate": 3.060225068419136e-06, + "loss": 1.9544, + "mean_token_accuracy": 0.5636597275733948, + "num_tokens": 8021149580.0, + "step": 15692 + }, + { + "epoch": 4.243645213628989, + "grad_norm": 0.9005938768386841, + "learning_rate": 3.059482852826558e-06, + "loss": 1.8563, + "mean_token_accuracy": 0.5720946788787842, + "num_tokens": 8021673844.0, + "step": 15693 + }, + { + "epoch": 4.243915630070308, + "grad_norm": 0.9787634015083313, + "learning_rate": 3.058740880872396e-06, + "loss": 1.7575, + "mean_token_accuracy": 0.5718546509742737, + "num_tokens": 8022198126.0, + "step": 15694 + }, + { + "epoch": 4.2441860465116275, + "grad_norm": 0.9167470932006836, + "learning_rate": 3.057999152579417e-06, + "loss": 1.7731, + "mean_token_accuracy": 0.5910571813583374, + "num_tokens": 8022722149.0, + "step": 15695 + }, + { + "epoch": 4.2444564629529475, + "grad_norm": 1.2327091693878174, + "learning_rate": 3.057257667970382e-06, + "loss": 1.9756, + "mean_token_accuracy": 0.5581106543540955, + "num_tokens": 8023246359.0, + "step": 15696 + }, + { + "epoch": 4.244726879394268, + "grad_norm": 1.3227354288101196, + "learning_rate": 3.056516427068038e-06, + "loss": 1.7233, + "mean_token_accuracy": 0.586612343788147, + "num_tokens": 8023770537.0, + "step": 15697 + }, + { + "epoch": 4.244997295835587, + "grad_norm": 1.0154590606689453, + "learning_rate": 3.05577542989513e-06, + "loss": 1.8485, + "mean_token_accuracy": 0.5696665048599243, + "num_tokens": 8024294822.0, + "step": 15698 + }, + { + "epoch": 4.245267712276906, + "grad_norm": 1.0467334985733032, + "learning_rate": 3.0550346764743964e-06, + "loss": 1.8679, + "mean_token_accuracy": 0.567719578742981, + "num_tokens": 8024818951.0, + "step": 15699 + }, + { + "epoch": 4.245538128718226, + "grad_norm": 1.0086103677749634, + "learning_rate": 3.0542941668285608e-06, + "loss": 1.9518, + "mean_token_accuracy": 0.5694832801818848, + "num_tokens": 8025278830.0, + "step": 15700 + }, + { + "epoch": 4.245808545159545, + "grad_norm": 0.34972336888313293, + "learning_rate": 3.0535539009803487e-06, + "loss": 1.0318, + "mean_token_accuracy": 0.7199295163154602, + "num_tokens": 8025802986.0, + "step": 15701 + }, + { + "epoch": 4.246078961600865, + "grad_norm": 1.0659805536270142, + "learning_rate": 3.0528138789524703e-06, + "loss": 1.7978, + "mean_token_accuracy": 0.5931339263916016, + "num_tokens": 8026327263.0, + "step": 15702 + }, + { + "epoch": 4.246349378042185, + "grad_norm": 1.1085342168807983, + "learning_rate": 3.052074100767636e-06, + "loss": 1.8821, + "mean_token_accuracy": 0.5571223497390747, + "num_tokens": 8026851466.0, + "step": 15703 + }, + { + "epoch": 4.246619794483505, + "grad_norm": 1.0973244905471802, + "learning_rate": 3.051334566448539e-06, + "loss": 1.7769, + "mean_token_accuracy": 0.5848637223243713, + "num_tokens": 8027375698.0, + "step": 15704 + }, + { + "epoch": 4.246890210924824, + "grad_norm": 0.9957461357116699, + "learning_rate": 3.0505952760178744e-06, + "loss": 1.9367, + "mean_token_accuracy": 0.5537209510803223, + "num_tokens": 8027899930.0, + "step": 15705 + }, + { + "epoch": 4.247160627366144, + "grad_norm": 0.9265205264091492, + "learning_rate": 3.049856229498323e-06, + "loss": 1.8033, + "mean_token_accuracy": 0.5842301249504089, + "num_tokens": 8028424122.0, + "step": 15706 + }, + { + "epoch": 4.247431043807463, + "grad_norm": 0.8848790526390076, + "learning_rate": 3.049117426912564e-06, + "loss": 1.7224, + "mean_token_accuracy": 0.6097370386123657, + "num_tokens": 8028948386.0, + "step": 15707 + }, + { + "epoch": 4.247701460248783, + "grad_norm": 1.0506244897842407, + "learning_rate": 3.048378868283264e-06, + "loss": 1.8429, + "mean_token_accuracy": 0.5575198531150818, + "num_tokens": 8029472488.0, + "step": 15708 + }, + { + "epoch": 4.2479718766901025, + "grad_norm": 1.0351710319519043, + "learning_rate": 3.0476405536330833e-06, + "loss": 1.8489, + "mean_token_accuracy": 0.5755243897438049, + "num_tokens": 8029996744.0, + "step": 15709 + }, + { + "epoch": 4.2482422931314225, + "grad_norm": 0.9459490776062012, + "learning_rate": 3.0469024829846782e-06, + "loss": 1.8623, + "mean_token_accuracy": 0.562717080116272, + "num_tokens": 8030475157.0, + "step": 15710 + }, + { + "epoch": 4.248512709572742, + "grad_norm": 0.8468155264854431, + "learning_rate": 3.0461646563606944e-06, + "loss": 1.8315, + "mean_token_accuracy": 0.5799093246459961, + "num_tokens": 8030999416.0, + "step": 15711 + }, + { + "epoch": 4.248783126014062, + "grad_norm": 0.8157243132591248, + "learning_rate": 3.0454270737837667e-06, + "loss": 1.7977, + "mean_token_accuracy": 0.5747231245040894, + "num_tokens": 8031523641.0, + "step": 15712 + }, + { + "epoch": 4.249053542455381, + "grad_norm": 0.939670979976654, + "learning_rate": 3.0446897352765312e-06, + "loss": 1.8171, + "mean_token_accuracy": 0.5852611064910889, + "num_tokens": 8032047816.0, + "step": 15713 + }, + { + "epoch": 4.249323958896701, + "grad_norm": 0.9160243272781372, + "learning_rate": 3.0439526408616084e-06, + "loss": 1.862, + "mean_token_accuracy": 0.5664082765579224, + "num_tokens": 8032571982.0, + "step": 15714 + }, + { + "epoch": 4.24959437533802, + "grad_norm": 0.9390335083007812, + "learning_rate": 3.0432157905616174e-06, + "loss": 2.013, + "mean_token_accuracy": 0.5488096475601196, + "num_tokens": 8033096222.0, + "step": 15715 + }, + { + "epoch": 4.24986479177934, + "grad_norm": 0.8617006540298462, + "learning_rate": 3.0424791843991634e-06, + "loss": 1.8783, + "mean_token_accuracy": 0.5785166025161743, + "num_tokens": 8033620486.0, + "step": 15716 + }, + { + "epoch": 4.25013520822066, + "grad_norm": 0.8407573103904724, + "learning_rate": 3.0417428223968516e-06, + "loss": 1.8061, + "mean_token_accuracy": 0.5871666669845581, + "num_tokens": 8034144625.0, + "step": 15717 + }, + { + "epoch": 4.25040562466198, + "grad_norm": 0.9068138599395752, + "learning_rate": 3.041006704577272e-06, + "loss": 1.8793, + "mean_token_accuracy": 0.5739534497261047, + "num_tokens": 8034668709.0, + "step": 15718 + }, + { + "epoch": 4.250676041103299, + "grad_norm": 1.0485173463821411, + "learning_rate": 3.040270830963014e-06, + "loss": 1.8021, + "mean_token_accuracy": 0.582107424736023, + "num_tokens": 8035147304.0, + "step": 15719 + }, + { + "epoch": 4.250946457544619, + "grad_norm": 0.8266992568969727, + "learning_rate": 3.0395352015766553e-06, + "loss": 1.7975, + "mean_token_accuracy": 0.6025890111923218, + "num_tokens": 8035624408.0, + "step": 15720 + }, + { + "epoch": 4.251216873985938, + "grad_norm": 0.3278249502182007, + "learning_rate": 3.0387998164407644e-06, + "loss": 1.1397, + "mean_token_accuracy": 0.6948855519294739, + "num_tokens": 8036148590.0, + "step": 15721 + }, + { + "epoch": 4.251487290427258, + "grad_norm": 0.818476676940918, + "learning_rate": 3.0380646755779103e-06, + "loss": 1.8803, + "mean_token_accuracy": 0.5721099376678467, + "num_tokens": 8036672816.0, + "step": 15722 + }, + { + "epoch": 4.2517577068685775, + "grad_norm": 0.8576869964599609, + "learning_rate": 3.0373297790106458e-06, + "loss": 1.7032, + "mean_token_accuracy": 0.6044033169746399, + "num_tokens": 8037180743.0, + "step": 15723 + }, + { + "epoch": 4.2520281233098975, + "grad_norm": 0.9515917301177979, + "learning_rate": 3.036595126761518e-06, + "loss": 1.9125, + "mean_token_accuracy": 0.569234311580658, + "num_tokens": 8037704958.0, + "step": 15724 + }, + { + "epoch": 4.252298539751217, + "grad_norm": 0.8585834503173828, + "learning_rate": 3.035860718853073e-06, + "loss": 1.7176, + "mean_token_accuracy": 0.589084804058075, + "num_tokens": 8038229198.0, + "step": 15725 + }, + { + "epoch": 4.252568956192537, + "grad_norm": 0.9611542224884033, + "learning_rate": 3.0351265553078425e-06, + "loss": 1.9274, + "mean_token_accuracy": 0.5926547050476074, + "num_tokens": 8038688518.0, + "step": 15726 + }, + { + "epoch": 4.252839372633856, + "grad_norm": 0.7714700698852539, + "learning_rate": 3.034392636148351e-06, + "loss": 1.7733, + "mean_token_accuracy": 0.5987839698791504, + "num_tokens": 8039212768.0, + "step": 15727 + }, + { + "epoch": 4.253109789075176, + "grad_norm": 1.1236666440963745, + "learning_rate": 3.03365896139712e-06, + "loss": 1.8836, + "mean_token_accuracy": 0.5815834403038025, + "num_tokens": 8039664241.0, + "step": 15728 + }, + { + "epoch": 4.253380205516495, + "grad_norm": 0.9317880272865295, + "learning_rate": 3.0329255310766575e-06, + "loss": 1.7763, + "mean_token_accuracy": 0.568435788154602, + "num_tokens": 8040188421.0, + "step": 15729 + }, + { + "epoch": 4.253650621957815, + "grad_norm": 0.8253021240234375, + "learning_rate": 3.0321923452094687e-06, + "loss": 1.7914, + "mean_token_accuracy": 0.575754702091217, + "num_tokens": 8040712649.0, + "step": 15730 + }, + { + "epoch": 4.253921038399135, + "grad_norm": 0.9101951718330383, + "learning_rate": 3.0314594038180533e-06, + "loss": 1.4709, + "mean_token_accuracy": 0.6282196640968323, + "num_tokens": 8041195512.0, + "step": 15731 + }, + { + "epoch": 4.254191454840455, + "grad_norm": 1.0668660402297974, + "learning_rate": 3.0307267069248958e-06, + "loss": 1.8903, + "mean_token_accuracy": 0.5773318409919739, + "num_tokens": 8041719724.0, + "step": 15732 + }, + { + "epoch": 4.254461871281774, + "grad_norm": 0.9441039562225342, + "learning_rate": 3.0299942545524797e-06, + "loss": 1.8202, + "mean_token_accuracy": 0.5711852312088013, + "num_tokens": 8042243884.0, + "step": 15733 + }, + { + "epoch": 4.254732287723094, + "grad_norm": 0.9456778764724731, + "learning_rate": 3.02926204672328e-06, + "loss": 1.7431, + "mean_token_accuracy": 0.5936321020126343, + "num_tokens": 8042767952.0, + "step": 15734 + }, + { + "epoch": 4.255002704164413, + "grad_norm": 0.8850846290588379, + "learning_rate": 3.028530083459757e-06, + "loss": 1.863, + "mean_token_accuracy": 0.5627705454826355, + "num_tokens": 8043292135.0, + "step": 15735 + }, + { + "epoch": 4.255273120605732, + "grad_norm": 0.8415524959564209, + "learning_rate": 3.027798364784377e-06, + "loss": 1.7402, + "mean_token_accuracy": 0.5901269316673279, + "num_tokens": 8043816410.0, + "step": 15736 + }, + { + "epoch": 4.2555435370470525, + "grad_norm": 0.9513978958129883, + "learning_rate": 3.0270668907195873e-06, + "loss": 1.8888, + "mean_token_accuracy": 0.5622618198394775, + "num_tokens": 8044340568.0, + "step": 15737 + }, + { + "epoch": 4.2558139534883725, + "grad_norm": 1.1163657903671265, + "learning_rate": 3.026335661287831e-06, + "loss": 1.8189, + "mean_token_accuracy": 0.5779496431350708, + "num_tokens": 8044864762.0, + "step": 15738 + }, + { + "epoch": 4.256084369929692, + "grad_norm": 0.8387271761894226, + "learning_rate": 3.0256046765115464e-06, + "loss": 1.8935, + "mean_token_accuracy": 0.555997371673584, + "num_tokens": 8045388976.0, + "step": 15739 + }, + { + "epoch": 4.256354786371011, + "grad_norm": 0.999975323677063, + "learning_rate": 3.0248739364131623e-06, + "loss": 1.9041, + "mean_token_accuracy": 0.5792380571365356, + "num_tokens": 8045913190.0, + "step": 15740 + }, + { + "epoch": 4.256625202812331, + "grad_norm": 0.3299347162246704, + "learning_rate": 3.024143441015097e-06, + "loss": 1.0876, + "mean_token_accuracy": 0.7077183127403259, + "num_tokens": 8046395889.0, + "step": 15741 + }, + { + "epoch": 4.25689561925365, + "grad_norm": 1.0145049095153809, + "learning_rate": 3.023413190339769e-06, + "loss": 1.7996, + "mean_token_accuracy": 0.5606439709663391, + "num_tokens": 8046920086.0, + "step": 15742 + }, + { + "epoch": 4.25716603569497, + "grad_norm": 0.9933614134788513, + "learning_rate": 3.0226831844095784e-06, + "loss": 1.7603, + "mean_token_accuracy": 0.5764787197113037, + "num_tokens": 8047444354.0, + "step": 15743 + }, + { + "epoch": 4.2574364521362895, + "grad_norm": 0.962787926197052, + "learning_rate": 3.0219534232469306e-06, + "loss": 1.7831, + "mean_token_accuracy": 0.5724696516990662, + "num_tokens": 8047968629.0, + "step": 15744 + }, + { + "epoch": 4.25770686857761, + "grad_norm": 1.045605182647705, + "learning_rate": 3.021223906874212e-06, + "loss": 1.8616, + "mean_token_accuracy": 0.5343551635742188, + "num_tokens": 8048492717.0, + "step": 15745 + }, + { + "epoch": 4.257977285018929, + "grad_norm": 0.8863493204116821, + "learning_rate": 3.0204946353138086e-06, + "loss": 1.895, + "mean_token_accuracy": 0.5608561038970947, + "num_tokens": 8048959113.0, + "step": 15746 + }, + { + "epoch": 4.258247701460249, + "grad_norm": 0.7481003403663635, + "learning_rate": 3.0197656085880954e-06, + "loss": 1.7726, + "mean_token_accuracy": 0.5876219272613525, + "num_tokens": 8049483270.0, + "step": 15747 + }, + { + "epoch": 4.258518117901568, + "grad_norm": 1.0375548601150513, + "learning_rate": 3.0190368267194437e-06, + "loss": 1.8263, + "mean_token_accuracy": 0.5859272480010986, + "num_tokens": 8049977716.0, + "step": 15748 + }, + { + "epoch": 4.258788534342888, + "grad_norm": 1.0475977659225464, + "learning_rate": 3.0183082897302117e-06, + "loss": 1.7298, + "mean_token_accuracy": 0.6049712896347046, + "num_tokens": 8050501891.0, + "step": 15749 + }, + { + "epoch": 4.259058950784207, + "grad_norm": 0.9313533902168274, + "learning_rate": 3.0175799976427526e-06, + "loss": 1.7875, + "mean_token_accuracy": 0.5928220748901367, + "num_tokens": 8051026164.0, + "step": 15750 + }, + { + "epoch": 4.2593293672255275, + "grad_norm": 0.8748089075088501, + "learning_rate": 3.0168519504794154e-06, + "loss": 1.7112, + "mean_token_accuracy": 0.6006307601928711, + "num_tokens": 8051525805.0, + "step": 15751 + }, + { + "epoch": 4.259599783666847, + "grad_norm": 0.8156229257583618, + "learning_rate": 3.016124148262538e-06, + "loss": 1.7499, + "mean_token_accuracy": 0.5954747796058655, + "num_tokens": 8052009734.0, + "step": 15752 + }, + { + "epoch": 4.259870200108167, + "grad_norm": 0.9866945743560791, + "learning_rate": 3.015396591014448e-06, + "loss": 1.7425, + "mean_token_accuracy": 0.6030243039131165, + "num_tokens": 8052534010.0, + "step": 15753 + }, + { + "epoch": 4.260140616549486, + "grad_norm": 1.057713270187378, + "learning_rate": 3.0146692787574743e-06, + "loss": 1.8554, + "mean_token_accuracy": 0.5742790699005127, + "num_tokens": 8053058205.0, + "step": 15754 + }, + { + "epoch": 4.260411032990806, + "grad_norm": 0.9092673063278198, + "learning_rate": 3.0139422115139283e-06, + "loss": 1.8895, + "mean_token_accuracy": 0.5685285925865173, + "num_tokens": 8053582484.0, + "step": 15755 + }, + { + "epoch": 4.260681449432125, + "grad_norm": 0.8206307888031006, + "learning_rate": 3.013215389306123e-06, + "loss": 1.7924, + "mean_token_accuracy": 0.5923207402229309, + "num_tokens": 8054106680.0, + "step": 15756 + }, + { + "epoch": 4.260951865873445, + "grad_norm": 0.8348398208618164, + "learning_rate": 3.0124888121563555e-06, + "loss": 1.7429, + "mean_token_accuracy": 0.5792802572250366, + "num_tokens": 8054630872.0, + "step": 15757 + }, + { + "epoch": 4.2612222823147645, + "grad_norm": 0.9240078330039978, + "learning_rate": 3.0117624800869216e-06, + "loss": 1.8617, + "mean_token_accuracy": 0.5671900510787964, + "num_tokens": 8055155113.0, + "step": 15758 + }, + { + "epoch": 4.261492698756085, + "grad_norm": 0.9768334031105042, + "learning_rate": 3.0110363931201053e-06, + "loss": 1.846, + "mean_token_accuracy": 0.570569634437561, + "num_tokens": 8055679393.0, + "step": 15759 + }, + { + "epoch": 4.261763115197404, + "grad_norm": 0.7894323468208313, + "learning_rate": 3.010310551278188e-06, + "loss": 1.78, + "mean_token_accuracy": 0.5791677832603455, + "num_tokens": 8056203590.0, + "step": 15760 + }, + { + "epoch": 4.262033531638724, + "grad_norm": 0.36056649684906006, + "learning_rate": 3.0095849545834395e-06, + "loss": 1.2338, + "mean_token_accuracy": 0.6834476590156555, + "num_tokens": 8056727862.0, + "step": 15761 + }, + { + "epoch": 4.262303948080043, + "grad_norm": 0.9898759722709656, + "learning_rate": 3.0088596030581223e-06, + "loss": 1.9292, + "mean_token_accuracy": 0.5598711967468262, + "num_tokens": 8057252010.0, + "step": 15762 + }, + { + "epoch": 4.262574364521363, + "grad_norm": 0.8524028062820435, + "learning_rate": 3.008134496724494e-06, + "loss": 1.8561, + "mean_token_accuracy": 0.5591707229614258, + "num_tokens": 8057776244.0, + "step": 15763 + }, + { + "epoch": 4.262844780962682, + "grad_norm": 0.9644772410392761, + "learning_rate": 3.0074096356047994e-06, + "loss": 1.7585, + "mean_token_accuracy": 0.57509446144104, + "num_tokens": 8058294917.0, + "step": 15764 + }, + { + "epoch": 4.2631151974040025, + "grad_norm": 0.9745551943778992, + "learning_rate": 3.0066850197212845e-06, + "loss": 1.7089, + "mean_token_accuracy": 0.6065457463264465, + "num_tokens": 8058819046.0, + "step": 15765 + }, + { + "epoch": 4.263385613845322, + "grad_norm": 0.9763920903205872, + "learning_rate": 3.005960649096179e-06, + "loss": 1.8066, + "mean_token_accuracy": 0.5949824452400208, + "num_tokens": 8059343069.0, + "step": 15766 + }, + { + "epoch": 4.263656030286642, + "grad_norm": 0.908302366733551, + "learning_rate": 3.0052365237517088e-06, + "loss": 1.7841, + "mean_token_accuracy": 0.5855088233947754, + "num_tokens": 8059782587.0, + "step": 15767 + }, + { + "epoch": 4.263926446727961, + "grad_norm": 1.028349757194519, + "learning_rate": 3.004512643710094e-06, + "loss": 1.7659, + "mean_token_accuracy": 0.5998942852020264, + "num_tokens": 8060267037.0, + "step": 15768 + }, + { + "epoch": 4.264196863169281, + "grad_norm": 0.8243272304534912, + "learning_rate": 3.003789008993546e-06, + "loss": 1.833, + "mean_token_accuracy": 0.5711351037025452, + "num_tokens": 8060791317.0, + "step": 15769 + }, + { + "epoch": 4.2644672796106, + "grad_norm": 1.004533052444458, + "learning_rate": 3.003065619624263e-06, + "loss": 1.825, + "mean_token_accuracy": 0.5908225774765015, + "num_tokens": 8061241884.0, + "step": 15770 + }, + { + "epoch": 4.26473769605192, + "grad_norm": 0.8109115958213806, + "learning_rate": 3.0023424756244467e-06, + "loss": 1.8937, + "mean_token_accuracy": 0.5629119873046875, + "num_tokens": 8061766099.0, + "step": 15771 + }, + { + "epoch": 4.2650081124932395, + "grad_norm": 1.0045287609100342, + "learning_rate": 3.0016195770162816e-06, + "loss": 1.7479, + "mean_token_accuracy": 0.5736223459243774, + "num_tokens": 8062290310.0, + "step": 15772 + }, + { + "epoch": 4.26527852893456, + "grad_norm": 0.8821856379508972, + "learning_rate": 3.000896923821949e-06, + "loss": 1.8274, + "mean_token_accuracy": 0.5709657669067383, + "num_tokens": 8062814376.0, + "step": 15773 + }, + { + "epoch": 4.265548945375879, + "grad_norm": 0.7942702174186707, + "learning_rate": 3.000174516063624e-06, + "loss": 1.7552, + "mean_token_accuracy": 0.5947706699371338, + "num_tokens": 8063338643.0, + "step": 15774 + }, + { + "epoch": 4.265819361817199, + "grad_norm": 0.8957757949829102, + "learning_rate": 2.999452353763471e-06, + "loss": 1.7397, + "mean_token_accuracy": 0.59601891040802, + "num_tokens": 8063862882.0, + "step": 15775 + }, + { + "epoch": 4.266089778258518, + "grad_norm": 0.8796368837356567, + "learning_rate": 2.9987304369436464e-06, + "loss": 1.8309, + "mean_token_accuracy": 0.5782124400138855, + "num_tokens": 8064361693.0, + "step": 15776 + }, + { + "epoch": 4.266360194699837, + "grad_norm": 0.9485512971878052, + "learning_rate": 2.9980087656263045e-06, + "loss": 1.8659, + "mean_token_accuracy": 0.5761712789535522, + "num_tokens": 8064885904.0, + "step": 15777 + }, + { + "epoch": 4.266630611141157, + "grad_norm": 0.9367539882659912, + "learning_rate": 2.9972873398335838e-06, + "loss": 1.8994, + "mean_token_accuracy": 0.5660222768783569, + "num_tokens": 8065410187.0, + "step": 15778 + }, + { + "epoch": 4.2669010275824775, + "grad_norm": 0.9231092929840088, + "learning_rate": 2.996566159587623e-06, + "loss": 1.8849, + "mean_token_accuracy": 0.5521160960197449, + "num_tokens": 8065903846.0, + "step": 15779 + }, + { + "epoch": 4.267171444023797, + "grad_norm": 0.8626428246498108, + "learning_rate": 2.9958452249105497e-06, + "loss": 1.7306, + "mean_token_accuracy": 0.6043357253074646, + "num_tokens": 8066428088.0, + "step": 15780 + }, + { + "epoch": 4.267441860465116, + "grad_norm": 0.34620457887649536, + "learning_rate": 2.9951245358244816e-06, + "loss": 1.0746, + "mean_token_accuracy": 0.7152453064918518, + "num_tokens": 8066952180.0, + "step": 15781 + }, + { + "epoch": 4.267712276906436, + "grad_norm": 0.9253663420677185, + "learning_rate": 2.994404092351536e-06, + "loss": 1.8321, + "mean_token_accuracy": 0.5644670724868774, + "num_tokens": 8067476353.0, + "step": 15782 + }, + { + "epoch": 4.267982693347755, + "grad_norm": 1.1806033849716187, + "learning_rate": 2.993683894513815e-06, + "loss": 1.8766, + "mean_token_accuracy": 0.5621358156204224, + "num_tokens": 8068000519.0, + "step": 15783 + }, + { + "epoch": 4.268253109789075, + "grad_norm": 0.8634838461875916, + "learning_rate": 2.9929639423334157e-06, + "loss": 1.7105, + "mean_token_accuracy": 0.6028591990470886, + "num_tokens": 8068524696.0, + "step": 15784 + }, + { + "epoch": 4.268523526230394, + "grad_norm": 0.986223578453064, + "learning_rate": 2.9922442358324318e-06, + "loss": 1.7819, + "mean_token_accuracy": 0.6011512279510498, + "num_tokens": 8069023837.0, + "step": 15785 + }, + { + "epoch": 4.2687939426717145, + "grad_norm": 0.9720627069473267, + "learning_rate": 2.9915247750329414e-06, + "loss": 1.7511, + "mean_token_accuracy": 0.5988948345184326, + "num_tokens": 8069548015.0, + "step": 15786 + }, + { + "epoch": 4.269064359113034, + "grad_norm": 0.8101304173469543, + "learning_rate": 2.9908055599570233e-06, + "loss": 1.7692, + "mean_token_accuracy": 0.5897619128227234, + "num_tokens": 8070017638.0, + "step": 15787 + }, + { + "epoch": 4.269334775554354, + "grad_norm": 0.8902228474617004, + "learning_rate": 2.990086590626745e-06, + "loss": 1.8669, + "mean_token_accuracy": 0.5584316253662109, + "num_tokens": 8070541863.0, + "step": 15788 + }, + { + "epoch": 4.269605191995673, + "grad_norm": 0.8609419465065002, + "learning_rate": 2.9893678670641663e-06, + "loss": 1.8512, + "mean_token_accuracy": 0.5762048959732056, + "num_tokens": 8071066071.0, + "step": 15789 + }, + { + "epoch": 4.269875608436993, + "grad_norm": 0.8750802874565125, + "learning_rate": 2.988649389291337e-06, + "loss": 1.818, + "mean_token_accuracy": 0.5816211700439453, + "num_tokens": 8071590257.0, + "step": 15790 + }, + { + "epoch": 4.270146024878312, + "grad_norm": 0.8641218543052673, + "learning_rate": 2.987931157330306e-06, + "loss": 1.7512, + "mean_token_accuracy": 0.5835084915161133, + "num_tokens": 8072114536.0, + "step": 15791 + }, + { + "epoch": 4.270416441319632, + "grad_norm": 0.7651070356369019, + "learning_rate": 2.98721317120311e-06, + "loss": 1.7116, + "mean_token_accuracy": 0.5949511528015137, + "num_tokens": 8072638546.0, + "step": 15792 + }, + { + "epoch": 4.270686857760952, + "grad_norm": 0.8886109590530396, + "learning_rate": 2.9864954309317755e-06, + "loss": 1.8497, + "mean_token_accuracy": 0.5794683694839478, + "num_tokens": 8073110556.0, + "step": 15793 + }, + { + "epoch": 4.270957274202272, + "grad_norm": 0.9747320413589478, + "learning_rate": 2.9857779365383287e-06, + "loss": 1.8051, + "mean_token_accuracy": 0.5859924554824829, + "num_tokens": 8073634769.0, + "step": 15794 + }, + { + "epoch": 4.271227690643591, + "grad_norm": 0.9400225281715393, + "learning_rate": 2.985060688044784e-06, + "loss": 1.7985, + "mean_token_accuracy": 0.5692086815834045, + "num_tokens": 8074158976.0, + "step": 15795 + }, + { + "epoch": 4.271498107084911, + "grad_norm": 0.8416931629180908, + "learning_rate": 2.9843436854731457e-06, + "loss": 1.9098, + "mean_token_accuracy": 0.5669549703598022, + "num_tokens": 8074683248.0, + "step": 15796 + }, + { + "epoch": 4.27176852352623, + "grad_norm": 0.966415286064148, + "learning_rate": 2.9836269288454168e-06, + "loss": 1.7933, + "mean_token_accuracy": 0.5626919269561768, + "num_tokens": 8075207516.0, + "step": 15797 + }, + { + "epoch": 4.27203893996755, + "grad_norm": 0.8339337110519409, + "learning_rate": 2.9829104181835895e-06, + "loss": 1.7936, + "mean_token_accuracy": 0.594006359577179, + "num_tokens": 8075731619.0, + "step": 15798 + }, + { + "epoch": 4.272309356408869, + "grad_norm": 1.1078882217407227, + "learning_rate": 2.9821941535096443e-06, + "loss": 1.9778, + "mean_token_accuracy": 0.5623621940612793, + "num_tokens": 8076171692.0, + "step": 15799 + }, + { + "epoch": 4.2725797728501895, + "grad_norm": 0.8741810321807861, + "learning_rate": 2.9814781348455623e-06, + "loss": 1.8009, + "mean_token_accuracy": 0.578781008720398, + "num_tokens": 8076695930.0, + "step": 15800 + }, + { + "epoch": 4.272850189291509, + "grad_norm": 0.41350188851356506, + "learning_rate": 2.9807623622133138e-06, + "loss": 1.1544, + "mean_token_accuracy": 0.6818783283233643, + "num_tokens": 8077196271.0, + "step": 15801 + }, + { + "epoch": 4.273120605732829, + "grad_norm": 0.7859947681427002, + "learning_rate": 2.9800468356348563e-06, + "loss": 1.8384, + "mean_token_accuracy": 0.5619188547134399, + "num_tokens": 8077720521.0, + "step": 15802 + }, + { + "epoch": 4.273391022174148, + "grad_norm": 1.0098518133163452, + "learning_rate": 2.9793315551321496e-06, + "loss": 1.681, + "mean_token_accuracy": 0.6091834306716919, + "num_tokens": 8078208888.0, + "step": 15803 + }, + { + "epoch": 4.273661438615468, + "grad_norm": 0.8758094310760498, + "learning_rate": 2.9786165207271354e-06, + "loss": 1.8283, + "mean_token_accuracy": 0.5630900263786316, + "num_tokens": 8078733145.0, + "step": 15804 + }, + { + "epoch": 4.273931855056787, + "grad_norm": 0.8198479413986206, + "learning_rate": 2.9779017324417593e-06, + "loss": 1.9359, + "mean_token_accuracy": 0.5665735006332397, + "num_tokens": 8079196120.0, + "step": 15805 + }, + { + "epoch": 4.274202271498107, + "grad_norm": 0.8161959648132324, + "learning_rate": 2.977187190297947e-06, + "loss": 1.7929, + "mean_token_accuracy": 0.5662022233009338, + "num_tokens": 8079720399.0, + "step": 15806 + }, + { + "epoch": 4.274472687939427, + "grad_norm": 0.9313225746154785, + "learning_rate": 2.976472894317625e-06, + "loss": 1.9205, + "mean_token_accuracy": 0.5647132396697998, + "num_tokens": 8080216453.0, + "step": 15807 + }, + { + "epoch": 4.274743104380747, + "grad_norm": 0.7822522521018982, + "learning_rate": 2.975758844522712e-06, + "loss": 1.8296, + "mean_token_accuracy": 0.5745280981063843, + "num_tokens": 8080740657.0, + "step": 15808 + }, + { + "epoch": 4.275013520822066, + "grad_norm": 1.000999093055725, + "learning_rate": 2.9750450409351157e-06, + "loss": 1.8342, + "mean_token_accuracy": 0.577022135257721, + "num_tokens": 8081264654.0, + "step": 15809 + }, + { + "epoch": 4.275283937263386, + "grad_norm": 0.825603723526001, + "learning_rate": 2.974331483576735e-06, + "loss": 1.9425, + "mean_token_accuracy": 0.5545674562454224, + "num_tokens": 8081788931.0, + "step": 15810 + }, + { + "epoch": 4.275554353704705, + "grad_norm": 0.8182373046875, + "learning_rate": 2.973618172469468e-06, + "loss": 1.6749, + "mean_token_accuracy": 0.6059765815734863, + "num_tokens": 8082313031.0, + "step": 15811 + }, + { + "epoch": 4.275824770146025, + "grad_norm": 0.8605077266693115, + "learning_rate": 2.9729051076352e-06, + "loss": 1.7818, + "mean_token_accuracy": 0.5585926175117493, + "num_tokens": 8082837289.0, + "step": 15812 + }, + { + "epoch": 4.276095186587344, + "grad_norm": 0.8804932832717896, + "learning_rate": 2.972192289095808e-06, + "loss": 1.77, + "mean_token_accuracy": 0.5767143964767456, + "num_tokens": 8083361408.0, + "step": 15813 + }, + { + "epoch": 4.2763656030286645, + "grad_norm": 0.8670743703842163, + "learning_rate": 2.9714797168731656e-06, + "loss": 1.9007, + "mean_token_accuracy": 0.5594855546951294, + "num_tokens": 8083885659.0, + "step": 15814 + }, + { + "epoch": 4.276636019469984, + "grad_norm": 0.8535037636756897, + "learning_rate": 2.970767390989135e-06, + "loss": 1.7945, + "mean_token_accuracy": 0.5944042205810547, + "num_tokens": 8084352564.0, + "step": 15815 + }, + { + "epoch": 4.276906435911304, + "grad_norm": 1.1821445226669312, + "learning_rate": 2.970055311465573e-06, + "loss": 1.5107, + "mean_token_accuracy": 0.6359910368919373, + "num_tokens": 8084876849.0, + "step": 15816 + }, + { + "epoch": 4.277176852352623, + "grad_norm": 0.8178868293762207, + "learning_rate": 2.969343478324329e-06, + "loss": 1.9039, + "mean_token_accuracy": 0.5698983669281006, + "num_tokens": 8085401018.0, + "step": 15817 + }, + { + "epoch": 4.277447268793942, + "grad_norm": 0.8652157187461853, + "learning_rate": 2.9686318915872446e-06, + "loss": 1.802, + "mean_token_accuracy": 0.5965976715087891, + "num_tokens": 8085867646.0, + "step": 15818 + }, + { + "epoch": 4.277717685235262, + "grad_norm": 1.0549991130828857, + "learning_rate": 2.9679205512761523e-06, + "loss": 1.8391, + "mean_token_accuracy": 0.5689720511436462, + "num_tokens": 8086391908.0, + "step": 15819 + }, + { + "epoch": 4.277988101676582, + "grad_norm": 1.131965160369873, + "learning_rate": 2.9672094574128786e-06, + "loss": 1.9415, + "mean_token_accuracy": 0.5341248512268066, + "num_tokens": 8086916190.0, + "step": 15820 + }, + { + "epoch": 4.278258518117902, + "grad_norm": 0.336997389793396, + "learning_rate": 2.9664986100192418e-06, + "loss": 1.1249, + "mean_token_accuracy": 0.7037519216537476, + "num_tokens": 8087440433.0, + "step": 15821 + }, + { + "epoch": 4.278528934559221, + "grad_norm": 0.9969682693481445, + "learning_rate": 2.96578800911705e-06, + "loss": 1.8462, + "mean_token_accuracy": 0.5710241794586182, + "num_tokens": 8087964590.0, + "step": 15822 + }, + { + "epoch": 4.278799351000541, + "grad_norm": 0.8053178191184998, + "learning_rate": 2.9650776547281123e-06, + "loss": 1.8138, + "mean_token_accuracy": 0.5817698836326599, + "num_tokens": 8088488654.0, + "step": 15823 + }, + { + "epoch": 4.27906976744186, + "grad_norm": 1.319175124168396, + "learning_rate": 2.964367546874219e-06, + "loss": 1.9488, + "mean_token_accuracy": 0.5480359196662903, + "num_tokens": 8089012922.0, + "step": 15824 + }, + { + "epoch": 4.27934018388318, + "grad_norm": 0.8866209387779236, + "learning_rate": 2.963657685577162e-06, + "loss": 1.7516, + "mean_token_accuracy": 0.5810984969139099, + "num_tokens": 8089537129.0, + "step": 15825 + }, + { + "epoch": 4.279610600324499, + "grad_norm": 0.9330477714538574, + "learning_rate": 2.962948070858721e-06, + "loss": 1.7793, + "mean_token_accuracy": 0.5814226269721985, + "num_tokens": 8090040567.0, + "step": 15826 + }, + { + "epoch": 4.2798810167658194, + "grad_norm": 0.9600873589515686, + "learning_rate": 2.9622387027406674e-06, + "loss": 1.737, + "mean_token_accuracy": 0.5660172700881958, + "num_tokens": 8090564777.0, + "step": 15827 + }, + { + "epoch": 4.280151433207139, + "grad_norm": 0.8626407384872437, + "learning_rate": 2.9615295812447692e-06, + "loss": 1.8367, + "mean_token_accuracy": 0.5746595859527588, + "num_tokens": 8091088926.0, + "step": 15828 + }, + { + "epoch": 4.280421849648459, + "grad_norm": 0.8025882840156555, + "learning_rate": 2.9608207063927815e-06, + "loss": 1.9261, + "mean_token_accuracy": 0.5569972395896912, + "num_tokens": 8091613090.0, + "step": 15829 + }, + { + "epoch": 4.280692266089778, + "grad_norm": 0.8535448312759399, + "learning_rate": 2.960112078206455e-06, + "loss": 2.0077, + "mean_token_accuracy": 0.5497647523880005, + "num_tokens": 8092137178.0, + "step": 15830 + }, + { + "epoch": 4.280962682531098, + "grad_norm": 1.0997569561004639, + "learning_rate": 2.9594036967075366e-06, + "loss": 1.9032, + "mean_token_accuracy": 0.5782267451286316, + "num_tokens": 8092611337.0, + "step": 15831 + }, + { + "epoch": 4.281233098972417, + "grad_norm": 0.8747401833534241, + "learning_rate": 2.958695561917758e-06, + "loss": 1.8523, + "mean_token_accuracy": 0.5668717622756958, + "num_tokens": 8093091601.0, + "step": 15832 + }, + { + "epoch": 4.281503515413737, + "grad_norm": 0.8427963256835938, + "learning_rate": 2.9579876738588465e-06, + "loss": 1.9247, + "mean_token_accuracy": 0.5696325302124023, + "num_tokens": 8093615873.0, + "step": 15833 + }, + { + "epoch": 4.2817739318550565, + "grad_norm": 1.0073860883712769, + "learning_rate": 2.9572800325525243e-06, + "loss": 1.7884, + "mean_token_accuracy": 0.6050682067871094, + "num_tokens": 8094091493.0, + "step": 15834 + }, + { + "epoch": 4.282044348296377, + "grad_norm": 0.977263867855072, + "learning_rate": 2.9565726380205027e-06, + "loss": 1.878, + "mean_token_accuracy": 0.568435788154602, + "num_tokens": 8094615768.0, + "step": 15835 + }, + { + "epoch": 4.282314764737696, + "grad_norm": 0.851309597492218, + "learning_rate": 2.9558654902844852e-06, + "loss": 1.596, + "mean_token_accuracy": 0.6182220578193665, + "num_tokens": 8095140005.0, + "step": 15836 + }, + { + "epoch": 4.282585181179016, + "grad_norm": 0.7846018671989441, + "learning_rate": 2.9551585893661733e-06, + "loss": 1.8422, + "mean_token_accuracy": 0.5711818337440491, + "num_tokens": 8095664191.0, + "step": 15837 + }, + { + "epoch": 4.282855597620335, + "grad_norm": 1.07059645652771, + "learning_rate": 2.9544519352872535e-06, + "loss": 1.9613, + "mean_token_accuracy": 0.5831485986709595, + "num_tokens": 8096125650.0, + "step": 15838 + }, + { + "epoch": 4.283126014061655, + "grad_norm": 0.9348751306533813, + "learning_rate": 2.9537455280694065e-06, + "loss": 1.8542, + "mean_token_accuracy": 0.567426323890686, + "num_tokens": 8096649827.0, + "step": 15839 + }, + { + "epoch": 4.283396430502974, + "grad_norm": 0.8335037231445312, + "learning_rate": 2.953039367734312e-06, + "loss": 1.848, + "mean_token_accuracy": 0.574310302734375, + "num_tokens": 8097174048.0, + "step": 15840 + }, + { + "epoch": 4.2836668469442944, + "grad_norm": 0.321107417345047, + "learning_rate": 2.9523334543036335e-06, + "loss": 1.0509, + "mean_token_accuracy": 0.7169449925422668, + "num_tokens": 8097698160.0, + "step": 15841 + }, + { + "epoch": 4.283937263385614, + "grad_norm": 0.9213724136352539, + "learning_rate": 2.9516277877990303e-06, + "loss": 1.8323, + "mean_token_accuracy": 0.5393328666687012, + "num_tokens": 8098222167.0, + "step": 15842 + }, + { + "epoch": 4.284207679826934, + "grad_norm": 0.8407071232795715, + "learning_rate": 2.9509223682421547e-06, + "loss": 1.7837, + "mean_token_accuracy": 0.6136616468429565, + "num_tokens": 8098746313.0, + "step": 15843 + }, + { + "epoch": 4.284478096268253, + "grad_norm": 0.914840817451477, + "learning_rate": 2.950217195654653e-06, + "loss": 1.7846, + "mean_token_accuracy": 0.5610374212265015, + "num_tokens": 8099270516.0, + "step": 15844 + }, + { + "epoch": 4.284748512709573, + "grad_norm": 0.8033846616744995, + "learning_rate": 2.949512270058159e-06, + "loss": 1.7688, + "mean_token_accuracy": 0.5896005630493164, + "num_tokens": 8099755879.0, + "step": 15845 + }, + { + "epoch": 4.285018929150892, + "grad_norm": 0.8701907992362976, + "learning_rate": 2.9488075914743064e-06, + "loss": 1.8865, + "mean_token_accuracy": 0.5675041675567627, + "num_tokens": 8100280050.0, + "step": 15846 + }, + { + "epoch": 4.285289345592212, + "grad_norm": 0.8256609439849854, + "learning_rate": 2.9481031599247113e-06, + "loss": 1.9426, + "mean_token_accuracy": 0.5667048692703247, + "num_tokens": 8100804224.0, + "step": 15847 + }, + { + "epoch": 4.2855597620335315, + "grad_norm": 0.7688664197921753, + "learning_rate": 2.9473989754309913e-06, + "loss": 1.8884, + "mean_token_accuracy": 0.5695316791534424, + "num_tokens": 8101328470.0, + "step": 15848 + }, + { + "epoch": 4.285830178474852, + "grad_norm": 0.8112910389900208, + "learning_rate": 2.946695038014753e-06, + "loss": 1.8774, + "mean_token_accuracy": 0.5555589199066162, + "num_tokens": 8101844387.0, + "step": 15849 + }, + { + "epoch": 4.286100594916171, + "grad_norm": 0.820608377456665, + "learning_rate": 2.945991347697592e-06, + "loss": 1.8064, + "mean_token_accuracy": 0.5822240114212036, + "num_tokens": 8102368634.0, + "step": 15850 + }, + { + "epoch": 4.286371011357491, + "grad_norm": 0.8465803265571594, + "learning_rate": 2.945287904501104e-06, + "loss": 1.9115, + "mean_token_accuracy": 0.5504176616668701, + "num_tokens": 8102861580.0, + "step": 15851 + }, + { + "epoch": 4.28664142779881, + "grad_norm": 0.9721680283546448, + "learning_rate": 2.9445847084468698e-06, + "loss": 1.7901, + "mean_token_accuracy": 0.5995051860809326, + "num_tokens": 8103323836.0, + "step": 15852 + }, + { + "epoch": 4.28691184424013, + "grad_norm": 0.8386605978012085, + "learning_rate": 2.9438817595564645e-06, + "loss": 1.7715, + "mean_token_accuracy": 0.5849733352661133, + "num_tokens": 8103848008.0, + "step": 15853 + }, + { + "epoch": 4.287182260681449, + "grad_norm": 0.8475130200386047, + "learning_rate": 2.94317905785146e-06, + "loss": 1.9064, + "mean_token_accuracy": 0.5554007887840271, + "num_tokens": 8104341710.0, + "step": 15854 + }, + { + "epoch": 4.2874526771227695, + "grad_norm": 0.9143393039703369, + "learning_rate": 2.942476603353417e-06, + "loss": 1.9062, + "mean_token_accuracy": 0.5504640340805054, + "num_tokens": 8104865835.0, + "step": 15855 + }, + { + "epoch": 4.287723093564089, + "grad_norm": 0.7982882857322693, + "learning_rate": 2.941774396083884e-06, + "loss": 1.8332, + "mean_token_accuracy": 0.5830157995223999, + "num_tokens": 8105390111.0, + "step": 15856 + }, + { + "epoch": 4.287993510005409, + "grad_norm": 0.9596551656723022, + "learning_rate": 2.9410724360644106e-06, + "loss": 1.7497, + "mean_token_accuracy": 0.5922637581825256, + "num_tokens": 8105871638.0, + "step": 15857 + }, + { + "epoch": 4.288263926446728, + "grad_norm": 0.7389838695526123, + "learning_rate": 2.9403707233165367e-06, + "loss": 1.6389, + "mean_token_accuracy": 0.6113969683647156, + "num_tokens": 8106395773.0, + "step": 15858 + }, + { + "epoch": 4.288534342888047, + "grad_norm": 0.7946953773498535, + "learning_rate": 2.9396692578617884e-06, + "loss": 1.7896, + "mean_token_accuracy": 0.5969589352607727, + "num_tokens": 8106919900.0, + "step": 15859 + }, + { + "epoch": 4.288804759329367, + "grad_norm": 0.8689219951629639, + "learning_rate": 2.938968039721694e-06, + "loss": 1.8627, + "mean_token_accuracy": 0.5572963953018188, + "num_tokens": 8107444156.0, + "step": 15860 + }, + { + "epoch": 4.289075175770687, + "grad_norm": 0.3176876902580261, + "learning_rate": 2.938267068917765e-06, + "loss": 1.0362, + "mean_token_accuracy": 0.7198675870895386, + "num_tokens": 8107968432.0, + "step": 15861 + }, + { + "epoch": 4.2893455922120065, + "grad_norm": 0.8638576865196228, + "learning_rate": 2.9375663454715088e-06, + "loss": 1.8032, + "mean_token_accuracy": 0.5880104303359985, + "num_tokens": 8108492668.0, + "step": 15862 + }, + { + "epoch": 4.289616008653326, + "grad_norm": 0.7991220951080322, + "learning_rate": 2.9368658694044294e-06, + "loss": 1.8159, + "mean_token_accuracy": 0.5843700170516968, + "num_tokens": 8109016913.0, + "step": 15863 + }, + { + "epoch": 4.289886425094646, + "grad_norm": 0.8710989952087402, + "learning_rate": 2.9361656407380167e-06, + "loss": 1.8102, + "mean_token_accuracy": 0.588998556137085, + "num_tokens": 8109487117.0, + "step": 15864 + }, + { + "epoch": 4.290156841535965, + "grad_norm": 0.8274568319320679, + "learning_rate": 2.935465659493755e-06, + "loss": 1.8455, + "mean_token_accuracy": 0.5861338973045349, + "num_tokens": 8110011396.0, + "step": 15865 + }, + { + "epoch": 4.290427257977285, + "grad_norm": 0.8239126801490784, + "learning_rate": 2.9347659256931256e-06, + "loss": 1.8884, + "mean_token_accuracy": 0.5416690707206726, + "num_tokens": 8110535594.0, + "step": 15866 + }, + { + "epoch": 4.290697674418604, + "grad_norm": 0.9386093020439148, + "learning_rate": 2.9340664393575948e-06, + "loss": 1.8516, + "mean_token_accuracy": 0.5689563751220703, + "num_tokens": 8111059662.0, + "step": 15867 + }, + { + "epoch": 4.290968090859924, + "grad_norm": 0.8703566193580627, + "learning_rate": 2.9333672005086244e-06, + "loss": 1.8902, + "mean_token_accuracy": 0.563651442527771, + "num_tokens": 8111571592.0, + "step": 15868 + }, + { + "epoch": 4.291238507301244, + "grad_norm": 0.8949381709098816, + "learning_rate": 2.932668209167673e-06, + "loss": 1.7675, + "mean_token_accuracy": 0.5898301005363464, + "num_tokens": 8112095856.0, + "step": 15869 + }, + { + "epoch": 4.291508923742564, + "grad_norm": 0.9464347958564758, + "learning_rate": 2.931969465356184e-06, + "loss": 1.8406, + "mean_token_accuracy": 0.580428957939148, + "num_tokens": 8112620136.0, + "step": 15870 + }, + { + "epoch": 4.291779340183883, + "grad_norm": 0.9460494518280029, + "learning_rate": 2.931270969095599e-06, + "loss": 1.9211, + "mean_token_accuracy": 0.5639302134513855, + "num_tokens": 8113144387.0, + "step": 15871 + }, + { + "epoch": 4.292049756625203, + "grad_norm": 0.968805193901062, + "learning_rate": 2.930572720407348e-06, + "loss": 1.9091, + "mean_token_accuracy": 0.5741069316864014, + "num_tokens": 8113613841.0, + "step": 15872 + }, + { + "epoch": 4.292320173066522, + "grad_norm": 0.8903757929801941, + "learning_rate": 2.929874719312856e-06, + "loss": 1.8026, + "mean_token_accuracy": 0.5941352248191833, + "num_tokens": 8114138093.0, + "step": 15873 + }, + { + "epoch": 4.292590589507842, + "grad_norm": 1.1428470611572266, + "learning_rate": 2.9291769658335423e-06, + "loss": 1.8353, + "mean_token_accuracy": 0.5721765756607056, + "num_tokens": 8114614772.0, + "step": 15874 + }, + { + "epoch": 4.292861005949161, + "grad_norm": 1.1933436393737793, + "learning_rate": 2.9284794599908144e-06, + "loss": 1.6542, + "mean_token_accuracy": 0.5787663459777832, + "num_tokens": 8115139031.0, + "step": 15875 + }, + { + "epoch": 4.2931314223904815, + "grad_norm": 0.9120476245880127, + "learning_rate": 2.927782201806072e-06, + "loss": 1.8479, + "mean_token_accuracy": 0.5757433176040649, + "num_tokens": 8115663251.0, + "step": 15876 + }, + { + "epoch": 4.293401838831801, + "grad_norm": 1.0934299230575562, + "learning_rate": 2.9270851913007107e-06, + "loss": 1.8258, + "mean_token_accuracy": 0.5768864750862122, + "num_tokens": 8116187398.0, + "step": 15877 + }, + { + "epoch": 4.293672255273121, + "grad_norm": 0.8751406073570251, + "learning_rate": 2.926388428496118e-06, + "loss": 1.5702, + "mean_token_accuracy": 0.6298061013221741, + "num_tokens": 8116711635.0, + "step": 15878 + }, + { + "epoch": 4.29394267171444, + "grad_norm": 1.001861333847046, + "learning_rate": 2.92569191341367e-06, + "loss": 1.8844, + "mean_token_accuracy": 0.5609232187271118, + "num_tokens": 8117235887.0, + "step": 15879 + }, + { + "epoch": 4.29421308815576, + "grad_norm": 0.7198852896690369, + "learning_rate": 2.9249956460747397e-06, + "loss": 1.7333, + "mean_token_accuracy": 0.5980902910232544, + "num_tokens": 8117760078.0, + "step": 15880 + }, + { + "epoch": 4.294483504597079, + "grad_norm": 0.35179901123046875, + "learning_rate": 2.924299626500691e-06, + "loss": 1.097, + "mean_token_accuracy": 0.7031122446060181, + "num_tokens": 8118284180.0, + "step": 15881 + }, + { + "epoch": 4.294753921038399, + "grad_norm": 0.8867132067680359, + "learning_rate": 2.9236038547128757e-06, + "loss": 1.7488, + "mean_token_accuracy": 0.5992822647094727, + "num_tokens": 8118797253.0, + "step": 15882 + }, + { + "epoch": 4.295024337479719, + "grad_norm": 1.0174680948257446, + "learning_rate": 2.922908330732648e-06, + "loss": 1.831, + "mean_token_accuracy": 0.5859870910644531, + "num_tokens": 8119258285.0, + "step": 15883 + }, + { + "epoch": 4.295294753921039, + "grad_norm": 0.933321475982666, + "learning_rate": 2.9222130545813453e-06, + "loss": 1.7351, + "mean_token_accuracy": 0.5792841911315918, + "num_tokens": 8119782500.0, + "step": 15884 + }, + { + "epoch": 4.295565170362358, + "grad_norm": 0.8077302575111389, + "learning_rate": 2.9215180262803004e-06, + "loss": 1.8951, + "mean_token_accuracy": 0.5657650232315063, + "num_tokens": 8120266396.0, + "step": 15885 + }, + { + "epoch": 4.295835586803678, + "grad_norm": 0.857815146446228, + "learning_rate": 2.92082324585084e-06, + "loss": 1.849, + "mean_token_accuracy": 0.5743446946144104, + "num_tokens": 8120790548.0, + "step": 15886 + }, + { + "epoch": 4.296106003244997, + "grad_norm": 0.8568466901779175, + "learning_rate": 2.920128713314282e-06, + "loss": 1.8754, + "mean_token_accuracy": 0.5682780742645264, + "num_tokens": 8121314719.0, + "step": 15887 + }, + { + "epoch": 4.296376419686317, + "grad_norm": 1.1455458402633667, + "learning_rate": 2.9194344286919356e-06, + "loss": 1.8844, + "mean_token_accuracy": 0.5784240961074829, + "num_tokens": 8121838888.0, + "step": 15888 + }, + { + "epoch": 4.296646836127636, + "grad_norm": 0.9729211926460266, + "learning_rate": 2.9187403920051064e-06, + "loss": 1.9037, + "mean_token_accuracy": 0.5784316658973694, + "num_tokens": 8122341103.0, + "step": 15889 + }, + { + "epoch": 4.2969172525689565, + "grad_norm": 0.9353715181350708, + "learning_rate": 2.918046603275087e-06, + "loss": 1.796, + "mean_token_accuracy": 0.58687424659729, + "num_tokens": 8122865191.0, + "step": 15890 + }, + { + "epoch": 4.297187669010276, + "grad_norm": 0.9218816161155701, + "learning_rate": 2.9173530625231627e-06, + "loss": 1.9477, + "mean_token_accuracy": 0.5519866347312927, + "num_tokens": 8123389464.0, + "step": 15891 + }, + { + "epoch": 4.297458085451596, + "grad_norm": 0.9467753171920776, + "learning_rate": 2.9166597697706187e-06, + "loss": 1.8149, + "mean_token_accuracy": 0.5870577096939087, + "num_tokens": 8123913648.0, + "step": 15892 + }, + { + "epoch": 4.297728501892915, + "grad_norm": 0.9424861073493958, + "learning_rate": 2.9159667250387225e-06, + "loss": 1.7605, + "mean_token_accuracy": 0.5963852405548096, + "num_tokens": 8124437797.0, + "step": 15893 + }, + { + "epoch": 4.297998918334235, + "grad_norm": 0.823499321937561, + "learning_rate": 2.9152739283487422e-06, + "loss": 1.7179, + "mean_token_accuracy": 0.5938311815261841, + "num_tokens": 8124961881.0, + "step": 15894 + }, + { + "epoch": 4.298269334775554, + "grad_norm": 0.865078866481781, + "learning_rate": 2.9145813797219346e-06, + "loss": 1.8722, + "mean_token_accuracy": 0.5521125793457031, + "num_tokens": 8125486160.0, + "step": 15895 + }, + { + "epoch": 4.298539751216874, + "grad_norm": 0.9665138125419617, + "learning_rate": 2.9138890791795456e-06, + "loss": 1.7799, + "mean_token_accuracy": 0.5765770077705383, + "num_tokens": 8125970942.0, + "step": 15896 + }, + { + "epoch": 4.298810167658194, + "grad_norm": 0.9913576245307922, + "learning_rate": 2.91319702674282e-06, + "loss": 1.7972, + "mean_token_accuracy": 0.5786089897155762, + "num_tokens": 8126492258.0, + "step": 15897 + }, + { + "epoch": 4.299080584099514, + "grad_norm": 0.7973789572715759, + "learning_rate": 2.9125052224329932e-06, + "loss": 1.7901, + "mean_token_accuracy": 0.5804743766784668, + "num_tokens": 8127016504.0, + "step": 15898 + }, + { + "epoch": 4.299351000540833, + "grad_norm": 1.1639302968978882, + "learning_rate": 2.911813666271287e-06, + "loss": 1.8403, + "mean_token_accuracy": 0.5735957622528076, + "num_tokens": 8127540666.0, + "step": 15899 + }, + { + "epoch": 4.299621416982152, + "grad_norm": 1.0582586526870728, + "learning_rate": 2.9111223582789233e-06, + "loss": 1.8092, + "mean_token_accuracy": 0.5845527052879333, + "num_tokens": 8128064681.0, + "step": 15900 + }, + { + "epoch": 4.299891833423472, + "grad_norm": 0.3836839497089386, + "learning_rate": 2.9104312984771154e-06, + "loss": 1.0864, + "mean_token_accuracy": 0.7130126357078552, + "num_tokens": 8128588905.0, + "step": 15901 + }, + { + "epoch": 4.300162249864792, + "grad_norm": 0.9562731385231018, + "learning_rate": 2.9097404868870637e-06, + "loss": 1.7754, + "mean_token_accuracy": 0.5812138915061951, + "num_tokens": 8129113186.0, + "step": 15902 + }, + { + "epoch": 4.300432666306111, + "grad_norm": 0.7984776496887207, + "learning_rate": 2.9090499235299673e-06, + "loss": 1.7476, + "mean_token_accuracy": 0.5838866829872131, + "num_tokens": 8129585487.0, + "step": 15903 + }, + { + "epoch": 4.300703082747431, + "grad_norm": 1.0076264142990112, + "learning_rate": 2.9083596084270126e-06, + "loss": 1.9173, + "mean_token_accuracy": 0.5746424198150635, + "num_tokens": 8130079571.0, + "step": 15904 + }, + { + "epoch": 4.300973499188751, + "grad_norm": 0.8958116769790649, + "learning_rate": 2.907669541599381e-06, + "loss": 1.7647, + "mean_token_accuracy": 0.6000007390975952, + "num_tokens": 8130603739.0, + "step": 15905 + }, + { + "epoch": 4.30124391563007, + "grad_norm": 1.1123608350753784, + "learning_rate": 2.906979723068246e-06, + "loss": 1.8643, + "mean_token_accuracy": 0.5748218894004822, + "num_tokens": 8131127947.0, + "step": 15906 + }, + { + "epoch": 4.30151433207139, + "grad_norm": 0.8788946270942688, + "learning_rate": 2.9062901528547734e-06, + "loss": 1.8334, + "mean_token_accuracy": 0.5792600512504578, + "num_tokens": 8131652133.0, + "step": 15907 + }, + { + "epoch": 4.301784748512709, + "grad_norm": 0.8156062960624695, + "learning_rate": 2.9056008309801196e-06, + "loss": 1.8158, + "mean_token_accuracy": 0.5877691507339478, + "num_tokens": 8132176409.0, + "step": 15908 + }, + { + "epoch": 4.302055164954029, + "grad_norm": 0.8355039954185486, + "learning_rate": 2.9049117574654384e-06, + "loss": 1.7957, + "mean_token_accuracy": 0.5918174982070923, + "num_tokens": 8132643196.0, + "step": 15909 + }, + { + "epoch": 4.3023255813953485, + "grad_norm": 1.026625394821167, + "learning_rate": 2.9042229323318703e-06, + "loss": 1.7831, + "mean_token_accuracy": 0.5936421155929565, + "num_tokens": 8133145514.0, + "step": 15910 + }, + { + "epoch": 4.302595997836669, + "grad_norm": 0.8439825773239136, + "learning_rate": 2.9035343556005485e-06, + "loss": 1.7391, + "mean_token_accuracy": 0.5730891227722168, + "num_tokens": 8133669580.0, + "step": 15911 + }, + { + "epoch": 4.302866414277988, + "grad_norm": 1.0837483406066895, + "learning_rate": 2.902846027292605e-06, + "loss": 1.4919, + "mean_token_accuracy": 0.6290822625160217, + "num_tokens": 8134115717.0, + "step": 15912 + }, + { + "epoch": 4.303136830719308, + "grad_norm": 1.2652757167816162, + "learning_rate": 2.902157947429156e-06, + "loss": 1.9195, + "mean_token_accuracy": 0.5637170076370239, + "num_tokens": 8134603282.0, + "step": 15913 + }, + { + "epoch": 4.303407247160627, + "grad_norm": 0.8576095700263977, + "learning_rate": 2.901470116031316e-06, + "loss": 1.6156, + "mean_token_accuracy": 0.6186865568161011, + "num_tokens": 8135118899.0, + "step": 15914 + }, + { + "epoch": 4.303677663601947, + "grad_norm": 0.8456058502197266, + "learning_rate": 2.9007825331201888e-06, + "loss": 1.7743, + "mean_token_accuracy": 0.5869170427322388, + "num_tokens": 8135643130.0, + "step": 15915 + }, + { + "epoch": 4.303948080043266, + "grad_norm": 0.916072428226471, + "learning_rate": 2.900095198716871e-06, + "loss": 1.6398, + "mean_token_accuracy": 0.6149842739105225, + "num_tokens": 8136132931.0, + "step": 15916 + }, + { + "epoch": 4.304218496484586, + "grad_norm": 0.9145296216011047, + "learning_rate": 2.899408112842455e-06, + "loss": 1.8136, + "mean_token_accuracy": 0.5979889631271362, + "num_tokens": 8136657026.0, + "step": 15917 + }, + { + "epoch": 4.304488912925906, + "grad_norm": 0.8606458306312561, + "learning_rate": 2.89872127551802e-06, + "loss": 1.8611, + "mean_token_accuracy": 0.5730669498443604, + "num_tokens": 8137181255.0, + "step": 15918 + }, + { + "epoch": 4.304759329367226, + "grad_norm": 0.7781980633735657, + "learning_rate": 2.8980346867646385e-06, + "loss": 1.8519, + "mean_token_accuracy": 0.5920724868774414, + "num_tokens": 8137689422.0, + "step": 15919 + }, + { + "epoch": 4.305029745808545, + "grad_norm": 0.7656370401382446, + "learning_rate": 2.8973483466033814e-06, + "loss": 1.8391, + "mean_token_accuracy": 0.592300295829773, + "num_tokens": 8138213559.0, + "step": 15920 + }, + { + "epoch": 4.305300162249865, + "grad_norm": 0.32128778100013733, + "learning_rate": 2.8966622550553046e-06, + "loss": 1.088, + "mean_token_accuracy": 0.7050958871841431, + "num_tokens": 8138737839.0, + "step": 15921 + }, + { + "epoch": 4.305570578691184, + "grad_norm": 0.9913583397865295, + "learning_rate": 2.8959764121414595e-06, + "loss": 1.8007, + "mean_token_accuracy": 0.5890331268310547, + "num_tokens": 8139219127.0, + "step": 15922 + }, + { + "epoch": 4.305840995132504, + "grad_norm": 0.8875507116317749, + "learning_rate": 2.895290817882892e-06, + "loss": 1.8574, + "mean_token_accuracy": 0.5798395872116089, + "num_tokens": 8139730700.0, + "step": 15923 + }, + { + "epoch": 4.3061114115738235, + "grad_norm": 0.7780248522758484, + "learning_rate": 2.8946054723006367e-06, + "loss": 1.7542, + "mean_token_accuracy": 0.5925337076187134, + "num_tokens": 8140243604.0, + "step": 15924 + }, + { + "epoch": 4.306381828015144, + "grad_norm": 0.7880702614784241, + "learning_rate": 2.89392037541572e-06, + "loss": 1.9395, + "mean_token_accuracy": 0.5401111245155334, + "num_tokens": 8140767788.0, + "step": 15925 + }, + { + "epoch": 4.306652244456463, + "grad_norm": 0.8965921998023987, + "learning_rate": 2.8932355272491652e-06, + "loss": 1.8914, + "mean_token_accuracy": 0.5697462558746338, + "num_tokens": 8141256765.0, + "step": 15926 + }, + { + "epoch": 4.306922660897783, + "grad_norm": 0.8354714512825012, + "learning_rate": 2.8925509278219845e-06, + "loss": 1.8788, + "mean_token_accuracy": 0.5624213218688965, + "num_tokens": 8141781025.0, + "step": 15927 + }, + { + "epoch": 4.307193077339102, + "grad_norm": 0.7947868704795837, + "learning_rate": 2.8918665771551853e-06, + "loss": 1.8254, + "mean_token_accuracy": 0.5856142640113831, + "num_tokens": 8142300308.0, + "step": 15928 + }, + { + "epoch": 4.307463493780422, + "grad_norm": 0.8789559602737427, + "learning_rate": 2.8911824752697614e-06, + "loss": 1.8217, + "mean_token_accuracy": 0.5754801034927368, + "num_tokens": 8142781781.0, + "step": 15929 + }, + { + "epoch": 4.307733910221741, + "grad_norm": 0.9593954682350159, + "learning_rate": 2.890498622186708e-06, + "loss": 1.8208, + "mean_token_accuracy": 0.5589122772216797, + "num_tokens": 8143306022.0, + "step": 15930 + }, + { + "epoch": 4.308004326663061, + "grad_norm": 0.9210253357887268, + "learning_rate": 2.8898150179270047e-06, + "loss": 1.6861, + "mean_token_accuracy": 0.627021312713623, + "num_tokens": 8143830281.0, + "step": 15931 + }, + { + "epoch": 4.308274743104381, + "grad_norm": 0.9058327078819275, + "learning_rate": 2.8891316625116273e-06, + "loss": 1.8628, + "mean_token_accuracy": 0.588467001914978, + "num_tokens": 8144354441.0, + "step": 15932 + }, + { + "epoch": 4.308545159545701, + "grad_norm": 0.7348705530166626, + "learning_rate": 2.888448555961544e-06, + "loss": 1.5965, + "mean_token_accuracy": 0.6365336179733276, + "num_tokens": 8144813684.0, + "step": 15933 + }, + { + "epoch": 4.30881557598702, + "grad_norm": 1.187400460243225, + "learning_rate": 2.8877656982977113e-06, + "loss": 1.7118, + "mean_token_accuracy": 0.5991266965866089, + "num_tokens": 8145314335.0, + "step": 15934 + }, + { + "epoch": 4.30908599242834, + "grad_norm": 1.0358223915100098, + "learning_rate": 2.8870830895410857e-06, + "loss": 1.8322, + "mean_token_accuracy": 0.57938551902771, + "num_tokens": 8145760241.0, + "step": 15935 + }, + { + "epoch": 4.309356408869659, + "grad_norm": 0.88126140832901, + "learning_rate": 2.8864007297126084e-06, + "loss": 1.8305, + "mean_token_accuracy": 0.5717917084693909, + "num_tokens": 8146284479.0, + "step": 15936 + }, + { + "epoch": 4.309626825310979, + "grad_norm": 0.883980393409729, + "learning_rate": 2.8857186188332163e-06, + "loss": 1.6215, + "mean_token_accuracy": 0.6135637760162354, + "num_tokens": 8146808757.0, + "step": 15937 + }, + { + "epoch": 4.3098972417522985, + "grad_norm": 0.9575917720794678, + "learning_rate": 2.8850367569238413e-06, + "loss": 1.828, + "mean_token_accuracy": 0.5720691680908203, + "num_tokens": 8147332780.0, + "step": 15938 + }, + { + "epoch": 4.310167658193619, + "grad_norm": 1.0162416696548462, + "learning_rate": 2.8843551440054013e-06, + "loss": 1.8119, + "mean_token_accuracy": 0.5954970121383667, + "num_tokens": 8147797794.0, + "step": 15939 + }, + { + "epoch": 4.310438074634938, + "grad_norm": 0.8917569518089294, + "learning_rate": 2.883673780098813e-06, + "loss": 1.7836, + "mean_token_accuracy": 0.5894289016723633, + "num_tokens": 8148321888.0, + "step": 15940 + }, + { + "epoch": 4.310708491076257, + "grad_norm": 0.3367104232311249, + "learning_rate": 2.8829926652249823e-06, + "loss": 1.0794, + "mean_token_accuracy": 0.710795521736145, + "num_tokens": 8148846063.0, + "step": 15941 + }, + { + "epoch": 4.310978907517577, + "grad_norm": 0.9505345821380615, + "learning_rate": 2.882311799404807e-06, + "loss": 1.8259, + "mean_token_accuracy": 0.5616682171821594, + "num_tokens": 8149370346.0, + "step": 15942 + }, + { + "epoch": 4.311249323958897, + "grad_norm": 1.1735702753067017, + "learning_rate": 2.8816311826591766e-06, + "loss": 1.9732, + "mean_token_accuracy": 0.5503944754600525, + "num_tokens": 8149894606.0, + "step": 15943 + }, + { + "epoch": 4.311519740400216, + "grad_norm": 0.8458353281021118, + "learning_rate": 2.8809508150089777e-06, + "loss": 1.8331, + "mean_token_accuracy": 0.5669549703598022, + "num_tokens": 8150418773.0, + "step": 15944 + }, + { + "epoch": 4.3117901568415355, + "grad_norm": 0.8199582099914551, + "learning_rate": 2.8802706964750828e-06, + "loss": 1.5862, + "mean_token_accuracy": 0.6189801692962646, + "num_tokens": 8150943036.0, + "step": 15945 + }, + { + "epoch": 4.312060573282856, + "grad_norm": 0.9580886960029602, + "learning_rate": 2.8795908270783635e-06, + "loss": 1.9245, + "mean_token_accuracy": 0.5630209445953369, + "num_tokens": 8151467278.0, + "step": 15946 + }, + { + "epoch": 4.312330989724175, + "grad_norm": 0.931352436542511, + "learning_rate": 2.878911206839678e-06, + "loss": 1.8366, + "mean_token_accuracy": 0.580016016960144, + "num_tokens": 8151981229.0, + "step": 15947 + }, + { + "epoch": 4.312601406165495, + "grad_norm": 0.92955482006073, + "learning_rate": 2.878231835779878e-06, + "loss": 1.7417, + "mean_token_accuracy": 0.5761591196060181, + "num_tokens": 8152505503.0, + "step": 15948 + }, + { + "epoch": 4.312871822606814, + "grad_norm": 1.0521353483200073, + "learning_rate": 2.8775527139198114e-06, + "loss": 1.8626, + "mean_token_accuracy": 0.5597087144851685, + "num_tokens": 8153029707.0, + "step": 15949 + }, + { + "epoch": 4.313142239048134, + "grad_norm": 0.7896300554275513, + "learning_rate": 2.876873841280313e-06, + "loss": 1.7456, + "mean_token_accuracy": 0.5837346315383911, + "num_tokens": 8153533380.0, + "step": 15950 + }, + { + "epoch": 4.313412655489453, + "grad_norm": 0.9614995718002319, + "learning_rate": 2.8761952178822123e-06, + "loss": 1.8122, + "mean_token_accuracy": 0.5614488124847412, + "num_tokens": 8154057521.0, + "step": 15951 + }, + { + "epoch": 4.3136830719307735, + "grad_norm": 0.900968611240387, + "learning_rate": 2.8755168437463356e-06, + "loss": 1.7968, + "mean_token_accuracy": 0.5837340950965881, + "num_tokens": 8154581772.0, + "step": 15952 + }, + { + "epoch": 4.313953488372093, + "grad_norm": 0.9197359085083008, + "learning_rate": 2.8748387188934944e-06, + "loss": 1.8234, + "mean_token_accuracy": 0.5808831453323364, + "num_tokens": 8155064208.0, + "step": 15953 + }, + { + "epoch": 4.314223904813413, + "grad_norm": 0.8630716800689697, + "learning_rate": 2.874160843344494e-06, + "loss": 1.8434, + "mean_token_accuracy": 0.5634669065475464, + "num_tokens": 8155588385.0, + "step": 15954 + }, + { + "epoch": 4.314494321254732, + "grad_norm": 0.7422308921813965, + "learning_rate": 2.8734832171201365e-06, + "loss": 1.8406, + "mean_token_accuracy": 0.5713128447532654, + "num_tokens": 8156112575.0, + "step": 15955 + }, + { + "epoch": 4.314764737696052, + "grad_norm": 0.9924066662788391, + "learning_rate": 2.872805840241212e-06, + "loss": 1.7773, + "mean_token_accuracy": 0.5835620760917664, + "num_tokens": 8156624145.0, + "step": 15956 + }, + { + "epoch": 4.315035154137371, + "grad_norm": 0.937238335609436, + "learning_rate": 2.872128712728504e-06, + "loss": 1.8918, + "mean_token_accuracy": 0.5738767385482788, + "num_tokens": 8157094236.0, + "step": 15957 + }, + { + "epoch": 4.315305570578691, + "grad_norm": 0.8885315656661987, + "learning_rate": 2.8714518346027918e-06, + "loss": 1.8654, + "mean_token_accuracy": 0.5595484972000122, + "num_tokens": 8157618278.0, + "step": 15958 + }, + { + "epoch": 4.3155759870200106, + "grad_norm": 0.930501401424408, + "learning_rate": 2.870775205884841e-06, + "loss": 1.877, + "mean_token_accuracy": 0.5737546682357788, + "num_tokens": 8158142471.0, + "step": 15959 + }, + { + "epoch": 4.315846403461331, + "grad_norm": 0.8330304026603699, + "learning_rate": 2.8700988265954125e-06, + "loss": 1.9675, + "mean_token_accuracy": 0.5421904921531677, + "num_tokens": 8158666752.0, + "step": 15960 + }, + { + "epoch": 4.31611681990265, + "grad_norm": 0.3801279067993164, + "learning_rate": 2.8694226967552618e-06, + "loss": 1.1451, + "mean_token_accuracy": 0.6806707382202148, + "num_tokens": 8159139763.0, + "step": 15961 + }, + { + "epoch": 4.31638723634397, + "grad_norm": 0.8591891527175903, + "learning_rate": 2.8687468163851316e-06, + "loss": 1.8044, + "mean_token_accuracy": 0.5838732719421387, + "num_tokens": 8159611286.0, + "step": 15962 + }, + { + "epoch": 4.316657652785289, + "grad_norm": 0.9986473321914673, + "learning_rate": 2.8680711855057635e-06, + "loss": 1.8529, + "mean_token_accuracy": 0.5789358615875244, + "num_tokens": 8160135434.0, + "step": 15963 + }, + { + "epoch": 4.316928069226609, + "grad_norm": 0.8431248068809509, + "learning_rate": 2.867395804137886e-06, + "loss": 1.8633, + "mean_token_accuracy": 0.5666567087173462, + "num_tokens": 8160659447.0, + "step": 15964 + }, + { + "epoch": 4.317198485667928, + "grad_norm": 0.7192694544792175, + "learning_rate": 2.866720672302219e-06, + "loss": 1.7782, + "mean_token_accuracy": 0.5863119959831238, + "num_tokens": 8161183623.0, + "step": 15965 + }, + { + "epoch": 4.3174689021092485, + "grad_norm": 0.9253131747245789, + "learning_rate": 2.8660457900194827e-06, + "loss": 1.8901, + "mean_token_accuracy": 0.5710580348968506, + "num_tokens": 8161707832.0, + "step": 15966 + }, + { + "epoch": 4.317739318550568, + "grad_norm": 0.927044153213501, + "learning_rate": 2.8653711573103817e-06, + "loss": 2.0033, + "mean_token_accuracy": 0.5661441087722778, + "num_tokens": 8162168330.0, + "step": 15967 + }, + { + "epoch": 4.318009734991888, + "grad_norm": 1.071592926979065, + "learning_rate": 2.8646967741956144e-06, + "loss": 1.9047, + "mean_token_accuracy": 0.5460039377212524, + "num_tokens": 8162692521.0, + "step": 15968 + }, + { + "epoch": 4.318280151433207, + "grad_norm": 0.9544832110404968, + "learning_rate": 2.8640226406958752e-06, + "loss": 1.8119, + "mean_token_accuracy": 0.5737118721008301, + "num_tokens": 8163216681.0, + "step": 15969 + }, + { + "epoch": 4.318550567874527, + "grad_norm": 0.9067662358283997, + "learning_rate": 2.8633487568318474e-06, + "loss": 1.8331, + "mean_token_accuracy": 0.5755307674407959, + "num_tokens": 8163739974.0, + "step": 15970 + }, + { + "epoch": 4.318820984315846, + "grad_norm": 0.9173509478569031, + "learning_rate": 2.8626751226242094e-06, + "loss": 1.8194, + "mean_token_accuracy": 0.5627931356430054, + "num_tokens": 8164264149.0, + "step": 15971 + }, + { + "epoch": 4.319091400757166, + "grad_norm": 0.9390804767608643, + "learning_rate": 2.8620017380936272e-06, + "loss": 1.8062, + "mean_token_accuracy": 0.5508593320846558, + "num_tokens": 8164788348.0, + "step": 15972 + }, + { + "epoch": 4.3193618171984856, + "grad_norm": 0.9608287811279297, + "learning_rate": 2.8613286032607656e-06, + "loss": 1.8345, + "mean_token_accuracy": 0.5735206604003906, + "num_tokens": 8165312625.0, + "step": 15973 + }, + { + "epoch": 4.319632233639806, + "grad_norm": 0.9437222480773926, + "learning_rate": 2.8606557181462746e-06, + "loss": 1.8186, + "mean_token_accuracy": 0.5886566638946533, + "num_tokens": 8165778757.0, + "step": 15974 + }, + { + "epoch": 4.319902650081125, + "grad_norm": 0.9251498579978943, + "learning_rate": 2.859983082770804e-06, + "loss": 1.7624, + "mean_token_accuracy": 0.6101195812225342, + "num_tokens": 8166239059.0, + "step": 15975 + }, + { + "epoch": 4.320173066522445, + "grad_norm": 0.9497489929199219, + "learning_rate": 2.859310697154991e-06, + "loss": 1.7259, + "mean_token_accuracy": 0.5904492139816284, + "num_tokens": 8166763178.0, + "step": 15976 + }, + { + "epoch": 4.320443482963764, + "grad_norm": 0.9993155002593994, + "learning_rate": 2.8586385613194657e-06, + "loss": 1.8058, + "mean_token_accuracy": 0.5872468948364258, + "num_tokens": 8167287410.0, + "step": 15977 + }, + { + "epoch": 4.320713899405084, + "grad_norm": 0.9076895713806152, + "learning_rate": 2.857966675284851e-06, + "loss": 1.791, + "mean_token_accuracy": 0.5926429629325867, + "num_tokens": 8167811647.0, + "step": 15978 + }, + { + "epoch": 4.320984315846403, + "grad_norm": 1.0120404958724976, + "learning_rate": 2.857295039071764e-06, + "loss": 1.8133, + "mean_token_accuracy": 0.5677850842475891, + "num_tokens": 8168313292.0, + "step": 15979 + }, + { + "epoch": 4.3212547322877235, + "grad_norm": 0.8976348042488098, + "learning_rate": 2.8566236527008094e-06, + "loss": 1.9023, + "mean_token_accuracy": 0.5664254426956177, + "num_tokens": 8168829618.0, + "step": 15980 + }, + { + "epoch": 4.321525148729043, + "grad_norm": 0.34466761350631714, + "learning_rate": 2.8559525161925915e-06, + "loss": 1.099, + "mean_token_accuracy": 0.704899251461029, + "num_tokens": 8169353701.0, + "step": 15981 + }, + { + "epoch": 4.321795565170362, + "grad_norm": 1.0961815118789673, + "learning_rate": 2.8552816295677e-06, + "loss": 1.9055, + "mean_token_accuracy": 0.5520955920219421, + "num_tokens": 8169877859.0, + "step": 15982 + }, + { + "epoch": 4.322065981611682, + "grad_norm": 1.2030493021011353, + "learning_rate": 2.8546109928467185e-06, + "loss": 1.8547, + "mean_token_accuracy": 0.5730593204498291, + "num_tokens": 8170398017.0, + "step": 15983 + }, + { + "epoch": 4.322336398053002, + "grad_norm": 0.876677393913269, + "learning_rate": 2.8539406060502274e-06, + "loss": 1.7127, + "mean_token_accuracy": 0.6210339069366455, + "num_tokens": 8170910994.0, + "step": 15984 + }, + { + "epoch": 4.322606814494321, + "grad_norm": 0.8943285942077637, + "learning_rate": 2.853270469198793e-06, + "loss": 1.7587, + "mean_token_accuracy": 0.5837361812591553, + "num_tokens": 8171415460.0, + "step": 15985 + }, + { + "epoch": 4.3228772309356405, + "grad_norm": 0.8710810542106628, + "learning_rate": 2.852600582312978e-06, + "loss": 1.8035, + "mean_token_accuracy": 0.5844444036483765, + "num_tokens": 8171939734.0, + "step": 15986 + }, + { + "epoch": 4.323147647376961, + "grad_norm": 1.0729725360870361, + "learning_rate": 2.85193094541334e-06, + "loss": 1.8149, + "mean_token_accuracy": 0.5846507549285889, + "num_tokens": 8172463876.0, + "step": 15987 + }, + { + "epoch": 4.32341806381828, + "grad_norm": 0.9586576223373413, + "learning_rate": 2.8512615585204194e-06, + "loss": 1.7434, + "mean_token_accuracy": 0.5868360996246338, + "num_tokens": 8172956690.0, + "step": 15988 + }, + { + "epoch": 4.3236884802596, + "grad_norm": 0.864210307598114, + "learning_rate": 2.850592421654761e-06, + "loss": 1.7235, + "mean_token_accuracy": 0.6247878074645996, + "num_tokens": 8173480946.0, + "step": 15989 + }, + { + "epoch": 4.323958896700919, + "grad_norm": 0.8186022639274597, + "learning_rate": 2.849923534836892e-06, + "loss": 1.8713, + "mean_token_accuracy": 0.5530902743339539, + "num_tokens": 8174005040.0, + "step": 15990 + }, + { + "epoch": 4.324229313142239, + "grad_norm": 0.9860795736312866, + "learning_rate": 2.849254898087335e-06, + "loss": 1.8792, + "mean_token_accuracy": 0.5829296112060547, + "num_tokens": 8174468208.0, + "step": 15991 + }, + { + "epoch": 4.324499729583558, + "grad_norm": 0.8768576383590698, + "learning_rate": 2.84858651142661e-06, + "loss": 1.7881, + "mean_token_accuracy": 0.603206992149353, + "num_tokens": 8174988086.0, + "step": 15992 + }, + { + "epoch": 4.324770146024878, + "grad_norm": 0.9352619647979736, + "learning_rate": 2.847918374875222e-06, + "loss": 1.8848, + "mean_token_accuracy": 0.5704877376556396, + "num_tokens": 8175512361.0, + "step": 15993 + }, + { + "epoch": 4.325040562466198, + "grad_norm": 0.8591110706329346, + "learning_rate": 2.847250488453671e-06, + "loss": 1.8173, + "mean_token_accuracy": 0.5774954557418823, + "num_tokens": 8176013263.0, + "step": 15994 + }, + { + "epoch": 4.325310978907518, + "grad_norm": 0.9094780683517456, + "learning_rate": 2.8465828521824514e-06, + "loss": 1.8095, + "mean_token_accuracy": 0.5745844841003418, + "num_tokens": 8176537473.0, + "step": 15995 + }, + { + "epoch": 4.325581395348837, + "grad_norm": 0.9756960868835449, + "learning_rate": 2.8459154660820476e-06, + "loss": 1.9882, + "mean_token_accuracy": 0.5345721244812012, + "num_tokens": 8177061573.0, + "step": 15996 + }, + { + "epoch": 4.325851811790157, + "grad_norm": 0.7991519570350647, + "learning_rate": 2.8452483301729354e-06, + "loss": 1.7823, + "mean_token_accuracy": 0.562896728515625, + "num_tokens": 8177585758.0, + "step": 15997 + }, + { + "epoch": 4.326122228231476, + "grad_norm": 0.8391000032424927, + "learning_rate": 2.844581444475588e-06, + "loss": 1.9181, + "mean_token_accuracy": 0.5450427532196045, + "num_tokens": 8178109802.0, + "step": 15998 + }, + { + "epoch": 4.326392644672796, + "grad_norm": 0.9716799855232239, + "learning_rate": 2.843914809010464e-06, + "loss": 1.8851, + "mean_token_accuracy": 0.5750865340232849, + "num_tokens": 8178576922.0, + "step": 15999 + }, + { + "epoch": 4.3266630611141155, + "grad_norm": 0.930889904499054, + "learning_rate": 2.8432484237980183e-06, + "loss": 1.9, + "mean_token_accuracy": 0.5740143060684204, + "num_tokens": 8179040938.0, + "step": 16000 + }, + { + "epoch": 4.326933477555436, + "grad_norm": 0.37588831782341003, + "learning_rate": 2.842582288858701e-06, + "loss": 1.0999, + "mean_token_accuracy": 0.7128127217292786, + "num_tokens": 8179502150.0, + "step": 16001 + }, + { + "epoch": 4.327203893996755, + "grad_norm": 0.9099023938179016, + "learning_rate": 2.8419164042129484e-06, + "loss": 1.9098, + "mean_token_accuracy": 0.5649194717407227, + "num_tokens": 8180026332.0, + "step": 16002 + }, + { + "epoch": 4.327474310438075, + "grad_norm": 0.8112642765045166, + "learning_rate": 2.8412507698811885e-06, + "loss": 1.7129, + "mean_token_accuracy": 0.596247136592865, + "num_tokens": 8180550428.0, + "step": 16003 + }, + { + "epoch": 4.327744726879394, + "grad_norm": 0.8403117656707764, + "learning_rate": 2.8405853858838516e-06, + "loss": 1.8018, + "mean_token_accuracy": 0.592352032661438, + "num_tokens": 8181050641.0, + "step": 16004 + }, + { + "epoch": 4.328015143320714, + "grad_norm": 0.7727798819541931, + "learning_rate": 2.83992025224135e-06, + "loss": 1.9459, + "mean_token_accuracy": 0.5521528720855713, + "num_tokens": 8181574818.0, + "step": 16005 + }, + { + "epoch": 4.328285559762033, + "grad_norm": 0.8421270847320557, + "learning_rate": 2.839255368974091e-06, + "loss": 1.8148, + "mean_token_accuracy": 0.5993345975875854, + "num_tokens": 8182098965.0, + "step": 16006 + }, + { + "epoch": 4.328555976203353, + "grad_norm": 0.8976472020149231, + "learning_rate": 2.838590736102478e-06, + "loss": 1.8499, + "mean_token_accuracy": 0.5647573471069336, + "num_tokens": 8182623089.0, + "step": 16007 + }, + { + "epoch": 4.328826392644673, + "grad_norm": 0.8704126477241516, + "learning_rate": 2.8379263536469e-06, + "loss": 2.0127, + "mean_token_accuracy": 0.5411213636398315, + "num_tokens": 8183147341.0, + "step": 16008 + }, + { + "epoch": 4.329096809085993, + "grad_norm": 0.9692211151123047, + "learning_rate": 2.8372622216277467e-06, + "loss": 1.8617, + "mean_token_accuracy": 0.5863865613937378, + "num_tokens": 8183609590.0, + "step": 16009 + }, + { + "epoch": 4.329367225527312, + "grad_norm": 0.8988884091377258, + "learning_rate": 2.836598340065394e-06, + "loss": 1.8771, + "mean_token_accuracy": 0.5617353916168213, + "num_tokens": 8184133859.0, + "step": 16010 + }, + { + "epoch": 4.329637641968632, + "grad_norm": 0.8981078863143921, + "learning_rate": 2.8359347089802093e-06, + "loss": 1.806, + "mean_token_accuracy": 0.5702958106994629, + "num_tokens": 8184658106.0, + "step": 16011 + }, + { + "epoch": 4.329908058409951, + "grad_norm": 0.7592588663101196, + "learning_rate": 2.835271328392558e-06, + "loss": 1.8386, + "mean_token_accuracy": 0.5783549547195435, + "num_tokens": 8185156621.0, + "step": 16012 + }, + { + "epoch": 4.330178474851271, + "grad_norm": 0.8827601671218872, + "learning_rate": 2.8346081983227914e-06, + "loss": 1.8064, + "mean_token_accuracy": 0.5575532913208008, + "num_tokens": 8185680862.0, + "step": 16013 + }, + { + "epoch": 4.3304488912925905, + "grad_norm": 1.0081636905670166, + "learning_rate": 2.8339453187912596e-06, + "loss": 1.725, + "mean_token_accuracy": 0.6049599647521973, + "num_tokens": 8186204937.0, + "step": 16014 + }, + { + "epoch": 4.330719307733911, + "grad_norm": 0.9142642617225647, + "learning_rate": 2.8332826898182984e-06, + "loss": 1.7831, + "mean_token_accuracy": 0.574368953704834, + "num_tokens": 8186729116.0, + "step": 16015 + }, + { + "epoch": 4.33098972417523, + "grad_norm": 0.8641999959945679, + "learning_rate": 2.832620311424244e-06, + "loss": 1.8295, + "mean_token_accuracy": 0.5863043069839478, + "num_tokens": 8187200694.0, + "step": 16016 + }, + { + "epoch": 4.33126014061655, + "grad_norm": 0.9635865688323975, + "learning_rate": 2.831958183629415e-06, + "loss": 1.8046, + "mean_token_accuracy": 0.5732780694961548, + "num_tokens": 8187630164.0, + "step": 16017 + }, + { + "epoch": 4.331530557057869, + "grad_norm": 0.8850816488265991, + "learning_rate": 2.8312963064541306e-06, + "loss": 1.7606, + "mean_token_accuracy": 0.5937728881835938, + "num_tokens": 8188129969.0, + "step": 16018 + }, + { + "epoch": 4.331800973499189, + "grad_norm": 1.0376447439193726, + "learning_rate": 2.830634679918699e-06, + "loss": 1.7537, + "mean_token_accuracy": 0.5858256816864014, + "num_tokens": 8188651882.0, + "step": 16019 + }, + { + "epoch": 4.332071389940508, + "grad_norm": 0.846886932849884, + "learning_rate": 2.829973304043418e-06, + "loss": 1.7719, + "mean_token_accuracy": 0.5600941777229309, + "num_tokens": 8189175969.0, + "step": 16020 + }, + { + "epoch": 4.332341806381828, + "grad_norm": 0.32617610692977905, + "learning_rate": 2.8293121788485835e-06, + "loss": 1.1173, + "mean_token_accuracy": 0.6887475252151489, + "num_tokens": 8189700178.0, + "step": 16021 + }, + { + "epoch": 4.332612222823148, + "grad_norm": 0.9295453429222107, + "learning_rate": 2.82865130435448e-06, + "loss": 1.7987, + "mean_token_accuracy": 0.5719199180603027, + "num_tokens": 8190224414.0, + "step": 16022 + }, + { + "epoch": 4.332882639264467, + "grad_norm": 1.056174635887146, + "learning_rate": 2.8279906805813834e-06, + "loss": 1.8986, + "mean_token_accuracy": 0.5670343637466431, + "num_tokens": 8190748690.0, + "step": 16023 + }, + { + "epoch": 4.333153055705787, + "grad_norm": 0.8794177770614624, + "learning_rate": 2.827330307549567e-06, + "loss": 1.7258, + "mean_token_accuracy": 0.6002072691917419, + "num_tokens": 8191272970.0, + "step": 16024 + }, + { + "epoch": 4.333423472147107, + "grad_norm": 0.7941747307777405, + "learning_rate": 2.826670185279291e-06, + "loss": 1.9439, + "mean_token_accuracy": 0.537837028503418, + "num_tokens": 8191797088.0, + "step": 16025 + }, + { + "epoch": 4.333693888588426, + "grad_norm": 0.8709018230438232, + "learning_rate": 2.8260103137908075e-06, + "loss": 1.9034, + "mean_token_accuracy": 0.5691341161727905, + "num_tokens": 8192321298.0, + "step": 16026 + }, + { + "epoch": 4.333964305029745, + "grad_norm": 1.0328096151351929, + "learning_rate": 2.8253506931043666e-06, + "loss": 1.8323, + "mean_token_accuracy": 0.5791358947753906, + "num_tokens": 8192833740.0, + "step": 16027 + }, + { + "epoch": 4.3342347214710655, + "grad_norm": 1.0104155540466309, + "learning_rate": 2.8246913232402077e-06, + "loss": 1.7582, + "mean_token_accuracy": 0.5990744829177856, + "num_tokens": 8193357919.0, + "step": 16028 + }, + { + "epoch": 4.334505137912385, + "grad_norm": 0.7954427599906921, + "learning_rate": 2.82403220421856e-06, + "loss": 1.85, + "mean_token_accuracy": 0.5816067457199097, + "num_tokens": 8193882192.0, + "step": 16029 + }, + { + "epoch": 4.334775554353705, + "grad_norm": 0.9613745212554932, + "learning_rate": 2.8233733360596495e-06, + "loss": 1.8689, + "mean_token_accuracy": 0.5561486482620239, + "num_tokens": 8194406380.0, + "step": 16030 + }, + { + "epoch": 4.335045970795024, + "grad_norm": 0.807401716709137, + "learning_rate": 2.8227147187836883e-06, + "loss": 1.7686, + "mean_token_accuracy": 0.5995386242866516, + "num_tokens": 8194920650.0, + "step": 16031 + }, + { + "epoch": 4.335316387236344, + "grad_norm": 0.9152147173881531, + "learning_rate": 2.8220563524108903e-06, + "loss": 1.8337, + "mean_token_accuracy": 0.5906398892402649, + "num_tokens": 8195444925.0, + "step": 16032 + }, + { + "epoch": 4.335586803677663, + "grad_norm": 0.8456562757492065, + "learning_rate": 2.821398236961453e-06, + "loss": 1.8086, + "mean_token_accuracy": 0.5879402160644531, + "num_tokens": 8195969166.0, + "step": 16033 + }, + { + "epoch": 4.335857220118983, + "grad_norm": 0.766749918460846, + "learning_rate": 2.8207403724555672e-06, + "loss": 1.7179, + "mean_token_accuracy": 0.6139752864837646, + "num_tokens": 8196428909.0, + "step": 16034 + }, + { + "epoch": 4.3361276365603025, + "grad_norm": 0.7783914804458618, + "learning_rate": 2.820082758913423e-06, + "loss": 1.7264, + "mean_token_accuracy": 0.5831999778747559, + "num_tokens": 8196953172.0, + "step": 16035 + }, + { + "epoch": 4.336398053001623, + "grad_norm": 0.8000797629356384, + "learning_rate": 2.8194253963551942e-06, + "loss": 1.9137, + "mean_token_accuracy": 0.550053596496582, + "num_tokens": 8197477435.0, + "step": 16036 + }, + { + "epoch": 4.336668469442942, + "grad_norm": 1.0360780954360962, + "learning_rate": 2.8187682848010507e-06, + "loss": 1.8391, + "mean_token_accuracy": 0.5925942659378052, + "num_tokens": 8197967512.0, + "step": 16037 + }, + { + "epoch": 4.336938885884262, + "grad_norm": 1.0200902223587036, + "learning_rate": 2.818111424271157e-06, + "loss": 1.8668, + "mean_token_accuracy": 0.569844126701355, + "num_tokens": 8198491747.0, + "step": 16038 + }, + { + "epoch": 4.337209302325581, + "grad_norm": 0.7873961925506592, + "learning_rate": 2.817454814785666e-06, + "loss": 1.7583, + "mean_token_accuracy": 0.5970602035522461, + "num_tokens": 8198973304.0, + "step": 16039 + }, + { + "epoch": 4.337479718766901, + "grad_norm": 0.8538610339164734, + "learning_rate": 2.8167984563647245e-06, + "loss": 1.7889, + "mean_token_accuracy": 0.5926399230957031, + "num_tokens": 8199497501.0, + "step": 16040 + }, + { + "epoch": 4.33775013520822, + "grad_norm": 0.3286479711532593, + "learning_rate": 2.816142349028471e-06, + "loss": 1.0269, + "mean_token_accuracy": 0.721780002117157, + "num_tokens": 8200021773.0, + "step": 16041 + }, + { + "epoch": 4.3380205516495405, + "grad_norm": 0.9959648251533508, + "learning_rate": 2.815486492797037e-06, + "loss": 1.7758, + "mean_token_accuracy": 0.5907707810401917, + "num_tokens": 8200546050.0, + "step": 16042 + }, + { + "epoch": 4.33829096809086, + "grad_norm": 1.080350637435913, + "learning_rate": 2.8148308876905466e-06, + "loss": 1.7987, + "mean_token_accuracy": 0.5924540758132935, + "num_tokens": 8201070137.0, + "step": 16043 + }, + { + "epoch": 4.33856138453218, + "grad_norm": 0.8271778225898743, + "learning_rate": 2.8141755337291177e-06, + "loss": 1.783, + "mean_token_accuracy": 0.5843855142593384, + "num_tokens": 8201547205.0, + "step": 16044 + }, + { + "epoch": 4.338831800973499, + "grad_norm": 0.769157886505127, + "learning_rate": 2.8135204309328544e-06, + "loss": 1.7668, + "mean_token_accuracy": 0.5965374112129211, + "num_tokens": 8202071475.0, + "step": 16045 + }, + { + "epoch": 4.339102217414819, + "grad_norm": 0.8917599320411682, + "learning_rate": 2.8128655793218594e-06, + "loss": 1.8092, + "mean_token_accuracy": 0.5765557289123535, + "num_tokens": 8202595742.0, + "step": 16046 + }, + { + "epoch": 4.339372633856138, + "grad_norm": 0.7834969162940979, + "learning_rate": 2.8122109789162255e-06, + "loss": 1.8315, + "mean_token_accuracy": 0.5916557312011719, + "num_tokens": 8203119851.0, + "step": 16047 + }, + { + "epoch": 4.339643050297458, + "grad_norm": 0.8729854822158813, + "learning_rate": 2.8115566297360386e-06, + "loss": 1.8192, + "mean_token_accuracy": 0.5741355419158936, + "num_tokens": 8203633765.0, + "step": 16048 + }, + { + "epoch": 4.3399134667387775, + "grad_norm": 0.8532403111457825, + "learning_rate": 2.810902531801373e-06, + "loss": 1.8889, + "mean_token_accuracy": 0.557221531867981, + "num_tokens": 8204157981.0, + "step": 16049 + }, + { + "epoch": 4.340183883180098, + "grad_norm": 0.8651081323623657, + "learning_rate": 2.810248685132302e-06, + "loss": 1.7491, + "mean_token_accuracy": 0.5800737142562866, + "num_tokens": 8204682256.0, + "step": 16050 + }, + { + "epoch": 4.340454299621417, + "grad_norm": 0.91588294506073, + "learning_rate": 2.8095950897488856e-06, + "loss": 1.8461, + "mean_token_accuracy": 0.5754451155662537, + "num_tokens": 8205150727.0, + "step": 16051 + }, + { + "epoch": 4.340724716062737, + "grad_norm": 0.8948493003845215, + "learning_rate": 2.8089417456711767e-06, + "loss": 1.8978, + "mean_token_accuracy": 0.5597522258758545, + "num_tokens": 8205674996.0, + "step": 16052 + }, + { + "epoch": 4.340995132504056, + "grad_norm": 0.7564201951026917, + "learning_rate": 2.808288652919225e-06, + "loss": 1.6262, + "mean_token_accuracy": 0.6288061141967773, + "num_tokens": 8206199140.0, + "step": 16053 + }, + { + "epoch": 4.341265548945376, + "grad_norm": 0.8904353380203247, + "learning_rate": 2.8076358115130652e-06, + "loss": 1.9274, + "mean_token_accuracy": 0.5638332366943359, + "num_tokens": 8206712142.0, + "step": 16054 + }, + { + "epoch": 4.341535965386695, + "grad_norm": 0.9026094079017639, + "learning_rate": 2.8069832214727337e-06, + "loss": 1.7449, + "mean_token_accuracy": 0.5864015817642212, + "num_tokens": 8207236374.0, + "step": 16055 + }, + { + "epoch": 4.3418063818280155, + "grad_norm": 0.8151059746742249, + "learning_rate": 2.8063308828182496e-06, + "loss": 1.8309, + "mean_token_accuracy": 0.5513622164726257, + "num_tokens": 8207755951.0, + "step": 16056 + }, + { + "epoch": 4.342076798269335, + "grad_norm": 0.9791679382324219, + "learning_rate": 2.8056787955696296e-06, + "loss": 1.7904, + "mean_token_accuracy": 0.5762001276016235, + "num_tokens": 8208280199.0, + "step": 16057 + }, + { + "epoch": 4.342347214710655, + "grad_norm": 0.9699086546897888, + "learning_rate": 2.805026959746883e-06, + "loss": 1.6542, + "mean_token_accuracy": 0.6164657473564148, + "num_tokens": 8208804365.0, + "step": 16058 + }, + { + "epoch": 4.342617631151974, + "grad_norm": 1.0088512897491455, + "learning_rate": 2.8043753753700093e-06, + "loss": 1.8672, + "mean_token_accuracy": 0.592728853225708, + "num_tokens": 8209328622.0, + "step": 16059 + }, + { + "epoch": 4.342888047593294, + "grad_norm": 0.9526670575141907, + "learning_rate": 2.8037240424590005e-06, + "loss": 1.8636, + "mean_token_accuracy": 0.5787362456321716, + "num_tokens": 8209804301.0, + "step": 16060 + }, + { + "epoch": 4.343158464034613, + "grad_norm": 0.35366755723953247, + "learning_rate": 2.803072961033842e-06, + "loss": 0.9954, + "mean_token_accuracy": 0.7154532074928284, + "num_tokens": 8210328468.0, + "step": 16061 + }, + { + "epoch": 4.343428880475933, + "grad_norm": 0.9628905653953552, + "learning_rate": 2.802422131114511e-06, + "loss": 1.839, + "mean_token_accuracy": 0.583166778087616, + "num_tokens": 8210852648.0, + "step": 16062 + }, + { + "epoch": 4.3436992969172525, + "grad_norm": 1.0873862504959106, + "learning_rate": 2.8017715527209747e-06, + "loss": 1.8721, + "mean_token_accuracy": 0.573217511177063, + "num_tokens": 8211376831.0, + "step": 16063 + }, + { + "epoch": 4.343969713358572, + "grad_norm": 0.9296965003013611, + "learning_rate": 2.801121225873199e-06, + "loss": 1.5559, + "mean_token_accuracy": 0.6243347525596619, + "num_tokens": 8211900964.0, + "step": 16064 + }, + { + "epoch": 4.344240129799892, + "grad_norm": 0.9084265232086182, + "learning_rate": 2.8004711505911353e-06, + "loss": 1.7885, + "mean_token_accuracy": 0.5789210796356201, + "num_tokens": 8212373425.0, + "step": 16065 + }, + { + "epoch": 4.344510546241212, + "grad_norm": 0.799719512462616, + "learning_rate": 2.799821326894727e-06, + "loss": 1.7547, + "mean_token_accuracy": 0.5932798385620117, + "num_tokens": 8212866695.0, + "step": 16066 + }, + { + "epoch": 4.344780962682531, + "grad_norm": 0.9064217209815979, + "learning_rate": 2.7991717548039194e-06, + "loss": 1.9534, + "mean_token_accuracy": 0.5439364910125732, + "num_tokens": 8213390971.0, + "step": 16067 + }, + { + "epoch": 4.34505137912385, + "grad_norm": 0.9580265283584595, + "learning_rate": 2.7985224343386377e-06, + "loss": 2.0385, + "mean_token_accuracy": 0.532539963722229, + "num_tokens": 8213878357.0, + "step": 16068 + }, + { + "epoch": 4.34532179556517, + "grad_norm": 0.9160981774330139, + "learning_rate": 2.797873365518805e-06, + "loss": 1.7979, + "mean_token_accuracy": 0.5806835889816284, + "num_tokens": 8214363936.0, + "step": 16069 + }, + { + "epoch": 4.34559221200649, + "grad_norm": 1.0365560054779053, + "learning_rate": 2.7972245483643396e-06, + "loss": 1.8339, + "mean_token_accuracy": 0.5553107261657715, + "num_tokens": 8214888187.0, + "step": 16070 + }, + { + "epoch": 4.34586262844781, + "grad_norm": 1.0657960176467896, + "learning_rate": 2.796575982895148e-06, + "loss": 1.8333, + "mean_token_accuracy": 0.6058981418609619, + "num_tokens": 8215409777.0, + "step": 16071 + }, + { + "epoch": 4.346133044889129, + "grad_norm": 0.8381242156028748, + "learning_rate": 2.7959276691311297e-06, + "loss": 1.8285, + "mean_token_accuracy": 0.573180079460144, + "num_tokens": 8215933960.0, + "step": 16072 + }, + { + "epoch": 4.346403461330449, + "grad_norm": 0.8963270783424377, + "learning_rate": 2.7952796070921783e-06, + "loss": 1.7932, + "mean_token_accuracy": 0.57355797290802, + "num_tokens": 8216458179.0, + "step": 16073 + }, + { + "epoch": 4.346673877771768, + "grad_norm": 0.9569092988967896, + "learning_rate": 2.7946317967981765e-06, + "loss": 1.7378, + "mean_token_accuracy": 0.6072957515716553, + "num_tokens": 8216934492.0, + "step": 16074 + }, + { + "epoch": 4.346944294213088, + "grad_norm": 1.0953580141067505, + "learning_rate": 2.7939842382690008e-06, + "loss": 1.8705, + "mean_token_accuracy": 0.5625751614570618, + "num_tokens": 8217458726.0, + "step": 16075 + }, + { + "epoch": 4.3472147106544075, + "grad_norm": 0.9516634941101074, + "learning_rate": 2.7933369315245224e-06, + "loss": 1.7405, + "mean_token_accuracy": 0.5849359035491943, + "num_tokens": 8217982921.0, + "step": 16076 + }, + { + "epoch": 4.3474851270957275, + "grad_norm": 0.917749285697937, + "learning_rate": 2.792689876584599e-06, + "loss": 1.7066, + "mean_token_accuracy": 0.6050872206687927, + "num_tokens": 8218506980.0, + "step": 16077 + }, + { + "epoch": 4.347755543537047, + "grad_norm": 0.8634536862373352, + "learning_rate": 2.7920430734690873e-06, + "loss": 1.878, + "mean_token_accuracy": 0.5805416107177734, + "num_tokens": 8219031135.0, + "step": 16078 + }, + { + "epoch": 4.348025959978367, + "grad_norm": 0.8197638988494873, + "learning_rate": 2.791396522197834e-06, + "loss": 1.8282, + "mean_token_accuracy": 0.5799355506896973, + "num_tokens": 8219516118.0, + "step": 16079 + }, + { + "epoch": 4.348296376419686, + "grad_norm": 0.7867786288261414, + "learning_rate": 2.790750222790673e-06, + "loss": 1.8028, + "mean_token_accuracy": 0.5803393721580505, + "num_tokens": 8220003196.0, + "step": 16080 + }, + { + "epoch": 4.348566792861006, + "grad_norm": 0.36883115768432617, + "learning_rate": 2.7901041752674373e-06, + "loss": 1.0977, + "mean_token_accuracy": 0.7078191041946411, + "num_tokens": 8220483085.0, + "step": 16081 + }, + { + "epoch": 4.348837209302325, + "grad_norm": 0.8570137619972229, + "learning_rate": 2.7894583796479506e-06, + "loss": 1.9274, + "mean_token_accuracy": 0.5692395567893982, + "num_tokens": 8221007190.0, + "step": 16082 + }, + { + "epoch": 4.349107625743645, + "grad_norm": 0.8749241232872009, + "learning_rate": 2.7888128359520243e-06, + "loss": 1.8323, + "mean_token_accuracy": 0.5595993399620056, + "num_tokens": 8221531349.0, + "step": 16083 + }, + { + "epoch": 4.349378042184965, + "grad_norm": 0.6812179684638977, + "learning_rate": 2.7881675441994697e-06, + "loss": 1.8414, + "mean_token_accuracy": 0.5498822331428528, + "num_tokens": 8222055626.0, + "step": 16084 + }, + { + "epoch": 4.349648458626285, + "grad_norm": 0.7860147356987, + "learning_rate": 2.787522504410083e-06, + "loss": 1.4924, + "mean_token_accuracy": 0.6337879300117493, + "num_tokens": 8222551915.0, + "step": 16085 + }, + { + "epoch": 4.349918875067604, + "grad_norm": 0.8102852702140808, + "learning_rate": 2.7868777166036567e-06, + "loss": 1.8619, + "mean_token_accuracy": 0.5701864361763, + "num_tokens": 8223076004.0, + "step": 16086 + }, + { + "epoch": 4.350189291508924, + "grad_norm": 0.9140382409095764, + "learning_rate": 2.7862331807999766e-06, + "loss": 1.8147, + "mean_token_accuracy": 0.5974428653717041, + "num_tokens": 8223554142.0, + "step": 16087 + }, + { + "epoch": 4.350459707950243, + "grad_norm": 0.8710496425628662, + "learning_rate": 2.785588897018819e-06, + "loss": 1.7218, + "mean_token_accuracy": 0.5822428464889526, + "num_tokens": 8224078282.0, + "step": 16088 + }, + { + "epoch": 4.350730124391563, + "grad_norm": 0.8074767589569092, + "learning_rate": 2.7849448652799476e-06, + "loss": 1.9, + "mean_token_accuracy": 0.5780975818634033, + "num_tokens": 8224567049.0, + "step": 16089 + }, + { + "epoch": 4.3510005408328825, + "grad_norm": 0.9296610355377197, + "learning_rate": 2.7843010856031287e-06, + "loss": 1.827, + "mean_token_accuracy": 0.5901191234588623, + "num_tokens": 8225091169.0, + "step": 16090 + }, + { + "epoch": 4.3512709572742025, + "grad_norm": 0.7529512047767639, + "learning_rate": 2.783657558008114e-06, + "loss": 1.7659, + "mean_token_accuracy": 0.5944826006889343, + "num_tokens": 8225615410.0, + "step": 16091 + }, + { + "epoch": 4.351541373715522, + "grad_norm": 0.8232950568199158, + "learning_rate": 2.783014282514646e-06, + "loss": 1.7481, + "mean_token_accuracy": 0.5769374370574951, + "num_tokens": 8226104244.0, + "step": 16092 + }, + { + "epoch": 4.351811790156842, + "grad_norm": 0.7520079612731934, + "learning_rate": 2.782371259142466e-06, + "loss": 1.8435, + "mean_token_accuracy": 0.5774582624435425, + "num_tokens": 8226628396.0, + "step": 16093 + }, + { + "epoch": 4.352082206598161, + "grad_norm": 0.8635165691375732, + "learning_rate": 2.7817284879113025e-06, + "loss": 1.785, + "mean_token_accuracy": 0.6028397083282471, + "num_tokens": 8227152653.0, + "step": 16094 + }, + { + "epoch": 4.352352623039481, + "grad_norm": 0.8573206067085266, + "learning_rate": 2.7810859688408757e-06, + "loss": 1.6642, + "mean_token_accuracy": 0.5890675783157349, + "num_tokens": 8227676783.0, + "step": 16095 + }, + { + "epoch": 4.3526230394808, + "grad_norm": 0.8027945160865784, + "learning_rate": 2.7804437019509034e-06, + "loss": 1.8374, + "mean_token_accuracy": 0.5788981914520264, + "num_tokens": 8228197392.0, + "step": 16096 + }, + { + "epoch": 4.35289345592212, + "grad_norm": 0.7636209726333618, + "learning_rate": 2.7798016872610877e-06, + "loss": 1.7807, + "mean_token_accuracy": 0.5928203463554382, + "num_tokens": 8228671284.0, + "step": 16097 + }, + { + "epoch": 4.35316387236344, + "grad_norm": 0.8800970911979675, + "learning_rate": 2.7791599247911338e-06, + "loss": 1.8474, + "mean_token_accuracy": 0.5727123022079468, + "num_tokens": 8229195438.0, + "step": 16098 + }, + { + "epoch": 4.35343428880476, + "grad_norm": 0.8264820575714111, + "learning_rate": 2.778518414560726e-06, + "loss": 1.952, + "mean_token_accuracy": 0.5643289089202881, + "num_tokens": 8229664149.0, + "step": 16099 + }, + { + "epoch": 4.353704705246079, + "grad_norm": 0.9082998633384705, + "learning_rate": 2.7778771565895513e-06, + "loss": 1.8049, + "mean_token_accuracy": 0.5810528993606567, + "num_tokens": 8230188248.0, + "step": 16100 + }, + { + "epoch": 4.353975121687399, + "grad_norm": 0.37616297602653503, + "learning_rate": 2.7772361508972877e-06, + "loss": 1.0728, + "mean_token_accuracy": 0.7139447927474976, + "num_tokens": 8230637507.0, + "step": 16101 + }, + { + "epoch": 4.354245538128718, + "grad_norm": 0.9884278178215027, + "learning_rate": 2.7765953975036004e-06, + "loss": 1.8925, + "mean_token_accuracy": 0.5759478807449341, + "num_tokens": 8231110461.0, + "step": 16102 + }, + { + "epoch": 4.354515954570038, + "grad_norm": 0.8540992140769958, + "learning_rate": 2.775954896428148e-06, + "loss": 1.7466, + "mean_token_accuracy": 0.5936874151229858, + "num_tokens": 8231617101.0, + "step": 16103 + }, + { + "epoch": 4.3547863710113575, + "grad_norm": 0.7933434247970581, + "learning_rate": 2.775314647690585e-06, + "loss": 1.7224, + "mean_token_accuracy": 0.5949145555496216, + "num_tokens": 8232141294.0, + "step": 16104 + }, + { + "epoch": 4.355056787452677, + "grad_norm": 0.9325520396232605, + "learning_rate": 2.774674651310557e-06, + "loss": 1.9241, + "mean_token_accuracy": 0.5569356679916382, + "num_tokens": 8232665481.0, + "step": 16105 + }, + { + "epoch": 4.355327203893997, + "grad_norm": 0.8514291048049927, + "learning_rate": 2.7740349073076984e-06, + "loss": 1.8352, + "mean_token_accuracy": 0.5838132500648499, + "num_tokens": 8233135441.0, + "step": 16106 + }, + { + "epoch": 4.355597620335317, + "grad_norm": 0.8928138613700867, + "learning_rate": 2.7733954157016414e-06, + "loss": 1.892, + "mean_token_accuracy": 0.5517722368240356, + "num_tokens": 8233659572.0, + "step": 16107 + }, + { + "epoch": 4.355868036776636, + "grad_norm": 0.8525038957595825, + "learning_rate": 2.772756176512006e-06, + "loss": 1.7839, + "mean_token_accuracy": 0.5713181495666504, + "num_tokens": 8234183718.0, + "step": 16108 + }, + { + "epoch": 4.356138453217955, + "grad_norm": 0.9235404133796692, + "learning_rate": 2.7721171897584033e-06, + "loss": 1.9338, + "mean_token_accuracy": 0.552336573600769, + "num_tokens": 8234707894.0, + "step": 16109 + }, + { + "epoch": 4.356408869659275, + "grad_norm": 0.9030181765556335, + "learning_rate": 2.7714784554604445e-06, + "loss": 1.7569, + "mean_token_accuracy": 0.572447657585144, + "num_tokens": 8235232014.0, + "step": 16110 + }, + { + "epoch": 4.3566792861005945, + "grad_norm": 0.8376818895339966, + "learning_rate": 2.770839973637725e-06, + "loss": 1.6705, + "mean_token_accuracy": 0.6223565936088562, + "num_tokens": 8235745847.0, + "step": 16111 + }, + { + "epoch": 4.356949702541915, + "grad_norm": 0.8947389125823975, + "learning_rate": 2.770201744309834e-06, + "loss": 1.8563, + "mean_token_accuracy": 0.5537710785865784, + "num_tokens": 8236269979.0, + "step": 16112 + }, + { + "epoch": 4.357220118983234, + "grad_norm": 0.9032676815986633, + "learning_rate": 2.769563767496355e-06, + "loss": 1.7845, + "mean_token_accuracy": 0.5668647885322571, + "num_tokens": 8236794123.0, + "step": 16113 + }, + { + "epoch": 4.357490535424554, + "grad_norm": 0.8650619387626648, + "learning_rate": 2.768926043216866e-06, + "loss": 1.9303, + "mean_token_accuracy": 0.5592959523200989, + "num_tokens": 8237318382.0, + "step": 16114 + }, + { + "epoch": 4.357760951865873, + "grad_norm": 0.795681893825531, + "learning_rate": 2.7682885714909297e-06, + "loss": 1.8101, + "mean_token_accuracy": 0.5645918846130371, + "num_tokens": 8237842511.0, + "step": 16115 + }, + { + "epoch": 4.358031368307193, + "grad_norm": 0.9146254062652588, + "learning_rate": 2.76765135233811e-06, + "loss": 1.8838, + "mean_token_accuracy": 0.5740859508514404, + "num_tokens": 8238309225.0, + "step": 16116 + }, + { + "epoch": 4.358301784748512, + "grad_norm": 0.9430210590362549, + "learning_rate": 2.7670143857779565e-06, + "loss": 1.846, + "mean_token_accuracy": 0.5706013441085815, + "num_tokens": 8238833457.0, + "step": 16117 + }, + { + "epoch": 4.3585722011898325, + "grad_norm": 0.9239507913589478, + "learning_rate": 2.7663776718300113e-06, + "loss": 1.8154, + "mean_token_accuracy": 0.5668689012527466, + "num_tokens": 8239357682.0, + "step": 16118 + }, + { + "epoch": 4.358842617631152, + "grad_norm": 0.8512819409370422, + "learning_rate": 2.7657412105138147e-06, + "loss": 1.813, + "mean_token_accuracy": 0.5948513150215149, + "num_tokens": 8239881961.0, + "step": 16119 + }, + { + "epoch": 4.359113034072472, + "grad_norm": 0.8184904456138611, + "learning_rate": 2.765105001848894e-06, + "loss": 1.8359, + "mean_token_accuracy": 0.5851666927337646, + "num_tokens": 8240406241.0, + "step": 16120 + }, + { + "epoch": 4.359383450513791, + "grad_norm": 0.337884783744812, + "learning_rate": 2.7644690458547657e-06, + "loss": 1.1386, + "mean_token_accuracy": 0.6978172063827515, + "num_tokens": 8240930520.0, + "step": 16121 + }, + { + "epoch": 4.359653866955111, + "grad_norm": 0.8419917225837708, + "learning_rate": 2.7638333425509485e-06, + "loss": 1.8812, + "mean_token_accuracy": 0.556163489818573, + "num_tokens": 8241454781.0, + "step": 16122 + }, + { + "epoch": 4.35992428339643, + "grad_norm": 1.0214210748672485, + "learning_rate": 2.763197891956944e-06, + "loss": 1.8635, + "mean_token_accuracy": 0.5730794668197632, + "num_tokens": 8241979022.0, + "step": 16123 + }, + { + "epoch": 4.36019469983775, + "grad_norm": 0.8963714241981506, + "learning_rate": 2.7625626940922525e-06, + "loss": 1.8805, + "mean_token_accuracy": 0.5656747817993164, + "num_tokens": 8242503234.0, + "step": 16124 + }, + { + "epoch": 4.3604651162790695, + "grad_norm": 1.0751895904541016, + "learning_rate": 2.7619277489763622e-06, + "loss": 1.8943, + "mean_token_accuracy": 0.5692653656005859, + "num_tokens": 8243027520.0, + "step": 16125 + }, + { + "epoch": 4.36073553272039, + "grad_norm": 0.987659215927124, + "learning_rate": 2.7612930566287538e-06, + "loss": 1.7997, + "mean_token_accuracy": 0.5693811178207397, + "num_tokens": 8243551665.0, + "step": 16126 + }, + { + "epoch": 4.361005949161709, + "grad_norm": 1.0277787446975708, + "learning_rate": 2.760658617068903e-06, + "loss": 1.7136, + "mean_token_accuracy": 0.583045482635498, + "num_tokens": 8244075672.0, + "step": 16127 + }, + { + "epoch": 4.361276365603029, + "grad_norm": 0.8137012720108032, + "learning_rate": 2.760024430316277e-06, + "loss": 1.8346, + "mean_token_accuracy": 0.5717888474464417, + "num_tokens": 8244599935.0, + "step": 16128 + }, + { + "epoch": 4.361546782044348, + "grad_norm": 0.9627756476402283, + "learning_rate": 2.7593904963903327e-06, + "loss": 1.8132, + "mean_token_accuracy": 0.5936533212661743, + "num_tokens": 8245112857.0, + "step": 16129 + }, + { + "epoch": 4.361817198485668, + "grad_norm": 0.9617366194725037, + "learning_rate": 2.758756815310524e-06, + "loss": 1.8536, + "mean_token_accuracy": 0.5760495662689209, + "num_tokens": 8245637041.0, + "step": 16130 + }, + { + "epoch": 4.362087614926987, + "grad_norm": 1.0066049098968506, + "learning_rate": 2.7581233870962913e-06, + "loss": 1.8498, + "mean_token_accuracy": 0.5777840614318848, + "num_tokens": 8246161312.0, + "step": 16131 + }, + { + "epoch": 4.3623580313683075, + "grad_norm": 1.055069088935852, + "learning_rate": 2.7574902117670697e-06, + "loss": 1.7743, + "mean_token_accuracy": 0.5683059692382812, + "num_tokens": 8246685564.0, + "step": 16132 + }, + { + "epoch": 4.362628447809627, + "grad_norm": 0.9052609205245972, + "learning_rate": 2.7568572893422896e-06, + "loss": 1.7922, + "mean_token_accuracy": 0.5818862318992615, + "num_tokens": 8247209574.0, + "step": 16133 + }, + { + "epoch": 4.362898864250947, + "grad_norm": 0.8952510952949524, + "learning_rate": 2.756224619841369e-06, + "loss": 1.8262, + "mean_token_accuracy": 0.5759910345077515, + "num_tokens": 8247679316.0, + "step": 16134 + }, + { + "epoch": 4.363169280692266, + "grad_norm": 0.8935516476631165, + "learning_rate": 2.75559220328372e-06, + "loss": 1.8403, + "mean_token_accuracy": 0.5852410793304443, + "num_tokens": 8248161243.0, + "step": 16135 + }, + { + "epoch": 4.363439697133586, + "grad_norm": 0.8390777111053467, + "learning_rate": 2.7549600396887478e-06, + "loss": 1.8454, + "mean_token_accuracy": 0.5595420598983765, + "num_tokens": 8248685514.0, + "step": 16136 + }, + { + "epoch": 4.363710113574905, + "grad_norm": 0.8566527962684631, + "learning_rate": 2.7543281290758483e-06, + "loss": 1.8523, + "mean_token_accuracy": 0.5645596981048584, + "num_tokens": 8249209715.0, + "step": 16137 + }, + { + "epoch": 4.363980530016225, + "grad_norm": 0.8918218612670898, + "learning_rate": 2.7536964714644104e-06, + "loss": 1.8195, + "mean_token_accuracy": 0.5781897902488708, + "num_tokens": 8249733916.0, + "step": 16138 + }, + { + "epoch": 4.3642509464575445, + "grad_norm": 0.8818287253379822, + "learning_rate": 2.753065066873816e-06, + "loss": 1.9227, + "mean_token_accuracy": 0.5460923314094543, + "num_tokens": 8250258170.0, + "step": 16139 + }, + { + "epoch": 4.364521362898865, + "grad_norm": 0.782167911529541, + "learning_rate": 2.752433915323437e-06, + "loss": 1.8205, + "mean_token_accuracy": 0.5745985507965088, + "num_tokens": 8250782373.0, + "step": 16140 + }, + { + "epoch": 4.364791779340184, + "grad_norm": 0.3141710162162781, + "learning_rate": 2.751803016832642e-06, + "loss": 1.1046, + "mean_token_accuracy": 0.6973950266838074, + "num_tokens": 8251306651.0, + "step": 16141 + }, + { + "epoch": 4.365062195781504, + "grad_norm": 0.9024785757064819, + "learning_rate": 2.751172371420785e-06, + "loss": 1.8329, + "mean_token_accuracy": 0.5711091756820679, + "num_tokens": 8251830891.0, + "step": 16142 + }, + { + "epoch": 4.365332612222823, + "grad_norm": 0.9831063151359558, + "learning_rate": 2.7505419791072196e-06, + "loss": 1.8602, + "mean_token_accuracy": 0.572954535484314, + "num_tokens": 8252355049.0, + "step": 16143 + }, + { + "epoch": 4.365603028664143, + "grad_norm": 0.807514488697052, + "learning_rate": 2.7499118399112846e-06, + "loss": 1.8396, + "mean_token_accuracy": 0.5606766939163208, + "num_tokens": 8252879217.0, + "step": 16144 + }, + { + "epoch": 4.365873445105462, + "grad_norm": 0.8667873740196228, + "learning_rate": 2.749281953852317e-06, + "loss": 1.8945, + "mean_token_accuracy": 0.5638935565948486, + "num_tokens": 8253403397.0, + "step": 16145 + }, + { + "epoch": 4.366143861546782, + "grad_norm": 0.8179121613502502, + "learning_rate": 2.7486523209496424e-06, + "loss": 1.8759, + "mean_token_accuracy": 0.5626024007797241, + "num_tokens": 8253927672.0, + "step": 16146 + }, + { + "epoch": 4.366414277988102, + "grad_norm": 0.9648574590682983, + "learning_rate": 2.748022941222581e-06, + "loss": 1.8747, + "mean_token_accuracy": 0.5768553018569946, + "num_tokens": 8254451856.0, + "step": 16147 + }, + { + "epoch": 4.366684694429422, + "grad_norm": 0.950379490852356, + "learning_rate": 2.7473938146904445e-06, + "loss": 1.816, + "mean_token_accuracy": 0.5884586572647095, + "num_tokens": 8254931555.0, + "step": 16148 + }, + { + "epoch": 4.366955110870741, + "grad_norm": 0.7972749471664429, + "learning_rate": 2.7467649413725318e-06, + "loss": 1.8169, + "mean_token_accuracy": 0.5755232572555542, + "num_tokens": 8255404092.0, + "step": 16149 + }, + { + "epoch": 4.36722552731206, + "grad_norm": 0.8341914415359497, + "learning_rate": 2.7461363212881443e-06, + "loss": 1.7005, + "mean_token_accuracy": 0.6362203359603882, + "num_tokens": 8255864128.0, + "step": 16150 + }, + { + "epoch": 4.36749594375338, + "grad_norm": 0.9067995548248291, + "learning_rate": 2.7455079544565678e-06, + "loss": 1.8412, + "mean_token_accuracy": 0.5710910558700562, + "num_tokens": 8256388347.0, + "step": 16151 + }, + { + "epoch": 4.367766360194699, + "grad_norm": 0.8877078890800476, + "learning_rate": 2.74487984089708e-06, + "loss": 1.6763, + "mean_token_accuracy": 0.602033793926239, + "num_tokens": 8256863218.0, + "step": 16152 + }, + { + "epoch": 4.3680367766360195, + "grad_norm": 0.9178370237350464, + "learning_rate": 2.744251980628957e-06, + "loss": 1.769, + "mean_token_accuracy": 0.5989372134208679, + "num_tokens": 8257387503.0, + "step": 16153 + }, + { + "epoch": 4.368307193077339, + "grad_norm": 0.9413103461265564, + "learning_rate": 2.7436243736714617e-06, + "loss": 1.7093, + "mean_token_accuracy": 0.5925270318984985, + "num_tokens": 8257858270.0, + "step": 16154 + }, + { + "epoch": 4.368577609518659, + "grad_norm": 0.9299713969230652, + "learning_rate": 2.74299702004385e-06, + "loss": 1.8619, + "mean_token_accuracy": 0.5708394050598145, + "num_tokens": 8258382475.0, + "step": 16155 + }, + { + "epoch": 4.368848025959978, + "grad_norm": 0.8574255704879761, + "learning_rate": 2.742369919765372e-06, + "loss": 1.799, + "mean_token_accuracy": 0.5740841627120972, + "num_tokens": 8258906721.0, + "step": 16156 + }, + { + "epoch": 4.369118442401298, + "grad_norm": 0.9423446655273438, + "learning_rate": 2.7417430728552707e-06, + "loss": 1.7897, + "mean_token_accuracy": 0.5798540711402893, + "num_tokens": 8259430853.0, + "step": 16157 + }, + { + "epoch": 4.369388858842617, + "grad_norm": 0.9237724542617798, + "learning_rate": 2.741116479332776e-06, + "loss": 1.8262, + "mean_token_accuracy": 0.5811929702758789, + "num_tokens": 8259897971.0, + "step": 16158 + }, + { + "epoch": 4.369659275283937, + "grad_norm": 1.0831706523895264, + "learning_rate": 2.740490139217117e-06, + "loss": 1.9145, + "mean_token_accuracy": 0.549566388130188, + "num_tokens": 8260422056.0, + "step": 16159 + }, + { + "epoch": 4.369929691725257, + "grad_norm": 1.1106598377227783, + "learning_rate": 2.73986405252751e-06, + "loss": 1.8442, + "mean_token_accuracy": 0.5912021398544312, + "num_tokens": 8260865619.0, + "step": 16160 + }, + { + "epoch": 4.370200108166577, + "grad_norm": 0.3304283320903778, + "learning_rate": 2.739238219283164e-06, + "loss": 0.9881, + "mean_token_accuracy": 0.7161828875541687, + "num_tokens": 8261389880.0, + "step": 16161 + }, + { + "epoch": 4.370470524607896, + "grad_norm": 0.945857584476471, + "learning_rate": 2.7386126395032842e-06, + "loss": 1.7907, + "mean_token_accuracy": 0.5754592418670654, + "num_tokens": 8261914096.0, + "step": 16162 + }, + { + "epoch": 4.370740941049216, + "grad_norm": 1.0237503051757812, + "learning_rate": 2.737987313207064e-06, + "loss": 1.8592, + "mean_token_accuracy": 0.5523240566253662, + "num_tokens": 8262438243.0, + "step": 16163 + }, + { + "epoch": 4.371011357490535, + "grad_norm": 0.919642984867096, + "learning_rate": 2.7373622404136884e-06, + "loss": 1.7683, + "mean_token_accuracy": 0.5919108390808105, + "num_tokens": 8262940839.0, + "step": 16164 + }, + { + "epoch": 4.371281773931855, + "grad_norm": 1.0612759590148926, + "learning_rate": 2.7367374211423393e-06, + "loss": 1.8795, + "mean_token_accuracy": 0.5485043525695801, + "num_tokens": 8263421154.0, + "step": 16165 + }, + { + "epoch": 4.371552190373174, + "grad_norm": 0.9120994806289673, + "learning_rate": 2.7361128554121875e-06, + "loss": 1.799, + "mean_token_accuracy": 0.5818380117416382, + "num_tokens": 8263945377.0, + "step": 16166 + }, + { + "epoch": 4.3718226068144945, + "grad_norm": 0.9589935541152954, + "learning_rate": 2.735488543242393e-06, + "loss": 1.743, + "mean_token_accuracy": 0.5861998796463013, + "num_tokens": 8264469662.0, + "step": 16167 + }, + { + "epoch": 4.372093023255814, + "grad_norm": 0.8710707426071167, + "learning_rate": 2.7348644846521173e-06, + "loss": 1.8268, + "mean_token_accuracy": 0.5793644785881042, + "num_tokens": 8264993879.0, + "step": 16168 + }, + { + "epoch": 4.372363439697134, + "grad_norm": 0.991459310054779, + "learning_rate": 2.734240679660503e-06, + "loss": 1.7883, + "mean_token_accuracy": 0.5949999094009399, + "num_tokens": 8265518068.0, + "step": 16169 + }, + { + "epoch": 4.372633856138453, + "grad_norm": 0.7916221022605896, + "learning_rate": 2.733617128286693e-06, + "loss": 1.9262, + "mean_token_accuracy": 0.5594229698181152, + "num_tokens": 8266042190.0, + "step": 16170 + }, + { + "epoch": 4.372904272579773, + "grad_norm": 0.7999862432479858, + "learning_rate": 2.7329938305498203e-06, + "loss": 1.8932, + "mean_token_accuracy": 0.5645265579223633, + "num_tokens": 8266566467.0, + "step": 16171 + }, + { + "epoch": 4.373174689021092, + "grad_norm": 0.7643865346908569, + "learning_rate": 2.732370786469008e-06, + "loss": 1.8275, + "mean_token_accuracy": 0.5762628316879272, + "num_tokens": 8267090743.0, + "step": 16172 + }, + { + "epoch": 4.373445105462412, + "grad_norm": 0.8594830632209778, + "learning_rate": 2.7317479960633742e-06, + "loss": 1.8798, + "mean_token_accuracy": 0.5713881254196167, + "num_tokens": 8267614881.0, + "step": 16173 + }, + { + "epoch": 4.373715521903732, + "grad_norm": 0.9180176854133606, + "learning_rate": 2.7311254593520274e-06, + "loss": 1.9255, + "mean_token_accuracy": 0.550325870513916, + "num_tokens": 8268139051.0, + "step": 16174 + }, + { + "epoch": 4.373985938345052, + "grad_norm": 0.8905978798866272, + "learning_rate": 2.730503176354068e-06, + "loss": 1.7757, + "mean_token_accuracy": 0.5943427085876465, + "num_tokens": 8268616165.0, + "step": 16175 + }, + { + "epoch": 4.374256354786371, + "grad_norm": 0.7771614193916321, + "learning_rate": 2.729881147088591e-06, + "loss": 1.8026, + "mean_token_accuracy": 0.5838793516159058, + "num_tokens": 8269140367.0, + "step": 16176 + }, + { + "epoch": 4.374526771227691, + "grad_norm": 0.7742796540260315, + "learning_rate": 2.7292593715746805e-06, + "loss": 1.9164, + "mean_token_accuracy": 0.5647331476211548, + "num_tokens": 8269664452.0, + "step": 16177 + }, + { + "epoch": 4.37479718766901, + "grad_norm": 0.8677051067352295, + "learning_rate": 2.728637849831415e-06, + "loss": 1.8557, + "mean_token_accuracy": 0.5642606019973755, + "num_tokens": 8270150866.0, + "step": 16178 + }, + { + "epoch": 4.37506760411033, + "grad_norm": 0.7777577042579651, + "learning_rate": 2.7280165818778652e-06, + "loss": 1.8726, + "mean_token_accuracy": 0.5773612260818481, + "num_tokens": 8270675119.0, + "step": 16179 + }, + { + "epoch": 4.375338020551649, + "grad_norm": 0.8853181600570679, + "learning_rate": 2.7273955677330944e-06, + "loss": 1.8723, + "mean_token_accuracy": 0.563709020614624, + "num_tokens": 8271142138.0, + "step": 16180 + }, + { + "epoch": 4.3756084369929695, + "grad_norm": 0.3447881042957306, + "learning_rate": 2.7267748074161524e-06, + "loss": 1.1074, + "mean_token_accuracy": 0.695120096206665, + "num_tokens": 8271666414.0, + "step": 16181 + }, + { + "epoch": 4.375878853434289, + "grad_norm": 1.0104610919952393, + "learning_rate": 2.726154300946092e-06, + "loss": 1.8209, + "mean_token_accuracy": 0.5742336511611938, + "num_tokens": 8272190581.0, + "step": 16182 + }, + { + "epoch": 4.376149269875609, + "grad_norm": 0.8598110675811768, + "learning_rate": 2.7255340483419466e-06, + "loss": 1.8533, + "mean_token_accuracy": 0.5723989009857178, + "num_tokens": 8272714817.0, + "step": 16183 + }, + { + "epoch": 4.376419686316928, + "grad_norm": 0.921125054359436, + "learning_rate": 2.7249140496227534e-06, + "loss": 1.8643, + "mean_token_accuracy": 0.5715513229370117, + "num_tokens": 8273239062.0, + "step": 16184 + }, + { + "epoch": 4.376690102758248, + "grad_norm": 0.8042295575141907, + "learning_rate": 2.7242943048075303e-06, + "loss": 1.733, + "mean_token_accuracy": 0.6015962958335876, + "num_tokens": 8273763273.0, + "step": 16185 + }, + { + "epoch": 4.376960519199567, + "grad_norm": 0.9245854020118713, + "learning_rate": 2.7236748139152965e-06, + "loss": 1.8609, + "mean_token_accuracy": 0.5748600959777832, + "num_tokens": 8274287535.0, + "step": 16186 + }, + { + "epoch": 4.3772309356408865, + "grad_norm": 0.8114863634109497, + "learning_rate": 2.723055576965058e-06, + "loss": 1.7716, + "mean_token_accuracy": 0.6003606915473938, + "num_tokens": 8274785292.0, + "step": 16187 + }, + { + "epoch": 4.377501352082207, + "grad_norm": 0.8374509811401367, + "learning_rate": 2.722436593975815e-06, + "loss": 1.7662, + "mean_token_accuracy": 0.5732126235961914, + "num_tokens": 8275309535.0, + "step": 16188 + }, + { + "epoch": 4.377771768523527, + "grad_norm": 0.829383373260498, + "learning_rate": 2.7218178649665615e-06, + "loss": 1.6062, + "mean_token_accuracy": 0.6254621744155884, + "num_tokens": 8275811693.0, + "step": 16189 + }, + { + "epoch": 4.378042184964846, + "grad_norm": 0.9592364430427551, + "learning_rate": 2.7211993899562795e-06, + "loss": 1.8592, + "mean_token_accuracy": 0.5601680874824524, + "num_tokens": 8276314764.0, + "step": 16190 + }, + { + "epoch": 4.378312601406165, + "grad_norm": 1.013974666595459, + "learning_rate": 2.7205811689639465e-06, + "loss": 1.856, + "mean_token_accuracy": 0.5709032416343689, + "num_tokens": 8276838975.0, + "step": 16191 + }, + { + "epoch": 4.378583017847485, + "grad_norm": 0.8325334787368774, + "learning_rate": 2.719963202008531e-06, + "loss": 1.7749, + "mean_token_accuracy": 0.5911047458648682, + "num_tokens": 8277363181.0, + "step": 16192 + }, + { + "epoch": 4.378853434288804, + "grad_norm": 0.9333621859550476, + "learning_rate": 2.719345489108995e-06, + "loss": 1.8144, + "mean_token_accuracy": 0.5545083284378052, + "num_tokens": 8277887365.0, + "step": 16193 + }, + { + "epoch": 4.3791238507301244, + "grad_norm": 1.005044937133789, + "learning_rate": 2.7187280302842924e-06, + "loss": 1.8534, + "mean_token_accuracy": 0.5618348121643066, + "num_tokens": 8278411546.0, + "step": 16194 + }, + { + "epoch": 4.379394267171444, + "grad_norm": 0.8656541705131531, + "learning_rate": 2.7181108255533646e-06, + "loss": 1.8958, + "mean_token_accuracy": 0.5727031230926514, + "num_tokens": 8278892957.0, + "step": 16195 + }, + { + "epoch": 4.379664683612764, + "grad_norm": 0.9284042716026306, + "learning_rate": 2.7174938749351543e-06, + "loss": 1.8215, + "mean_token_accuracy": 0.5795841217041016, + "num_tokens": 8279417166.0, + "step": 16196 + }, + { + "epoch": 4.379935100054083, + "grad_norm": 1.0431804656982422, + "learning_rate": 2.716877178448588e-06, + "loss": 1.7704, + "mean_token_accuracy": 0.5816670656204224, + "num_tokens": 8279941389.0, + "step": 16197 + }, + { + "epoch": 4.380205516495403, + "grad_norm": 0.8051857948303223, + "learning_rate": 2.71626073611259e-06, + "loss": 1.9225, + "mean_token_accuracy": 0.5653403997421265, + "num_tokens": 8280465563.0, + "step": 16198 + }, + { + "epoch": 4.380475932936722, + "grad_norm": 0.94025057554245, + "learning_rate": 2.7156445479460725e-06, + "loss": 1.9004, + "mean_token_accuracy": 0.5570210218429565, + "num_tokens": 8280989716.0, + "step": 16199 + }, + { + "epoch": 4.380746349378042, + "grad_norm": 0.8390653133392334, + "learning_rate": 2.7150286139679445e-06, + "loss": 1.9398, + "mean_token_accuracy": 0.5572271943092346, + "num_tokens": 8281513849.0, + "step": 16200 + }, + { + "epoch": 4.3810167658193615, + "grad_norm": 0.4376572370529175, + "learning_rate": 2.7144129341971002e-06, + "loss": 1.1957, + "mean_token_accuracy": 0.6834322214126587, + "num_tokens": 8281979198.0, + "step": 16201 + }, + { + "epoch": 4.381287182260682, + "grad_norm": 0.9385073184967041, + "learning_rate": 2.713797508652436e-06, + "loss": 1.8136, + "mean_token_accuracy": 0.5736135244369507, + "num_tokens": 8282503358.0, + "step": 16202 + }, + { + "epoch": 4.381557598702001, + "grad_norm": 1.1043651103973389, + "learning_rate": 2.7131823373528327e-06, + "loss": 1.7833, + "mean_token_accuracy": 0.594922661781311, + "num_tokens": 8283027548.0, + "step": 16203 + }, + { + "epoch": 4.381828015143321, + "grad_norm": 0.9209941029548645, + "learning_rate": 2.712567420317163e-06, + "loss": 1.9077, + "mean_token_accuracy": 0.553424596786499, + "num_tokens": 8283551740.0, + "step": 16204 + }, + { + "epoch": 4.38209843158464, + "grad_norm": 0.9269095063209534, + "learning_rate": 2.711952757564299e-06, + "loss": 1.8448, + "mean_token_accuracy": 0.5751609206199646, + "num_tokens": 8284076018.0, + "step": 16205 + }, + { + "epoch": 4.38236884802596, + "grad_norm": 0.7815455198287964, + "learning_rate": 2.7113383491130974e-06, + "loss": 1.773, + "mean_token_accuracy": 0.5922557711601257, + "num_tokens": 8284600298.0, + "step": 16206 + }, + { + "epoch": 4.382639264467279, + "grad_norm": 0.8306549787521362, + "learning_rate": 2.7107241949824088e-06, + "loss": 1.8827, + "mean_token_accuracy": 0.5674135684967041, + "num_tokens": 8285124579.0, + "step": 16207 + }, + { + "epoch": 4.3829096809085994, + "grad_norm": 1.074959635734558, + "learning_rate": 2.7101102951910808e-06, + "loss": 1.8994, + "mean_token_accuracy": 0.5888035893440247, + "num_tokens": 8285590859.0, + "step": 16208 + }, + { + "epoch": 4.383180097349919, + "grad_norm": 0.9920074939727783, + "learning_rate": 2.7094966497579485e-06, + "loss": 1.8024, + "mean_token_accuracy": 0.5956128835678101, + "num_tokens": 8286115113.0, + "step": 16209 + }, + { + "epoch": 4.383450513791239, + "grad_norm": 0.8506423830986023, + "learning_rate": 2.7088832587018376e-06, + "loss": 1.7969, + "mean_token_accuracy": 0.5757791996002197, + "num_tokens": 8286639355.0, + "step": 16210 + }, + { + "epoch": 4.383720930232558, + "grad_norm": 0.7945981621742249, + "learning_rate": 2.708270122041573e-06, + "loss": 1.8313, + "mean_token_accuracy": 0.5669711828231812, + "num_tokens": 8287163622.0, + "step": 16211 + }, + { + "epoch": 4.383991346673878, + "grad_norm": 0.9261264801025391, + "learning_rate": 2.7076572397959627e-06, + "loss": 1.8909, + "mean_token_accuracy": 0.5677900910377502, + "num_tokens": 8287687887.0, + "step": 16212 + }, + { + "epoch": 4.384261763115197, + "grad_norm": 0.8877354264259338, + "learning_rate": 2.7070446119838155e-06, + "loss": 1.8217, + "mean_token_accuracy": 0.5767313241958618, + "num_tokens": 8288212082.0, + "step": 16213 + }, + { + "epoch": 4.384532179556517, + "grad_norm": 0.8989190459251404, + "learning_rate": 2.7064322386239294e-06, + "loss": 1.8614, + "mean_token_accuracy": 0.5742629170417786, + "num_tokens": 8288685568.0, + "step": 16214 + }, + { + "epoch": 4.3848025959978365, + "grad_norm": 0.7813180685043335, + "learning_rate": 2.7058201197350897e-06, + "loss": 1.8139, + "mean_token_accuracy": 0.5674406290054321, + "num_tokens": 8289209792.0, + "step": 16215 + }, + { + "epoch": 4.385073012439157, + "grad_norm": 0.8912730813026428, + "learning_rate": 2.7052082553360814e-06, + "loss": 1.8582, + "mean_token_accuracy": 0.5658937692642212, + "num_tokens": 8289734063.0, + "step": 16216 + }, + { + "epoch": 4.385343428880476, + "grad_norm": 0.8933124542236328, + "learning_rate": 2.7045966454456786e-06, + "loss": 1.8308, + "mean_token_accuracy": 0.5763610005378723, + "num_tokens": 8290241688.0, + "step": 16217 + }, + { + "epoch": 4.385613845321796, + "grad_norm": 0.8762218356132507, + "learning_rate": 2.7039852900826442e-06, + "loss": 1.8488, + "mean_token_accuracy": 0.5506472587585449, + "num_tokens": 8290765861.0, + "step": 16218 + }, + { + "epoch": 4.385884261763115, + "grad_norm": 0.8098008632659912, + "learning_rate": 2.7033741892657393e-06, + "loss": 1.7697, + "mean_token_accuracy": 0.5811101794242859, + "num_tokens": 8291289996.0, + "step": 16219 + }, + { + "epoch": 4.386154678204435, + "grad_norm": 0.8016395568847656, + "learning_rate": 2.702763343013713e-06, + "loss": 1.8036, + "mean_token_accuracy": 0.5707679390907288, + "num_tokens": 8291814031.0, + "step": 16220 + }, + { + "epoch": 4.386425094645754, + "grad_norm": 0.33335500955581665, + "learning_rate": 2.7021527513453056e-06, + "loss": 1.1517, + "mean_token_accuracy": 0.6962522268295288, + "num_tokens": 8292338180.0, + "step": 16221 + }, + { + "epoch": 4.3866955110870745, + "grad_norm": 1.1603474617004395, + "learning_rate": 2.701542414279257e-06, + "loss": 1.88, + "mean_token_accuracy": 0.5670595169067383, + "num_tokens": 8292862411.0, + "step": 16222 + }, + { + "epoch": 4.386965927528394, + "grad_norm": 0.8930793404579163, + "learning_rate": 2.7009323318342905e-06, + "loss": 1.7803, + "mean_token_accuracy": 0.599338948726654, + "num_tokens": 8293328757.0, + "step": 16223 + }, + { + "epoch": 4.387236343969714, + "grad_norm": 0.8344774842262268, + "learning_rate": 2.700322504029125e-06, + "loss": 1.8551, + "mean_token_accuracy": 0.5651483535766602, + "num_tokens": 8293806157.0, + "step": 16224 + }, + { + "epoch": 4.387506760411033, + "grad_norm": 1.1226089000701904, + "learning_rate": 2.699712930882474e-06, + "loss": 1.8219, + "mean_token_accuracy": 0.5864375829696655, + "num_tokens": 8294306810.0, + "step": 16225 + }, + { + "epoch": 4.387777176852353, + "grad_norm": 0.9821133017539978, + "learning_rate": 2.6991036124130375e-06, + "loss": 1.7677, + "mean_token_accuracy": 0.5915336608886719, + "num_tokens": 8294831086.0, + "step": 16226 + }, + { + "epoch": 4.388047593293672, + "grad_norm": 1.2742259502410889, + "learning_rate": 2.698494548639517e-06, + "loss": 1.8829, + "mean_token_accuracy": 0.5390472412109375, + "num_tokens": 8295355274.0, + "step": 16227 + }, + { + "epoch": 4.388318009734991, + "grad_norm": 0.7348055243492126, + "learning_rate": 2.6978857395805937e-06, + "loss": 1.7611, + "mean_token_accuracy": 0.5618463754653931, + "num_tokens": 8295879471.0, + "step": 16228 + }, + { + "epoch": 4.3885884261763115, + "grad_norm": 1.074975848197937, + "learning_rate": 2.697277185254953e-06, + "loss": 1.8292, + "mean_token_accuracy": 0.5787920355796814, + "num_tokens": 8296403726.0, + "step": 16229 + }, + { + "epoch": 4.388858842617632, + "grad_norm": 1.0288875102996826, + "learning_rate": 2.696668885681264e-06, + "loss": 1.8687, + "mean_token_accuracy": 0.5570318698883057, + "num_tokens": 8296918300.0, + "step": 16230 + }, + { + "epoch": 4.389129259058951, + "grad_norm": 1.2119542360305786, + "learning_rate": 2.696060840878193e-06, + "loss": 1.8215, + "mean_token_accuracy": 0.5782392621040344, + "num_tokens": 8297401492.0, + "step": 16231 + }, + { + "epoch": 4.38939967550027, + "grad_norm": 1.0898988246917725, + "learning_rate": 2.695453050864395e-06, + "loss": 1.9229, + "mean_token_accuracy": 0.5737314224243164, + "num_tokens": 8297925759.0, + "step": 16232 + }, + { + "epoch": 4.38967009194159, + "grad_norm": 0.9262891411781311, + "learning_rate": 2.694845515658519e-06, + "loss": 1.7477, + "mean_token_accuracy": 0.6042708158493042, + "num_tokens": 8298450044.0, + "step": 16233 + }, + { + "epoch": 4.389940508382909, + "grad_norm": 0.7972708344459534, + "learning_rate": 2.6942382352792073e-06, + "loss": 1.7956, + "mean_token_accuracy": 0.5868526101112366, + "num_tokens": 8298941647.0, + "step": 16234 + }, + { + "epoch": 4.390210924824229, + "grad_norm": 0.8462345600128174, + "learning_rate": 2.6936312097450933e-06, + "loss": 1.8643, + "mean_token_accuracy": 0.5787732601165771, + "num_tokens": 8299465842.0, + "step": 16235 + }, + { + "epoch": 4.390481341265549, + "grad_norm": 0.9828410744667053, + "learning_rate": 2.693024439074799e-06, + "loss": 1.9062, + "mean_token_accuracy": 0.5473114252090454, + "num_tokens": 8299990115.0, + "step": 16236 + }, + { + "epoch": 4.390751757706869, + "grad_norm": 1.1400830745697021, + "learning_rate": 2.6924179232869457e-06, + "loss": 1.7238, + "mean_token_accuracy": 0.5922951698303223, + "num_tokens": 8300482428.0, + "step": 16237 + }, + { + "epoch": 4.391022174148188, + "grad_norm": 1.128420114517212, + "learning_rate": 2.69181166240014e-06, + "loss": 1.9276, + "mean_token_accuracy": 0.5655996203422546, + "num_tokens": 8301006642.0, + "step": 16238 + }, + { + "epoch": 4.391292590589508, + "grad_norm": 0.977561354637146, + "learning_rate": 2.6912056564329862e-06, + "loss": 1.8834, + "mean_token_accuracy": 0.5680327415466309, + "num_tokens": 8301530885.0, + "step": 16239 + }, + { + "epoch": 4.391563007030827, + "grad_norm": 1.0130510330200195, + "learning_rate": 2.690599905404076e-06, + "loss": 1.7836, + "mean_token_accuracy": 0.573692262172699, + "num_tokens": 8302055167.0, + "step": 16240 + }, + { + "epoch": 4.391833423472147, + "grad_norm": 0.38605090975761414, + "learning_rate": 2.6899944093319997e-06, + "loss": 1.1704, + "mean_token_accuracy": 0.6718575358390808, + "num_tokens": 8302579418.0, + "step": 16241 + }, + { + "epoch": 4.392103839913466, + "grad_norm": 1.0403251647949219, + "learning_rate": 2.68938916823533e-06, + "loss": 1.801, + "mean_token_accuracy": 0.5737131834030151, + "num_tokens": 8303103551.0, + "step": 16242 + }, + { + "epoch": 4.3923742563547865, + "grad_norm": 0.7889463305473328, + "learning_rate": 2.6887841821326432e-06, + "loss": 1.8004, + "mean_token_accuracy": 0.5695400238037109, + "num_tokens": 8303627812.0, + "step": 16243 + }, + { + "epoch": 4.392644672796106, + "grad_norm": 0.8342224359512329, + "learning_rate": 2.688179451042498e-06, + "loss": 1.795, + "mean_token_accuracy": 0.5731909871101379, + "num_tokens": 8304152021.0, + "step": 16244 + }, + { + "epoch": 4.392915089237426, + "grad_norm": 0.8523207902908325, + "learning_rate": 2.687574974983452e-06, + "loss": 1.7598, + "mean_token_accuracy": 0.5920174717903137, + "num_tokens": 8304676277.0, + "step": 16245 + }, + { + "epoch": 4.393185505678745, + "grad_norm": 0.8997477293014526, + "learning_rate": 2.686970753974051e-06, + "loss": 1.9653, + "mean_token_accuracy": 0.5401184558868408, + "num_tokens": 8305200455.0, + "step": 16246 + }, + { + "epoch": 4.393455922120065, + "grad_norm": 0.9641953110694885, + "learning_rate": 2.6863667880328327e-06, + "loss": 1.7674, + "mean_token_accuracy": 0.5961933135986328, + "num_tokens": 8305684907.0, + "step": 16247 + }, + { + "epoch": 4.393726338561384, + "grad_norm": 0.956478476524353, + "learning_rate": 2.6857630771783305e-06, + "loss": 1.894, + "mean_token_accuracy": 0.5841659307479858, + "num_tokens": 8306148172.0, + "step": 16248 + }, + { + "epoch": 4.393996755002704, + "grad_norm": 1.1039714813232422, + "learning_rate": 2.685159621429069e-06, + "loss": 1.8862, + "mean_token_accuracy": 0.5617504119873047, + "num_tokens": 8306645609.0, + "step": 16249 + }, + { + "epoch": 4.394267171444024, + "grad_norm": 0.9623245000839233, + "learning_rate": 2.6845564208035602e-06, + "loss": 1.8328, + "mean_token_accuracy": 0.576245903968811, + "num_tokens": 8307169852.0, + "step": 16250 + }, + { + "epoch": 4.394537587885344, + "grad_norm": 0.9374212026596069, + "learning_rate": 2.6839534753203173e-06, + "loss": 1.8429, + "mean_token_accuracy": 0.5938923358917236, + "num_tokens": 8307624783.0, + "step": 16251 + }, + { + "epoch": 4.394808004326663, + "grad_norm": 0.9320588111877441, + "learning_rate": 2.683350784997837e-06, + "loss": 1.836, + "mean_token_accuracy": 0.5902581214904785, + "num_tokens": 8308108537.0, + "step": 16252 + }, + { + "epoch": 4.395078420767983, + "grad_norm": 0.8600667715072632, + "learning_rate": 2.6827483498546115e-06, + "loss": 1.8572, + "mean_token_accuracy": 0.5755158066749573, + "num_tokens": 8308632799.0, + "step": 16253 + }, + { + "epoch": 4.395348837209302, + "grad_norm": 0.8325991034507751, + "learning_rate": 2.6821461699091266e-06, + "loss": 1.7737, + "mean_token_accuracy": 0.5939179062843323, + "num_tokens": 8309156944.0, + "step": 16254 + }, + { + "epoch": 4.395619253650622, + "grad_norm": 0.9559600353240967, + "learning_rate": 2.6815442451798572e-06, + "loss": 1.8145, + "mean_token_accuracy": 0.5780924558639526, + "num_tokens": 8309656338.0, + "step": 16255 + }, + { + "epoch": 4.395889670091941, + "grad_norm": 0.8467960357666016, + "learning_rate": 2.680942575685273e-06, + "loss": 1.6459, + "mean_token_accuracy": 0.6150437593460083, + "num_tokens": 8310180470.0, + "step": 16256 + }, + { + "epoch": 4.3961600865332615, + "grad_norm": 1.0536442995071411, + "learning_rate": 2.6803411614438378e-06, + "loss": 1.8626, + "mean_token_accuracy": 0.5650164484977722, + "num_tokens": 8310704736.0, + "step": 16257 + }, + { + "epoch": 4.396430502974581, + "grad_norm": 0.8737409710884094, + "learning_rate": 2.679740002474002e-06, + "loss": 1.8417, + "mean_token_accuracy": 0.5798876285552979, + "num_tokens": 8311225365.0, + "step": 16258 + }, + { + "epoch": 4.396700919415901, + "grad_norm": 0.9554868340492249, + "learning_rate": 2.679139098794209e-06, + "loss": 1.7871, + "mean_token_accuracy": 0.5888453722000122, + "num_tokens": 8311749635.0, + "step": 16259 + }, + { + "epoch": 4.39697133585722, + "grad_norm": 1.0772570371627808, + "learning_rate": 2.6785384504229005e-06, + "loss": 1.8086, + "mean_token_accuracy": 0.6092573404312134, + "num_tokens": 8312201421.0, + "step": 16260 + }, + { + "epoch": 4.39724175229854, + "grad_norm": 0.31298092007637024, + "learning_rate": 2.6779380573785023e-06, + "loss": 1.0494, + "mean_token_accuracy": 0.7180430889129639, + "num_tokens": 8312725704.0, + "step": 16261 + }, + { + "epoch": 4.397512168739859, + "grad_norm": 0.9470908641815186, + "learning_rate": 2.67733791967944e-06, + "loss": 1.8616, + "mean_token_accuracy": 0.577901303768158, + "num_tokens": 8313249984.0, + "step": 16262 + }, + { + "epoch": 4.397782585181179, + "grad_norm": 1.0187537670135498, + "learning_rate": 2.6767380373441253e-06, + "loss": 1.9333, + "mean_token_accuracy": 0.5633573532104492, + "num_tokens": 8313765237.0, + "step": 16263 + }, + { + "epoch": 4.398053001622499, + "grad_norm": 1.102630615234375, + "learning_rate": 2.6761384103909636e-06, + "loss": 1.9156, + "mean_token_accuracy": 0.5675427913665771, + "num_tokens": 8314246075.0, + "step": 16264 + }, + { + "epoch": 4.398323418063819, + "grad_norm": 0.8799295425415039, + "learning_rate": 2.6755390388383564e-06, + "loss": 1.6936, + "mean_token_accuracy": 0.5905696153640747, + "num_tokens": 8314770321.0, + "step": 16265 + }, + { + "epoch": 4.398593834505138, + "grad_norm": 1.0170053243637085, + "learning_rate": 2.6749399227046914e-06, + "loss": 1.9208, + "mean_token_accuracy": 0.5611281991004944, + "num_tokens": 8315294464.0, + "step": 16266 + }, + { + "epoch": 4.398864250946458, + "grad_norm": 1.3311147689819336, + "learning_rate": 2.674341062008351e-06, + "loss": 1.5635, + "mean_token_accuracy": 0.6390862464904785, + "num_tokens": 8315818518.0, + "step": 16267 + }, + { + "epoch": 4.399134667387777, + "grad_norm": 0.9888596534729004, + "learning_rate": 2.673742456767712e-06, + "loss": 1.8834, + "mean_token_accuracy": 0.5633904933929443, + "num_tokens": 8316342721.0, + "step": 16268 + }, + { + "epoch": 4.399405083829096, + "grad_norm": 0.9500898122787476, + "learning_rate": 2.673144107001138e-06, + "loss": 1.8263, + "mean_token_accuracy": 0.5903322696685791, + "num_tokens": 8316790045.0, + "step": 16269 + }, + { + "epoch": 4.399675500270416, + "grad_norm": 0.8949172496795654, + "learning_rate": 2.672546012726992e-06, + "loss": 1.8576, + "mean_token_accuracy": 0.5706899166107178, + "num_tokens": 8317314236.0, + "step": 16270 + }, + { + "epoch": 4.3999459167117365, + "grad_norm": 0.9472634196281433, + "learning_rate": 2.671948173963625e-06, + "loss": 1.8871, + "mean_token_accuracy": 0.5646753311157227, + "num_tokens": 8317838512.0, + "step": 16271 + }, + { + "epoch": 4.400216333153056, + "grad_norm": 0.8613503575325012, + "learning_rate": 2.6713505907293785e-06, + "loss": 1.7192, + "mean_token_accuracy": 0.5654213428497314, + "num_tokens": 8318362724.0, + "step": 16272 + }, + { + "epoch": 4.400486749594375, + "grad_norm": 0.8526551723480225, + "learning_rate": 2.670753263042587e-06, + "loss": 1.8391, + "mean_token_accuracy": 0.578955352306366, + "num_tokens": 8318859058.0, + "step": 16273 + }, + { + "epoch": 4.400757166035695, + "grad_norm": 0.9364415407180786, + "learning_rate": 2.6701561909215833e-06, + "loss": 1.9125, + "mean_token_accuracy": 0.5706185698509216, + "num_tokens": 8319383330.0, + "step": 16274 + }, + { + "epoch": 4.401027582477014, + "grad_norm": 0.9819481372833252, + "learning_rate": 2.6695593743846826e-06, + "loss": 1.8363, + "mean_token_accuracy": 0.5878381729125977, + "num_tokens": 8319846142.0, + "step": 16275 + }, + { + "epoch": 4.401297998918334, + "grad_norm": 0.8874406218528748, + "learning_rate": 2.6689628134501983e-06, + "loss": 1.8006, + "mean_token_accuracy": 0.5991958379745483, + "num_tokens": 8320308764.0, + "step": 16276 + }, + { + "epoch": 4.4015684153596535, + "grad_norm": 0.898848295211792, + "learning_rate": 2.668366508136435e-06, + "loss": 1.8386, + "mean_token_accuracy": 0.5579977035522461, + "num_tokens": 8320831781.0, + "step": 16277 + }, + { + "epoch": 4.401838831800974, + "grad_norm": 0.939009964466095, + "learning_rate": 2.6677704584616904e-06, + "loss": 1.8155, + "mean_token_accuracy": 0.5685533285140991, + "num_tokens": 8321356011.0, + "step": 16278 + }, + { + "epoch": 4.402109248242293, + "grad_norm": 0.9625822305679321, + "learning_rate": 2.6671746644442498e-06, + "loss": 1.6816, + "mean_token_accuracy": 0.602453351020813, + "num_tokens": 8321854001.0, + "step": 16279 + }, + { + "epoch": 4.402379664683613, + "grad_norm": 0.941783607006073, + "learning_rate": 2.6665791261023964e-06, + "loss": 1.8396, + "mean_token_accuracy": 0.5739974975585938, + "num_tokens": 8322378276.0, + "step": 16280 + }, + { + "epoch": 4.402650081124932, + "grad_norm": 0.37260088324546814, + "learning_rate": 2.6659838434544026e-06, + "loss": 1.0418, + "mean_token_accuracy": 0.7276232242584229, + "num_tokens": 8322902552.0, + "step": 16281 + }, + { + "epoch": 4.402920497566252, + "grad_norm": 0.8405743837356567, + "learning_rate": 2.665388816518532e-06, + "loss": 1.8485, + "mean_token_accuracy": 0.5772359371185303, + "num_tokens": 8323426833.0, + "step": 16282 + }, + { + "epoch": 4.403190914007571, + "grad_norm": 0.8611368536949158, + "learning_rate": 2.664794045313043e-06, + "loss": 1.8326, + "mean_token_accuracy": 0.5677249431610107, + "num_tokens": 8323951081.0, + "step": 16283 + }, + { + "epoch": 4.403461330448891, + "grad_norm": 0.9523220658302307, + "learning_rate": 2.664199529856186e-06, + "loss": 1.8401, + "mean_token_accuracy": 0.5586684346199036, + "num_tokens": 8324475208.0, + "step": 16284 + }, + { + "epoch": 4.403731746890211, + "grad_norm": 0.910342276096344, + "learning_rate": 2.6636052701662006e-06, + "loss": 1.8886, + "mean_token_accuracy": 0.5600661039352417, + "num_tokens": 8324999424.0, + "step": 16285 + }, + { + "epoch": 4.404002163331531, + "grad_norm": 0.8156879544258118, + "learning_rate": 2.663011266261322e-06, + "loss": 1.7849, + "mean_token_accuracy": 0.5923197269439697, + "num_tokens": 8325523678.0, + "step": 16286 + }, + { + "epoch": 4.40427257977285, + "grad_norm": 0.7626866102218628, + "learning_rate": 2.6624175181597745e-06, + "loss": 1.79, + "mean_token_accuracy": 0.5894894599914551, + "num_tokens": 8326047820.0, + "step": 16287 + }, + { + "epoch": 4.40454299621417, + "grad_norm": 0.7606061697006226, + "learning_rate": 2.661824025879778e-06, + "loss": 1.7968, + "mean_token_accuracy": 0.5756576061248779, + "num_tokens": 8326571938.0, + "step": 16288 + }, + { + "epoch": 4.404813412655489, + "grad_norm": 0.8367266058921814, + "learning_rate": 2.6612307894395417e-06, + "loss": 1.8652, + "mean_token_accuracy": 0.5793346762657166, + "num_tokens": 8327096124.0, + "step": 16289 + }, + { + "epoch": 4.405083829096809, + "grad_norm": 0.753156840801239, + "learning_rate": 2.6606378088572654e-06, + "loss": 1.7865, + "mean_token_accuracy": 0.5773154497146606, + "num_tokens": 8327620392.0, + "step": 16290 + }, + { + "epoch": 4.4053542455381285, + "grad_norm": 0.8162492513656616, + "learning_rate": 2.660045084151147e-06, + "loss": 1.8901, + "mean_token_accuracy": 0.5659688115119934, + "num_tokens": 8328128189.0, + "step": 16291 + }, + { + "epoch": 4.405624661979449, + "grad_norm": 0.7530010938644409, + "learning_rate": 2.6594526153393717e-06, + "loss": 1.7941, + "mean_token_accuracy": 0.5827743411064148, + "num_tokens": 8328652444.0, + "step": 16292 + }, + { + "epoch": 4.405895078420768, + "grad_norm": 0.8045449256896973, + "learning_rate": 2.6588604024401177e-06, + "loss": 1.6381, + "mean_token_accuracy": 0.6217176914215088, + "num_tokens": 8329176594.0, + "step": 16293 + }, + { + "epoch": 4.406165494862088, + "grad_norm": 0.8218655586242676, + "learning_rate": 2.658268445471556e-06, + "loss": 1.8703, + "mean_token_accuracy": 0.5680220127105713, + "num_tokens": 8329678000.0, + "step": 16294 + }, + { + "epoch": 4.406435911303407, + "grad_norm": 0.7740538716316223, + "learning_rate": 2.6576767444518513e-06, + "loss": 1.9682, + "mean_token_accuracy": 0.5619374513626099, + "num_tokens": 8330202274.0, + "step": 16295 + }, + { + "epoch": 4.406706327744727, + "grad_norm": 0.9021888971328735, + "learning_rate": 2.6570852993991546e-06, + "loss": 1.9361, + "mean_token_accuracy": 0.5552475452423096, + "num_tokens": 8330726507.0, + "step": 16296 + }, + { + "epoch": 4.406976744186046, + "grad_norm": 0.8262234926223755, + "learning_rate": 2.6564941103316167e-06, + "loss": 1.8889, + "mean_token_accuracy": 0.5746908187866211, + "num_tokens": 8331250640.0, + "step": 16297 + }, + { + "epoch": 4.407247160627366, + "grad_norm": 0.9262793660163879, + "learning_rate": 2.655903177267378e-06, + "loss": 1.8477, + "mean_token_accuracy": 0.5692626237869263, + "num_tokens": 8331773343.0, + "step": 16298 + }, + { + "epoch": 4.407517577068686, + "grad_norm": 0.8177393078804016, + "learning_rate": 2.655312500224565e-06, + "loss": 1.7756, + "mean_token_accuracy": 0.5990351438522339, + "num_tokens": 8332271271.0, + "step": 16299 + }, + { + "epoch": 4.407787993510006, + "grad_norm": 0.8577929735183716, + "learning_rate": 2.654722079221308e-06, + "loss": 1.8034, + "mean_token_accuracy": 0.5591438412666321, + "num_tokens": 8332795514.0, + "step": 16300 + }, + { + "epoch": 4.408058409951325, + "grad_norm": 0.322344034910202, + "learning_rate": 2.654131914275718e-06, + "loss": 1.1216, + "mean_token_accuracy": 0.6935121417045593, + "num_tokens": 8333319666.0, + "step": 16301 + }, + { + "epoch": 4.408328826392645, + "grad_norm": 0.8694682717323303, + "learning_rate": 2.653542005405905e-06, + "loss": 1.7505, + "mean_token_accuracy": 0.5955519676208496, + "num_tokens": 8333781681.0, + "step": 16302 + }, + { + "epoch": 4.408599242833964, + "grad_norm": 0.875812828540802, + "learning_rate": 2.6529523526299683e-06, + "loss": 1.8011, + "mean_token_accuracy": 0.5748385190963745, + "num_tokens": 8334305871.0, + "step": 16303 + }, + { + "epoch": 4.408869659275284, + "grad_norm": 0.7724071741104126, + "learning_rate": 2.652362955966e-06, + "loss": 1.8138, + "mean_token_accuracy": 0.5848875641822815, + "num_tokens": 8334830111.0, + "step": 16304 + }, + { + "epoch": 4.4091400757166035, + "grad_norm": 0.8409916162490845, + "learning_rate": 2.651773815432085e-06, + "loss": 1.8916, + "mean_token_accuracy": 0.5795913934707642, + "num_tokens": 8335298516.0, + "step": 16305 + }, + { + "epoch": 4.409410492157924, + "grad_norm": 0.8411158919334412, + "learning_rate": 2.651184931046301e-06, + "loss": 1.8441, + "mean_token_accuracy": 0.5527896881103516, + "num_tokens": 8335822711.0, + "step": 16306 + }, + { + "epoch": 4.409680908599243, + "grad_norm": 0.7996156811714172, + "learning_rate": 2.6505963028267135e-06, + "loss": 1.8581, + "mean_token_accuracy": 0.5819828510284424, + "num_tokens": 8336326688.0, + "step": 16307 + }, + { + "epoch": 4.409951325040563, + "grad_norm": 0.9335824847221375, + "learning_rate": 2.6500079307913878e-06, + "loss": 1.8262, + "mean_token_accuracy": 0.578836977481842, + "num_tokens": 8336850863.0, + "step": 16308 + }, + { + "epoch": 4.410221741481882, + "grad_norm": 1.0198745727539062, + "learning_rate": 2.6494198149583742e-06, + "loss": 1.7781, + "mean_token_accuracy": 0.5737738013267517, + "num_tokens": 8337280115.0, + "step": 16309 + }, + { + "epoch": 4.410492157923201, + "grad_norm": 0.8280027508735657, + "learning_rate": 2.648831955345716e-06, + "loss": 1.7921, + "mean_token_accuracy": 0.5919435024261475, + "num_tokens": 8337804396.0, + "step": 16310 + }, + { + "epoch": 4.410762574364521, + "grad_norm": 0.8608886003494263, + "learning_rate": 2.6482443519714544e-06, + "loss": 1.7312, + "mean_token_accuracy": 0.5991209745407104, + "num_tokens": 8338328662.0, + "step": 16311 + }, + { + "epoch": 4.411032990805841, + "grad_norm": 0.9304802417755127, + "learning_rate": 2.647657004853616e-06, + "loss": 1.9136, + "mean_token_accuracy": 0.5666078329086304, + "num_tokens": 8338852926.0, + "step": 16312 + }, + { + "epoch": 4.411303407247161, + "grad_norm": 0.7759521007537842, + "learning_rate": 2.6470699140102224e-06, + "loss": 1.7237, + "mean_token_accuracy": 0.601714551448822, + "num_tokens": 8339322172.0, + "step": 16313 + }, + { + "epoch": 4.41157382368848, + "grad_norm": 0.8944421410560608, + "learning_rate": 2.6464830794592906e-06, + "loss": 1.8292, + "mean_token_accuracy": 0.583451509475708, + "num_tokens": 8339846074.0, + "step": 16314 + }, + { + "epoch": 4.4118442401298, + "grad_norm": 0.8801156878471375, + "learning_rate": 2.645896501218824e-06, + "loss": 1.8642, + "mean_token_accuracy": 0.5721015930175781, + "num_tokens": 8340370345.0, + "step": 16315 + }, + { + "epoch": 4.412114656571119, + "grad_norm": 0.9843936562538147, + "learning_rate": 2.6453101793068187e-06, + "loss": 1.8289, + "mean_token_accuracy": 0.5764861106872559, + "num_tokens": 8340894504.0, + "step": 16316 + }, + { + "epoch": 4.412385073012439, + "grad_norm": 0.8450524210929871, + "learning_rate": 2.644724113741268e-06, + "loss": 1.9546, + "mean_token_accuracy": 0.5632961988449097, + "num_tokens": 8341365553.0, + "step": 16317 + }, + { + "epoch": 4.412655489453758, + "grad_norm": 1.048389196395874, + "learning_rate": 2.644138304540153e-06, + "loss": 1.754, + "mean_token_accuracy": 0.5674128532409668, + "num_tokens": 8341839302.0, + "step": 16318 + }, + { + "epoch": 4.4129259058950785, + "grad_norm": 0.899183452129364, + "learning_rate": 2.6435527517214466e-06, + "loss": 1.844, + "mean_token_accuracy": 0.584586501121521, + "num_tokens": 8342351857.0, + "step": 16319 + }, + { + "epoch": 4.413196322336398, + "grad_norm": 0.9186657667160034, + "learning_rate": 2.6429674553031177e-06, + "loss": 1.6889, + "mean_token_accuracy": 0.6175296902656555, + "num_tokens": 8342847343.0, + "step": 16320 + }, + { + "epoch": 4.413466738777718, + "grad_norm": 0.3580383360385895, + "learning_rate": 2.6423824153031236e-06, + "loss": 1.1217, + "mean_token_accuracy": 0.7025165557861328, + "num_tokens": 8343371605.0, + "step": 16321 + }, + { + "epoch": 4.413737155219037, + "grad_norm": 0.9748808741569519, + "learning_rate": 2.6417976317394134e-06, + "loss": 1.9148, + "mean_token_accuracy": 0.559353232383728, + "num_tokens": 8343895883.0, + "step": 16322 + }, + { + "epoch": 4.414007571660357, + "grad_norm": 0.8069055676460266, + "learning_rate": 2.6412131046299336e-06, + "loss": 1.7778, + "mean_token_accuracy": 0.5928232669830322, + "num_tokens": 8344420067.0, + "step": 16323 + }, + { + "epoch": 4.414277988101676, + "grad_norm": 0.8598297834396362, + "learning_rate": 2.6406288339926176e-06, + "loss": 1.866, + "mean_token_accuracy": 0.5707268714904785, + "num_tokens": 8344944330.0, + "step": 16324 + }, + { + "epoch": 4.414548404542996, + "grad_norm": 0.9024242758750916, + "learning_rate": 2.6400448198453903e-06, + "loss": 1.7605, + "mean_token_accuracy": 0.6132320165634155, + "num_tokens": 8345468598.0, + "step": 16325 + }, + { + "epoch": 4.4148188209843156, + "grad_norm": 0.924315869808197, + "learning_rate": 2.6394610622061728e-06, + "loss": 1.7913, + "mean_token_accuracy": 0.5583336353302002, + "num_tokens": 8345992817.0, + "step": 16326 + }, + { + "epoch": 4.415089237425636, + "grad_norm": 0.8920279741287231, + "learning_rate": 2.638877561092879e-06, + "loss": 1.8661, + "mean_token_accuracy": 0.5744348168373108, + "num_tokens": 8346517077.0, + "step": 16327 + }, + { + "epoch": 4.415359653866955, + "grad_norm": 0.9280808568000793, + "learning_rate": 2.6382943165234083e-06, + "loss": 1.8435, + "mean_token_accuracy": 0.5754690766334534, + "num_tokens": 8347033416.0, + "step": 16328 + }, + { + "epoch": 4.415630070308275, + "grad_norm": 0.8989757299423218, + "learning_rate": 2.637711328515659e-06, + "loss": 1.8615, + "mean_token_accuracy": 0.566164493560791, + "num_tokens": 8347557656.0, + "step": 16329 + }, + { + "epoch": 4.415900486749594, + "grad_norm": 0.8348963856697083, + "learning_rate": 2.6371285970875184e-06, + "loss": 1.8191, + "mean_token_accuracy": 0.5746594667434692, + "num_tokens": 8348051646.0, + "step": 16330 + }, + { + "epoch": 4.416170903190914, + "grad_norm": 0.8221695423126221, + "learning_rate": 2.6365461222568643e-06, + "loss": 1.7413, + "mean_token_accuracy": 0.5967915654182434, + "num_tokens": 8348575897.0, + "step": 16331 + }, + { + "epoch": 4.416441319632233, + "grad_norm": 0.9649903178215027, + "learning_rate": 2.635963904041573e-06, + "loss": 1.7119, + "mean_token_accuracy": 0.5873551368713379, + "num_tokens": 8349100010.0, + "step": 16332 + }, + { + "epoch": 4.4167117360735535, + "grad_norm": 0.9218122959136963, + "learning_rate": 2.6353819424595032e-06, + "loss": 1.8737, + "mean_token_accuracy": 0.5699061155319214, + "num_tokens": 8349593547.0, + "step": 16333 + }, + { + "epoch": 4.416982152514873, + "grad_norm": 3.035210371017456, + "learning_rate": 2.6348002375285164e-06, + "loss": 1.6421, + "mean_token_accuracy": 0.6411778926849365, + "num_tokens": 8349993516.0, + "step": 16334 + }, + { + "epoch": 4.417252568956193, + "grad_norm": 0.8884866833686829, + "learning_rate": 2.6342187892664596e-06, + "loss": 1.9485, + "mean_token_accuracy": 0.5646935105323792, + "num_tokens": 8350517711.0, + "step": 16335 + }, + { + "epoch": 4.417522985397512, + "grad_norm": 0.8004021644592285, + "learning_rate": 2.63363759769117e-06, + "loss": 1.8091, + "mean_token_accuracy": 0.5913058519363403, + "num_tokens": 8351041981.0, + "step": 16336 + }, + { + "epoch": 4.417793401838832, + "grad_norm": 0.821439266204834, + "learning_rate": 2.6330566628204843e-06, + "loss": 1.7421, + "mean_token_accuracy": 0.5728165507316589, + "num_tokens": 8351566111.0, + "step": 16337 + }, + { + "epoch": 4.418063818280151, + "grad_norm": 0.9430598020553589, + "learning_rate": 2.632475984672226e-06, + "loss": 1.8124, + "mean_token_accuracy": 0.5733094215393066, + "num_tokens": 8352090286.0, + "step": 16338 + }, + { + "epoch": 4.418334234721471, + "grad_norm": 0.9038327932357788, + "learning_rate": 2.6318955632642097e-06, + "loss": 1.9736, + "mean_token_accuracy": 0.5573079586029053, + "num_tokens": 8352564902.0, + "step": 16339 + }, + { + "epoch": 4.4186046511627906, + "grad_norm": 0.8013404011726379, + "learning_rate": 2.631315398614247e-06, + "loss": 1.8353, + "mean_token_accuracy": 0.575927734375, + "num_tokens": 8353089058.0, + "step": 16340 + }, + { + "epoch": 4.418875067604111, + "grad_norm": 0.3331926465034485, + "learning_rate": 2.6307354907401406e-06, + "loss": 1.0831, + "mean_token_accuracy": 0.7166846990585327, + "num_tokens": 8353613334.0, + "step": 16341 + }, + { + "epoch": 4.41914548404543, + "grad_norm": 1.024918556213379, + "learning_rate": 2.630155839659679e-06, + "loss": 1.8111, + "mean_token_accuracy": 0.5823729634284973, + "num_tokens": 8354137606.0, + "step": 16342 + }, + { + "epoch": 4.41941590048675, + "grad_norm": 0.90109783411026, + "learning_rate": 2.6295764453906526e-06, + "loss": 1.8682, + "mean_token_accuracy": 0.5746648907661438, + "num_tokens": 8354661884.0, + "step": 16343 + }, + { + "epoch": 4.419686316928069, + "grad_norm": 0.8440985679626465, + "learning_rate": 2.6289973079508366e-06, + "loss": 1.8282, + "mean_token_accuracy": 0.5710604190826416, + "num_tokens": 8355186080.0, + "step": 16344 + }, + { + "epoch": 4.419956733369389, + "grad_norm": 0.9238697290420532, + "learning_rate": 2.6284184273579986e-06, + "loss": 1.7629, + "mean_token_accuracy": 0.5815333724021912, + "num_tokens": 8355710332.0, + "step": 16345 + }, + { + "epoch": 4.420227149810708, + "grad_norm": 0.7751708626747131, + "learning_rate": 2.6278398036299047e-06, + "loss": 1.7571, + "mean_token_accuracy": 0.5890868306159973, + "num_tokens": 8356234548.0, + "step": 16346 + }, + { + "epoch": 4.4204975662520285, + "grad_norm": 0.8414819836616516, + "learning_rate": 2.6272614367843065e-06, + "loss": 1.7691, + "mean_token_accuracy": 0.5835983753204346, + "num_tokens": 8356743215.0, + "step": 16347 + }, + { + "epoch": 4.420767982693348, + "grad_norm": 0.864535927772522, + "learning_rate": 2.6266833268389485e-06, + "loss": 1.8196, + "mean_token_accuracy": 0.5758322477340698, + "num_tokens": 8357267427.0, + "step": 16348 + }, + { + "epoch": 4.421038399134668, + "grad_norm": 0.7682017087936401, + "learning_rate": 2.626105473811572e-06, + "loss": 1.8629, + "mean_token_accuracy": 0.5584298372268677, + "num_tokens": 8357791640.0, + "step": 16349 + }, + { + "epoch": 4.421308815575987, + "grad_norm": 0.8977522850036621, + "learning_rate": 2.625527877719905e-06, + "loss": 1.85, + "mean_token_accuracy": 0.5896661877632141, + "num_tokens": 8358288475.0, + "step": 16350 + }, + { + "epoch": 4.421579232017306, + "grad_norm": 0.8111764788627625, + "learning_rate": 2.62495053858167e-06, + "loss": 1.7697, + "mean_token_accuracy": 0.6003412008285522, + "num_tokens": 8358812646.0, + "step": 16351 + }, + { + "epoch": 4.421849648458626, + "grad_norm": 0.7622833251953125, + "learning_rate": 2.6243734564145825e-06, + "loss": 1.8053, + "mean_token_accuracy": 0.5828929543495178, + "num_tokens": 8359336915.0, + "step": 16352 + }, + { + "epoch": 4.422120064899946, + "grad_norm": 0.9341472387313843, + "learning_rate": 2.623796631236347e-06, + "loss": 1.9177, + "mean_token_accuracy": 0.5541995763778687, + "num_tokens": 8359861091.0, + "step": 16353 + }, + { + "epoch": 4.4223904813412656, + "grad_norm": 0.8762210011482239, + "learning_rate": 2.623220063064665e-06, + "loss": 1.8334, + "mean_token_accuracy": 0.5775481462478638, + "num_tokens": 8360385336.0, + "step": 16354 + }, + { + "epoch": 4.422660897782585, + "grad_norm": 0.8563927412033081, + "learning_rate": 2.6226437519172254e-06, + "loss": 1.8785, + "mean_token_accuracy": 0.5788825750350952, + "num_tokens": 8360893408.0, + "step": 16355 + }, + { + "epoch": 4.422931314223905, + "grad_norm": 0.8581533432006836, + "learning_rate": 2.622067697811711e-06, + "loss": 1.8735, + "mean_token_accuracy": 0.5837221145629883, + "num_tokens": 8361405995.0, + "step": 16356 + }, + { + "epoch": 4.423201730665224, + "grad_norm": 0.8170847296714783, + "learning_rate": 2.621491900765799e-06, + "loss": 1.7672, + "mean_token_accuracy": 0.5811870694160461, + "num_tokens": 8361930156.0, + "step": 16357 + }, + { + "epoch": 4.423472147106544, + "grad_norm": 0.9068911075592041, + "learning_rate": 2.6209163607971556e-06, + "loss": 1.8005, + "mean_token_accuracy": 0.5729942321777344, + "num_tokens": 8362454176.0, + "step": 16358 + }, + { + "epoch": 4.423742563547863, + "grad_norm": 1.0371003150939941, + "learning_rate": 2.6203410779234374e-06, + "loss": 1.8026, + "mean_token_accuracy": 0.5590511560440063, + "num_tokens": 8362978258.0, + "step": 16359 + }, + { + "epoch": 4.424012979989183, + "grad_norm": 0.9526181817054749, + "learning_rate": 2.619766052162299e-06, + "loss": 1.7403, + "mean_token_accuracy": 0.6080343723297119, + "num_tokens": 8363502523.0, + "step": 16360 + }, + { + "epoch": 4.424283396430503, + "grad_norm": 0.318048357963562, + "learning_rate": 2.619191283531384e-06, + "loss": 1.1229, + "mean_token_accuracy": 0.6999667286872864, + "num_tokens": 8364026653.0, + "step": 16361 + }, + { + "epoch": 4.424553812871823, + "grad_norm": 1.034684181213379, + "learning_rate": 2.618616772048324e-06, + "loss": 1.8246, + "mean_token_accuracy": 0.5868330001831055, + "num_tokens": 8364550834.0, + "step": 16362 + }, + { + "epoch": 4.424824229313142, + "grad_norm": 1.136858344078064, + "learning_rate": 2.6180425177307524e-06, + "loss": 1.9618, + "mean_token_accuracy": 0.5705384612083435, + "num_tokens": 8365024462.0, + "step": 16363 + }, + { + "epoch": 4.425094645754462, + "grad_norm": 0.9486026763916016, + "learning_rate": 2.617468520596285e-06, + "loss": 1.8355, + "mean_token_accuracy": 0.5779622793197632, + "num_tokens": 8365529137.0, + "step": 16364 + }, + { + "epoch": 4.425365062195781, + "grad_norm": 0.870925784111023, + "learning_rate": 2.616894780662534e-06, + "loss": 1.7876, + "mean_token_accuracy": 0.5760699510574341, + "num_tokens": 8366053090.0, + "step": 16365 + }, + { + "epoch": 4.425635478637101, + "grad_norm": 0.9374924898147583, + "learning_rate": 2.6163212979471047e-06, + "loss": 1.8609, + "mean_token_accuracy": 0.5551751852035522, + "num_tokens": 8366577282.0, + "step": 16366 + }, + { + "epoch": 4.4259058950784205, + "grad_norm": 0.8916085362434387, + "learning_rate": 2.6157480724675915e-06, + "loss": 1.7671, + "mean_token_accuracy": 0.5738706588745117, + "num_tokens": 8367101460.0, + "step": 16367 + }, + { + "epoch": 4.426176311519741, + "grad_norm": 0.9293206930160522, + "learning_rate": 2.615175104241587e-06, + "loss": 1.8245, + "mean_token_accuracy": 0.591018557548523, + "num_tokens": 8367625692.0, + "step": 16368 + }, + { + "epoch": 4.42644672796106, + "grad_norm": 0.9492203593254089, + "learning_rate": 2.614602393286666e-06, + "loss": 1.8561, + "mean_token_accuracy": 0.5824500322341919, + "num_tokens": 8368149913.0, + "step": 16369 + }, + { + "epoch": 4.42671714440238, + "grad_norm": 0.8278179168701172, + "learning_rate": 2.614029939620404e-06, + "loss": 1.7343, + "mean_token_accuracy": 0.583537220954895, + "num_tokens": 8368673988.0, + "step": 16370 + }, + { + "epoch": 4.426987560843699, + "grad_norm": 0.9598603248596191, + "learning_rate": 2.613457743260364e-06, + "loss": 1.7971, + "mean_token_accuracy": 0.569800078868866, + "num_tokens": 8369198161.0, + "step": 16371 + }, + { + "epoch": 4.427257977285019, + "grad_norm": 0.9365677833557129, + "learning_rate": 2.612885804224106e-06, + "loss": 1.7895, + "mean_token_accuracy": 0.58536696434021, + "num_tokens": 8369722292.0, + "step": 16372 + }, + { + "epoch": 4.427528393726338, + "grad_norm": 0.8217282891273499, + "learning_rate": 2.612314122529176e-06, + "loss": 1.8049, + "mean_token_accuracy": 0.5761699676513672, + "num_tokens": 8370246483.0, + "step": 16373 + }, + { + "epoch": 4.427798810167658, + "grad_norm": 0.8588076829910278, + "learning_rate": 2.6117426981931134e-06, + "loss": 1.7702, + "mean_token_accuracy": 0.5921148061752319, + "num_tokens": 8370760853.0, + "step": 16374 + }, + { + "epoch": 4.428069226608978, + "grad_norm": 0.7463195323944092, + "learning_rate": 2.6111715312334544e-06, + "loss": 1.8055, + "mean_token_accuracy": 0.5830522179603577, + "num_tokens": 8371285112.0, + "step": 16375 + }, + { + "epoch": 4.428339643050298, + "grad_norm": 0.8552924990653992, + "learning_rate": 2.6106006216677233e-06, + "loss": 1.8804, + "mean_token_accuracy": 0.5846153497695923, + "num_tokens": 8371734295.0, + "step": 16376 + }, + { + "epoch": 4.428610059491617, + "grad_norm": 0.8437815308570862, + "learning_rate": 2.6100299695134347e-06, + "loss": 1.8412, + "mean_token_accuracy": 0.5842266082763672, + "num_tokens": 8372258573.0, + "step": 16377 + }, + { + "epoch": 4.428880475932937, + "grad_norm": 1.1364465951919556, + "learning_rate": 2.6094595747881026e-06, + "loss": 1.8136, + "mean_token_accuracy": 0.5671999454498291, + "num_tokens": 8372710307.0, + "step": 16378 + }, + { + "epoch": 4.429150892374256, + "grad_norm": 1.0245851278305054, + "learning_rate": 2.6088894375092233e-06, + "loss": 1.703, + "mean_token_accuracy": 0.6026500463485718, + "num_tokens": 8373177963.0, + "step": 16379 + }, + { + "epoch": 4.429421308815576, + "grad_norm": 0.7524458169937134, + "learning_rate": 2.6083195576942936e-06, + "loss": 1.8425, + "mean_token_accuracy": 0.5723745822906494, + "num_tokens": 8373702229.0, + "step": 16380 + }, + { + "epoch": 4.4296917252568955, + "grad_norm": 0.3397464156150818, + "learning_rate": 2.6077499353607988e-06, + "loss": 1.096, + "mean_token_accuracy": 0.7005093097686768, + "num_tokens": 8374226359.0, + "step": 16381 + }, + { + "epoch": 4.429962141698216, + "grad_norm": 0.9823393821716309, + "learning_rate": 2.6071805705262152e-06, + "loss": 1.8913, + "mean_token_accuracy": 0.5642120838165283, + "num_tokens": 8374736901.0, + "step": 16382 + }, + { + "epoch": 4.430232558139535, + "grad_norm": 0.7924048900604248, + "learning_rate": 2.606611463208013e-06, + "loss": 1.8542, + "mean_token_accuracy": 0.584503173828125, + "num_tokens": 8375261067.0, + "step": 16383 + }, + { + "epoch": 4.430502974580855, + "grad_norm": 0.8330602049827576, + "learning_rate": 2.6060426134236554e-06, + "loss": 1.8087, + "mean_token_accuracy": 0.5930886268615723, + "num_tokens": 8375785303.0, + "step": 16384 + }, + { + "epoch": 4.430773391022174, + "grad_norm": 0.7170494794845581, + "learning_rate": 2.6054740211905942e-06, + "loss": 1.8076, + "mean_token_accuracy": 0.5756574869155884, + "num_tokens": 8376309523.0, + "step": 16385 + }, + { + "epoch": 4.431043807463494, + "grad_norm": 0.9612166285514832, + "learning_rate": 2.6049056865262784e-06, + "loss": 1.828, + "mean_token_accuracy": 0.5610237121582031, + "num_tokens": 8376794911.0, + "step": 16386 + }, + { + "epoch": 4.431314223904813, + "grad_norm": 0.9248942136764526, + "learning_rate": 2.604337609448144e-06, + "loss": 1.8708, + "mean_token_accuracy": 0.5721069574356079, + "num_tokens": 8377319000.0, + "step": 16387 + }, + { + "epoch": 4.431584640346133, + "grad_norm": 1.0654962062835693, + "learning_rate": 2.6037697899736215e-06, + "loss": 1.9075, + "mean_token_accuracy": 0.5482841730117798, + "num_tokens": 8377843203.0, + "step": 16388 + }, + { + "epoch": 4.431855056787453, + "grad_norm": 0.7704185247421265, + "learning_rate": 2.6032022281201334e-06, + "loss": 1.7994, + "mean_token_accuracy": 0.5954093337059021, + "num_tokens": 8378367383.0, + "step": 16389 + }, + { + "epoch": 4.432125473228773, + "grad_norm": 0.7902304530143738, + "learning_rate": 2.6026349239050962e-06, + "loss": 1.8791, + "mean_token_accuracy": 0.5601251125335693, + "num_tokens": 8378891479.0, + "step": 16390 + }, + { + "epoch": 4.432395889670092, + "grad_norm": 0.9650033712387085, + "learning_rate": 2.602067877345911e-06, + "loss": 1.8535, + "mean_token_accuracy": 0.5858777165412903, + "num_tokens": 8379347373.0, + "step": 16391 + }, + { + "epoch": 4.432666306111411, + "grad_norm": 0.8999001383781433, + "learning_rate": 2.6015010884599835e-06, + "loss": 1.8918, + "mean_token_accuracy": 0.5703626871109009, + "num_tokens": 8379871449.0, + "step": 16392 + }, + { + "epoch": 4.432936722552731, + "grad_norm": 0.9635007977485657, + "learning_rate": 2.6009345572646998e-06, + "loss": 1.8396, + "mean_token_accuracy": 0.5603057146072388, + "num_tokens": 8380395570.0, + "step": 16393 + }, + { + "epoch": 4.433207138994051, + "grad_norm": 0.775610625743866, + "learning_rate": 2.6003682837774424e-06, + "loss": 1.8648, + "mean_token_accuracy": 0.5788708925247192, + "num_tokens": 8380919786.0, + "step": 16394 + }, + { + "epoch": 4.4334775554353705, + "grad_norm": 0.9315019249916077, + "learning_rate": 2.5998022680155895e-06, + "loss": 1.9057, + "mean_token_accuracy": 0.5686283111572266, + "num_tokens": 8381427674.0, + "step": 16395 + }, + { + "epoch": 4.43374797187669, + "grad_norm": 0.8724614977836609, + "learning_rate": 2.599236509996505e-06, + "loss": 1.7494, + "mean_token_accuracy": 0.5808191299438477, + "num_tokens": 8381951874.0, + "step": 16396 + }, + { + "epoch": 4.43401838831801, + "grad_norm": 0.843822181224823, + "learning_rate": 2.598671009737551e-06, + "loss": 1.7772, + "mean_token_accuracy": 0.5965707302093506, + "num_tokens": 8382476023.0, + "step": 16397 + }, + { + "epoch": 4.434288804759329, + "grad_norm": 0.9022906422615051, + "learning_rate": 2.5981057672560754e-06, + "loss": 1.838, + "mean_token_accuracy": 0.5736527442932129, + "num_tokens": 8383000115.0, + "step": 16398 + }, + { + "epoch": 4.434559221200649, + "grad_norm": 0.8206889033317566, + "learning_rate": 2.5975407825694243e-06, + "loss": 1.7599, + "mean_token_accuracy": 0.6034932732582092, + "num_tokens": 8383492659.0, + "step": 16399 + }, + { + "epoch": 4.434829637641968, + "grad_norm": 0.8344615697860718, + "learning_rate": 2.5969760556949308e-06, + "loss": 1.7837, + "mean_token_accuracy": 0.5882434844970703, + "num_tokens": 8384016936.0, + "step": 16400 + }, + { + "epoch": 4.435100054083288, + "grad_norm": 0.33363887667655945, + "learning_rate": 2.5964115866499264e-06, + "loss": 1.0448, + "mean_token_accuracy": 0.7237107753753662, + "num_tokens": 8384541124.0, + "step": 16401 + }, + { + "epoch": 4.4353704705246075, + "grad_norm": 0.9979276657104492, + "learning_rate": 2.5958473754517246e-06, + "loss": 1.8099, + "mean_token_accuracy": 0.5804905891418457, + "num_tokens": 8385020662.0, + "step": 16402 + }, + { + "epoch": 4.435640886965928, + "grad_norm": 1.0147534608840942, + "learning_rate": 2.5952834221176425e-06, + "loss": 1.8511, + "mean_token_accuracy": 0.5786383152008057, + "num_tokens": 8385544851.0, + "step": 16403 + }, + { + "epoch": 4.435911303407247, + "grad_norm": 1.0862735509872437, + "learning_rate": 2.594719726664982e-06, + "loss": 1.8111, + "mean_token_accuracy": 0.5725890398025513, + "num_tokens": 8386069093.0, + "step": 16404 + }, + { + "epoch": 4.436181719848567, + "grad_norm": 0.8648231029510498, + "learning_rate": 2.5941562891110365e-06, + "loss": 1.9593, + "mean_token_accuracy": 0.5308895111083984, + "num_tokens": 8386593337.0, + "step": 16405 + }, + { + "epoch": 4.436452136289886, + "grad_norm": 0.8628883957862854, + "learning_rate": 2.593593109473097e-06, + "loss": 1.8575, + "mean_token_accuracy": 0.5857715010643005, + "num_tokens": 8387117448.0, + "step": 16406 + }, + { + "epoch": 4.436722552731206, + "grad_norm": 1.0705851316452026, + "learning_rate": 2.5930301877684434e-06, + "loss": 1.7586, + "mean_token_accuracy": 0.5824484825134277, + "num_tokens": 8387604681.0, + "step": 16407 + }, + { + "epoch": 4.436992969172525, + "grad_norm": 0.8334093689918518, + "learning_rate": 2.5924675240143454e-06, + "loss": 1.7299, + "mean_token_accuracy": 0.6013634204864502, + "num_tokens": 8388128925.0, + "step": 16408 + }, + { + "epoch": 4.4372633856138455, + "grad_norm": 0.8229671716690063, + "learning_rate": 2.5919051182280702e-06, + "loss": 1.9097, + "mean_token_accuracy": 0.572664737701416, + "num_tokens": 8388631480.0, + "step": 16409 + }, + { + "epoch": 4.437533802055165, + "grad_norm": 0.9060949087142944, + "learning_rate": 2.591342970426871e-06, + "loss": 1.8555, + "mean_token_accuracy": 0.5838409662246704, + "num_tokens": 8389070182.0, + "step": 16410 + }, + { + "epoch": 4.437804218496485, + "grad_norm": 0.8638928532600403, + "learning_rate": 2.5907810806279998e-06, + "loss": 1.8445, + "mean_token_accuracy": 0.5634337067604065, + "num_tokens": 8389543940.0, + "step": 16411 + }, + { + "epoch": 4.438074634937804, + "grad_norm": 0.9840535521507263, + "learning_rate": 2.5902194488486927e-06, + "loss": 1.7721, + "mean_token_accuracy": 0.5882804989814758, + "num_tokens": 8390068084.0, + "step": 16412 + }, + { + "epoch": 4.438345051379124, + "grad_norm": 1.006971001625061, + "learning_rate": 2.589658075106186e-06, + "loss": 1.7209, + "mean_token_accuracy": 0.5895694494247437, + "num_tokens": 8390592191.0, + "step": 16413 + }, + { + "epoch": 4.438615467820443, + "grad_norm": 0.895954966545105, + "learning_rate": 2.5890969594177016e-06, + "loss": 1.6958, + "mean_token_accuracy": 0.5916717648506165, + "num_tokens": 8391116366.0, + "step": 16414 + }, + { + "epoch": 4.438885884261763, + "grad_norm": 0.8630063533782959, + "learning_rate": 2.588536101800458e-06, + "loss": 1.8957, + "mean_token_accuracy": 0.5489773154258728, + "num_tokens": 8391640592.0, + "step": 16415 + }, + { + "epoch": 4.4391563007030825, + "grad_norm": 0.7502323389053345, + "learning_rate": 2.5879755022716645e-06, + "loss": 1.8429, + "mean_token_accuracy": 0.5701689124107361, + "num_tokens": 8392164832.0, + "step": 16416 + }, + { + "epoch": 4.439426717144403, + "grad_norm": 1.0626099109649658, + "learning_rate": 2.587415160848518e-06, + "loss": 1.8223, + "mean_token_accuracy": 0.5868760943412781, + "num_tokens": 8392688942.0, + "step": 16417 + }, + { + "epoch": 4.439697133585722, + "grad_norm": 0.9367249011993408, + "learning_rate": 2.5868550775482172e-06, + "loss": 1.7904, + "mean_token_accuracy": 0.5757708549499512, + "num_tokens": 8393154745.0, + "step": 16418 + }, + { + "epoch": 4.439967550027042, + "grad_norm": 0.8854276537895203, + "learning_rate": 2.586295252387943e-06, + "loss": 1.8208, + "mean_token_accuracy": 0.5865254402160645, + "num_tokens": 8393637968.0, + "step": 16419 + }, + { + "epoch": 4.440237966468361, + "grad_norm": 0.9982411861419678, + "learning_rate": 2.5857356853848713e-06, + "loss": 1.6564, + "mean_token_accuracy": 0.5932905077934265, + "num_tokens": 8394162217.0, + "step": 16420 + }, + { + "epoch": 4.440508382909681, + "grad_norm": 0.346733421087265, + "learning_rate": 2.585176376556177e-06, + "loss": 1.0526, + "mean_token_accuracy": 0.7200049161911011, + "num_tokens": 8394686442.0, + "step": 16421 + }, + { + "epoch": 4.440778799351, + "grad_norm": 1.004449486732483, + "learning_rate": 2.5846173259190166e-06, + "loss": 1.8717, + "mean_token_accuracy": 0.582343339920044, + "num_tokens": 8395185940.0, + "step": 16422 + }, + { + "epoch": 4.4410492157923205, + "grad_norm": 0.94517582654953, + "learning_rate": 2.584058533490543e-06, + "loss": 1.8803, + "mean_token_accuracy": 0.5588741302490234, + "num_tokens": 8395710173.0, + "step": 16423 + }, + { + "epoch": 4.44131963223364, + "grad_norm": 0.8250675797462463, + "learning_rate": 2.5834999992879057e-06, + "loss": 1.7483, + "mean_token_accuracy": 0.5821345448493958, + "num_tokens": 8396234389.0, + "step": 16424 + }, + { + "epoch": 4.44159004867496, + "grad_norm": 0.8495391011238098, + "learning_rate": 2.5829417233282373e-06, + "loss": 1.8508, + "mean_token_accuracy": 0.5879043936729431, + "num_tokens": 8396748191.0, + "step": 16425 + }, + { + "epoch": 4.441860465116279, + "grad_norm": 0.8789790868759155, + "learning_rate": 2.5823837056286705e-06, + "loss": 1.8507, + "mean_token_accuracy": 0.5768123865127563, + "num_tokens": 8397237084.0, + "step": 16426 + }, + { + "epoch": 4.442130881557599, + "grad_norm": 0.7177753448486328, + "learning_rate": 2.5818259462063274e-06, + "loss": 1.7563, + "mean_token_accuracy": 0.5758085250854492, + "num_tokens": 8397761194.0, + "step": 16427 + }, + { + "epoch": 4.442401297998918, + "grad_norm": 0.9759888052940369, + "learning_rate": 2.5812684450783187e-06, + "loss": 1.8286, + "mean_token_accuracy": 0.5893653035163879, + "num_tokens": 8398285384.0, + "step": 16428 + }, + { + "epoch": 4.442671714440238, + "grad_norm": 0.7554633021354675, + "learning_rate": 2.580711202261754e-06, + "loss": 1.7199, + "mean_token_accuracy": 0.593027651309967, + "num_tokens": 8398809538.0, + "step": 16429 + }, + { + "epoch": 4.4429421308815575, + "grad_norm": 0.7414389252662659, + "learning_rate": 2.5801542177737283e-06, + "loss": 1.7958, + "mean_token_accuracy": 0.58198481798172, + "num_tokens": 8399333658.0, + "step": 16430 + }, + { + "epoch": 4.443212547322878, + "grad_norm": 0.8392457962036133, + "learning_rate": 2.5795974916313315e-06, + "loss": 1.7217, + "mean_token_accuracy": 0.5874857902526855, + "num_tokens": 8399857935.0, + "step": 16431 + }, + { + "epoch": 4.443482963764197, + "grad_norm": 1.1138248443603516, + "learning_rate": 2.5790410238516485e-06, + "loss": 1.8644, + "mean_token_accuracy": 0.5750958323478699, + "num_tokens": 8400333538.0, + "step": 16432 + }, + { + "epoch": 4.443753380205516, + "grad_norm": 0.9830881357192993, + "learning_rate": 2.5784848144517503e-06, + "loss": 1.9229, + "mean_token_accuracy": 0.5415971875190735, + "num_tokens": 8400857581.0, + "step": 16433 + }, + { + "epoch": 4.444023796646836, + "grad_norm": 0.888335108757019, + "learning_rate": 2.577928863448703e-06, + "loss": 1.7126, + "mean_token_accuracy": 0.6024836301803589, + "num_tokens": 8401325280.0, + "step": 16434 + }, + { + "epoch": 4.444294213088156, + "grad_norm": 0.8423448801040649, + "learning_rate": 2.5773731708595677e-06, + "loss": 1.9185, + "mean_token_accuracy": 0.5585005283355713, + "num_tokens": 8401849479.0, + "step": 16435 + }, + { + "epoch": 4.444564629529475, + "grad_norm": 0.8717311024665833, + "learning_rate": 2.576817736701392e-06, + "loss": 1.8705, + "mean_token_accuracy": 0.5636268854141235, + "num_tokens": 8402373737.0, + "step": 16436 + }, + { + "epoch": 4.444835045970795, + "grad_norm": 1.0020825862884521, + "learning_rate": 2.576262560991218e-06, + "loss": 1.8851, + "mean_token_accuracy": 0.5610201358795166, + "num_tokens": 8402897925.0, + "step": 16437 + }, + { + "epoch": 4.445105462412115, + "grad_norm": 0.8982502222061157, + "learning_rate": 2.575707643746083e-06, + "loss": 1.7799, + "mean_token_accuracy": 0.5767204165458679, + "num_tokens": 8403422118.0, + "step": 16438 + }, + { + "epoch": 4.445375878853434, + "grad_norm": 0.847119152545929, + "learning_rate": 2.5751529849830097e-06, + "loss": 1.8309, + "mean_token_accuracy": 0.5776335000991821, + "num_tokens": 8403946287.0, + "step": 16439 + }, + { + "epoch": 4.445646295294754, + "grad_norm": 0.8828698396682739, + "learning_rate": 2.574598584719019e-06, + "loss": 1.7171, + "mean_token_accuracy": 0.5878797173500061, + "num_tokens": 8404410442.0, + "step": 16440 + }, + { + "epoch": 4.445916711736073, + "grad_norm": 0.3769211769104004, + "learning_rate": 2.574044442971123e-06, + "loss": 1.059, + "mean_token_accuracy": 0.7123828530311584, + "num_tokens": 8404934592.0, + "step": 16441 + }, + { + "epoch": 4.446187128177393, + "grad_norm": 0.8708076477050781, + "learning_rate": 2.5734905597563227e-06, + "loss": 1.7879, + "mean_token_accuracy": 0.5632635354995728, + "num_tokens": 8405458727.0, + "step": 16442 + }, + { + "epoch": 4.4464575446187125, + "grad_norm": 0.9999671578407288, + "learning_rate": 2.5729369350916106e-06, + "loss": 1.8997, + "mean_token_accuracy": 0.5853716135025024, + "num_tokens": 8405959897.0, + "step": 16443 + }, + { + "epoch": 4.4467279610600325, + "grad_norm": 1.0930578708648682, + "learning_rate": 2.5723835689939773e-06, + "loss": 1.8835, + "mean_token_accuracy": 0.5895006656646729, + "num_tokens": 8406447038.0, + "step": 16444 + }, + { + "epoch": 4.446998377501352, + "grad_norm": 0.8883737325668335, + "learning_rate": 2.5718304614804e-06, + "loss": 1.7693, + "mean_token_accuracy": 0.5763131976127625, + "num_tokens": 8406971148.0, + "step": 16445 + }, + { + "epoch": 4.447268793942672, + "grad_norm": 0.7426568865776062, + "learning_rate": 2.571277612567849e-06, + "loss": 1.7665, + "mean_token_accuracy": 0.5749993324279785, + "num_tokens": 8407495415.0, + "step": 16446 + }, + { + "epoch": 4.447539210383991, + "grad_norm": 0.7849779725074768, + "learning_rate": 2.5707250222732894e-06, + "loss": 1.6574, + "mean_token_accuracy": 0.625083327293396, + "num_tokens": 8408019681.0, + "step": 16447 + }, + { + "epoch": 4.447809626825311, + "grad_norm": 0.7711138129234314, + "learning_rate": 2.570172690613674e-06, + "loss": 1.7509, + "mean_token_accuracy": 0.5846970081329346, + "num_tokens": 8408543877.0, + "step": 16448 + }, + { + "epoch": 4.44808004326663, + "grad_norm": 0.7480186223983765, + "learning_rate": 2.56962061760595e-06, + "loss": 1.636, + "mean_token_accuracy": 0.6248750686645508, + "num_tokens": 8409068151.0, + "step": 16449 + }, + { + "epoch": 4.44835045970795, + "grad_norm": 0.7465925812721252, + "learning_rate": 2.5690688032670594e-06, + "loss": 1.684, + "mean_token_accuracy": 0.6215837001800537, + "num_tokens": 8409592344.0, + "step": 16450 + }, + { + "epoch": 4.44862087614927, + "grad_norm": 0.8755999803543091, + "learning_rate": 2.568517247613931e-06, + "loss": 1.9303, + "mean_token_accuracy": 0.569259762763977, + "num_tokens": 8410002823.0, + "step": 16451 + }, + { + "epoch": 4.44889129259059, + "grad_norm": 0.9162470698356628, + "learning_rate": 2.567965950663489e-06, + "loss": 1.8877, + "mean_token_accuracy": 0.5691236257553101, + "num_tokens": 8410496031.0, + "step": 16452 + }, + { + "epoch": 4.449161709031909, + "grad_norm": 0.9257559180259705, + "learning_rate": 2.567414912432648e-06, + "loss": 1.865, + "mean_token_accuracy": 0.5655984878540039, + "num_tokens": 8411020306.0, + "step": 16453 + }, + { + "epoch": 4.449432125473229, + "grad_norm": 0.7812325954437256, + "learning_rate": 2.5668641329383175e-06, + "loss": 1.827, + "mean_token_accuracy": 0.5891909599304199, + "num_tokens": 8411496703.0, + "step": 16454 + }, + { + "epoch": 4.449702541914548, + "grad_norm": 0.7934725284576416, + "learning_rate": 2.5663136121973935e-06, + "loss": 1.8299, + "mean_token_accuracy": 0.5729631185531616, + "num_tokens": 8412020907.0, + "step": 16455 + }, + { + "epoch": 4.449972958355868, + "grad_norm": 0.7891346216201782, + "learning_rate": 2.5657633502267714e-06, + "loss": 1.8032, + "mean_token_accuracy": 0.5849410891532898, + "num_tokens": 8412539036.0, + "step": 16456 + }, + { + "epoch": 4.4502433747971875, + "grad_norm": 0.8359742164611816, + "learning_rate": 2.5652133470433315e-06, + "loss": 1.7388, + "mean_token_accuracy": 0.5998271703720093, + "num_tokens": 8413063199.0, + "step": 16457 + }, + { + "epoch": 4.4505137912385075, + "grad_norm": 0.7601435780525208, + "learning_rate": 2.5646636026639536e-06, + "loss": 1.9292, + "mean_token_accuracy": 0.5576177835464478, + "num_tokens": 8413546720.0, + "step": 16458 + }, + { + "epoch": 4.450784207679827, + "grad_norm": 0.8732327818870544, + "learning_rate": 2.564114117105502e-06, + "loss": 1.7884, + "mean_token_accuracy": 0.612871527671814, + "num_tokens": 8413960079.0, + "step": 16459 + }, + { + "epoch": 4.451054624121147, + "grad_norm": 0.7746036648750305, + "learning_rate": 2.5635648903848365e-06, + "loss": 1.8052, + "mean_token_accuracy": 0.5838584899902344, + "num_tokens": 8414457208.0, + "step": 16460 + }, + { + "epoch": 4.451325040562466, + "grad_norm": 0.31996288895606995, + "learning_rate": 2.563015922518812e-06, + "loss": 1.036, + "mean_token_accuracy": 0.7239100337028503, + "num_tokens": 8414981434.0, + "step": 16461 + }, + { + "epoch": 4.451595457003786, + "grad_norm": 0.8634374737739563, + "learning_rate": 2.56246721352427e-06, + "loss": 1.7772, + "mean_token_accuracy": 0.5638400316238403, + "num_tokens": 8415494174.0, + "step": 16462 + }, + { + "epoch": 4.451865873445105, + "grad_norm": 0.9175437092781067, + "learning_rate": 2.561918763418046e-06, + "loss": 1.8622, + "mean_token_accuracy": 0.5588691234588623, + "num_tokens": 8416018300.0, + "step": 16463 + }, + { + "epoch": 4.452136289886425, + "grad_norm": 0.9186397790908813, + "learning_rate": 2.5613705722169706e-06, + "loss": 1.7513, + "mean_token_accuracy": 0.5898832082748413, + "num_tokens": 8416542574.0, + "step": 16464 + }, + { + "epoch": 4.452406706327745, + "grad_norm": 0.9516413807868958, + "learning_rate": 2.5608226399378626e-06, + "loss": 1.9096, + "mean_token_accuracy": 0.5729427337646484, + "num_tokens": 8416999121.0, + "step": 16465 + }, + { + "epoch": 4.452677122769065, + "grad_norm": 0.8112813830375671, + "learning_rate": 2.5602749665975317e-06, + "loss": 1.7485, + "mean_token_accuracy": 0.6106305122375488, + "num_tokens": 8417465947.0, + "step": 16466 + }, + { + "epoch": 4.452947539210384, + "grad_norm": 0.8074321150779724, + "learning_rate": 2.5597275522127863e-06, + "loss": 1.7755, + "mean_token_accuracy": 0.5856974124908447, + "num_tokens": 8417990191.0, + "step": 16467 + }, + { + "epoch": 4.453217955651704, + "grad_norm": 0.9533770084381104, + "learning_rate": 2.5591803968004203e-06, + "loss": 1.7639, + "mean_token_accuracy": 0.5915886163711548, + "num_tokens": 8418475015.0, + "step": 16468 + }, + { + "epoch": 4.453488372093023, + "grad_norm": 0.882024347782135, + "learning_rate": 2.558633500377221e-06, + "loss": 1.7068, + "mean_token_accuracy": 0.5934687852859497, + "num_tokens": 8418999053.0, + "step": 16469 + }, + { + "epoch": 4.453758788534343, + "grad_norm": 0.8571767807006836, + "learning_rate": 2.5580868629599726e-06, + "loss": 1.7194, + "mean_token_accuracy": 0.6017330884933472, + "num_tokens": 8419523210.0, + "step": 16470 + }, + { + "epoch": 4.4540292049756625, + "grad_norm": 0.8447673916816711, + "learning_rate": 2.557540484565443e-06, + "loss": 1.7801, + "mean_token_accuracy": 0.5629507899284363, + "num_tokens": 8420047491.0, + "step": 16471 + }, + { + "epoch": 4.4542996214169825, + "grad_norm": 0.8291149139404297, + "learning_rate": 2.556994365210402e-06, + "loss": 1.8169, + "mean_token_accuracy": 0.5821697115898132, + "num_tokens": 8420571775.0, + "step": 16472 + }, + { + "epoch": 4.454570037858302, + "grad_norm": 0.8719257712364197, + "learning_rate": 2.5564485049116027e-06, + "loss": 1.7686, + "mean_token_accuracy": 0.5832393765449524, + "num_tokens": 8421095899.0, + "step": 16473 + }, + { + "epoch": 4.454840454299622, + "grad_norm": 1.0591458082199097, + "learning_rate": 2.5559029036857918e-06, + "loss": 1.7395, + "mean_token_accuracy": 0.5885810852050781, + "num_tokens": 8421579185.0, + "step": 16474 + }, + { + "epoch": 4.455110870740941, + "grad_norm": 0.8797118663787842, + "learning_rate": 2.555357561549715e-06, + "loss": 1.8079, + "mean_token_accuracy": 0.5694652795791626, + "num_tokens": 8422103216.0, + "step": 16475 + }, + { + "epoch": 4.455381287182261, + "grad_norm": 0.89206862449646, + "learning_rate": 2.5548124785201013e-06, + "loss": 1.8281, + "mean_token_accuracy": 0.5797224640846252, + "num_tokens": 8422627410.0, + "step": 16476 + }, + { + "epoch": 4.45565170362358, + "grad_norm": 0.9906453490257263, + "learning_rate": 2.5542676546136754e-06, + "loss": 1.6958, + "mean_token_accuracy": 0.6031159162521362, + "num_tokens": 8423126680.0, + "step": 16477 + }, + { + "epoch": 4.4559221200648995, + "grad_norm": 0.7637538909912109, + "learning_rate": 2.553723089847156e-06, + "loss": 1.7861, + "mean_token_accuracy": 0.5935596227645874, + "num_tokens": 8423650893.0, + "step": 16478 + }, + { + "epoch": 4.45619253650622, + "grad_norm": 0.8456180095672607, + "learning_rate": 2.553178784237252e-06, + "loss": 1.8446, + "mean_token_accuracy": 0.5783163905143738, + "num_tokens": 8424175163.0, + "step": 16479 + }, + { + "epoch": 4.456462952947539, + "grad_norm": 0.9634764790534973, + "learning_rate": 2.5526347378006617e-06, + "loss": 1.8148, + "mean_token_accuracy": 0.5675945281982422, + "num_tokens": 8424699346.0, + "step": 16480 + }, + { + "epoch": 4.456733369388859, + "grad_norm": 0.35498127341270447, + "learning_rate": 2.5520909505540803e-06, + "loss": 1.0979, + "mean_token_accuracy": 0.7074601650238037, + "num_tokens": 8425215766.0, + "step": 16481 + }, + { + "epoch": 4.457003785830178, + "grad_norm": 0.8730393052101135, + "learning_rate": 2.5515474225141917e-06, + "loss": 1.7204, + "mean_token_accuracy": 0.5881451368331909, + "num_tokens": 8425739933.0, + "step": 16482 + }, + { + "epoch": 4.457274202271498, + "grad_norm": 0.9888526797294617, + "learning_rate": 2.5510041536976726e-06, + "loss": 1.8131, + "mean_token_accuracy": 0.5786809921264648, + "num_tokens": 8426264217.0, + "step": 16483 + }, + { + "epoch": 4.457544618712817, + "grad_norm": 0.93958979845047, + "learning_rate": 2.5504611441211952e-06, + "loss": 1.8613, + "mean_token_accuracy": 0.5686637163162231, + "num_tokens": 8426788407.0, + "step": 16484 + }, + { + "epoch": 4.4578150351541375, + "grad_norm": 0.9881579875946045, + "learning_rate": 2.5499183938014177e-06, + "loss": 1.8227, + "mean_token_accuracy": 0.5859715938568115, + "num_tokens": 8427261262.0, + "step": 16485 + }, + { + "epoch": 4.458085451595457, + "grad_norm": 1.8601164817810059, + "learning_rate": 2.5493759027549932e-06, + "loss": 1.6492, + "mean_token_accuracy": 0.6206099987030029, + "num_tokens": 8427785547.0, + "step": 16486 + }, + { + "epoch": 4.458355868036777, + "grad_norm": 0.9558214545249939, + "learning_rate": 2.548833670998569e-06, + "loss": 1.7605, + "mean_token_accuracy": 0.5708584785461426, + "num_tokens": 8428309735.0, + "step": 16487 + }, + { + "epoch": 4.458626284478096, + "grad_norm": 1.0860849618911743, + "learning_rate": 2.548291698548781e-06, + "loss": 1.8705, + "mean_token_accuracy": 0.5669407844543457, + "num_tokens": 8428833973.0, + "step": 16488 + }, + { + "epoch": 4.458896700919416, + "grad_norm": 0.9564658999443054, + "learning_rate": 2.5477499854222566e-06, + "loss": 1.7603, + "mean_token_accuracy": 0.5655548572540283, + "num_tokens": 8429358033.0, + "step": 16489 + }, + { + "epoch": 4.459167117360735, + "grad_norm": 0.8989684581756592, + "learning_rate": 2.547208531635621e-06, + "loss": 1.8786, + "mean_token_accuracy": 0.560301661491394, + "num_tokens": 8429882226.0, + "step": 16490 + }, + { + "epoch": 4.459437533802055, + "grad_norm": 0.8918318748474121, + "learning_rate": 2.5466673372054856e-06, + "loss": 1.7495, + "mean_token_accuracy": 0.591060996055603, + "num_tokens": 8430330075.0, + "step": 16491 + }, + { + "epoch": 4.4597079502433745, + "grad_norm": 0.950849175453186, + "learning_rate": 2.5461264021484543e-06, + "loss": 1.8408, + "mean_token_accuracy": 0.563811182975769, + "num_tokens": 8430854322.0, + "step": 16492 + }, + { + "epoch": 4.459978366684695, + "grad_norm": 1.0718027353286743, + "learning_rate": 2.5455857264811286e-06, + "loss": 1.821, + "mean_token_accuracy": 0.5662226676940918, + "num_tokens": 8431335870.0, + "step": 16493 + }, + { + "epoch": 4.460248783126014, + "grad_norm": 0.959235668182373, + "learning_rate": 2.5450453102200935e-06, + "loss": 1.8037, + "mean_token_accuracy": 0.5668383240699768, + "num_tokens": 8431860001.0, + "step": 16494 + }, + { + "epoch": 4.460519199567334, + "grad_norm": 0.8657545447349548, + "learning_rate": 2.544505153381934e-06, + "loss": 1.7091, + "mean_token_accuracy": 0.5917718410491943, + "num_tokens": 8432357414.0, + "step": 16495 + }, + { + "epoch": 4.460789616008653, + "grad_norm": 0.9351508617401123, + "learning_rate": 2.5439652559832206e-06, + "loss": 1.8211, + "mean_token_accuracy": 0.5898730754852295, + "num_tokens": 8432881687.0, + "step": 16496 + }, + { + "epoch": 4.461060032449973, + "grad_norm": 0.9689204692840576, + "learning_rate": 2.543425618040523e-06, + "loss": 1.8463, + "mean_token_accuracy": 0.5544904470443726, + "num_tokens": 8433387475.0, + "step": 16497 + }, + { + "epoch": 4.461330448891292, + "grad_norm": 0.9283255934715271, + "learning_rate": 2.542886239570394e-06, + "loss": 1.7558, + "mean_token_accuracy": 0.5865340828895569, + "num_tokens": 8433872099.0, + "step": 16498 + }, + { + "epoch": 4.4616008653326125, + "grad_norm": 0.8725805282592773, + "learning_rate": 2.5423471205893884e-06, + "loss": 1.7728, + "mean_token_accuracy": 0.5978615283966064, + "num_tokens": 8434396360.0, + "step": 16499 + }, + { + "epoch": 4.461871281773932, + "grad_norm": 1.327718734741211, + "learning_rate": 2.5418082611140434e-06, + "loss": 1.7801, + "mean_token_accuracy": 0.5748531818389893, + "num_tokens": 8434909668.0, + "step": 16500 + }, + { + "epoch": 4.462141698215252, + "grad_norm": 0.3674340546131134, + "learning_rate": 2.5412696611608974e-06, + "loss": 1.1066, + "mean_token_accuracy": 0.7044486403465271, + "num_tokens": 8435431877.0, + "step": 16501 + }, + { + "epoch": 4.462412114656571, + "grad_norm": 0.9332149624824524, + "learning_rate": 2.540731320746473e-06, + "loss": 1.8879, + "mean_token_accuracy": 0.5561801195144653, + "num_tokens": 8435945381.0, + "step": 16502 + }, + { + "epoch": 4.462682531097891, + "grad_norm": 0.9348810315132141, + "learning_rate": 2.5401932398872865e-06, + "loss": 1.7558, + "mean_token_accuracy": 0.5934357047080994, + "num_tokens": 8436428876.0, + "step": 16503 + }, + { + "epoch": 4.46295294753921, + "grad_norm": 1.0955114364624023, + "learning_rate": 2.5396554185998525e-06, + "loss": 1.9121, + "mean_token_accuracy": 0.5752661228179932, + "num_tokens": 8436941728.0, + "step": 16504 + }, + { + "epoch": 4.46322336398053, + "grad_norm": 0.7998419404029846, + "learning_rate": 2.5391178569006704e-06, + "loss": 1.8102, + "mean_token_accuracy": 0.5904219746589661, + "num_tokens": 8437408387.0, + "step": 16505 + }, + { + "epoch": 4.4634937804218495, + "grad_norm": 1.018610954284668, + "learning_rate": 2.5385805548062328e-06, + "loss": 1.8897, + "mean_token_accuracy": 0.5632874965667725, + "num_tokens": 8437932626.0, + "step": 16506 + }, + { + "epoch": 4.46376419686317, + "grad_norm": 0.9060342907905579, + "learning_rate": 2.5380435123330283e-06, + "loss": 1.9439, + "mean_token_accuracy": 0.5613272786140442, + "num_tokens": 8438456909.0, + "step": 16507 + }, + { + "epoch": 4.464034613304489, + "grad_norm": 0.9790240526199341, + "learning_rate": 2.5375067294975336e-06, + "loss": 1.7321, + "mean_token_accuracy": 0.5965242981910706, + "num_tokens": 8438981123.0, + "step": 16508 + }, + { + "epoch": 4.464305029745809, + "grad_norm": 0.9981247186660767, + "learning_rate": 2.5369702063162187e-06, + "loss": 1.7542, + "mean_token_accuracy": 0.5966551303863525, + "num_tokens": 8439505349.0, + "step": 16509 + }, + { + "epoch": 4.464575446187128, + "grad_norm": 0.8618406057357788, + "learning_rate": 2.5364339428055458e-06, + "loss": 1.7366, + "mean_token_accuracy": 0.5825695991516113, + "num_tokens": 8440029607.0, + "step": 16510 + }, + { + "epoch": 4.464845862628448, + "grad_norm": 0.8394141793251038, + "learning_rate": 2.53589793898197e-06, + "loss": 1.835, + "mean_token_accuracy": 0.5643224716186523, + "num_tokens": 8440553846.0, + "step": 16511 + }, + { + "epoch": 4.465116279069767, + "grad_norm": 0.7767936587333679, + "learning_rate": 2.5353621948619354e-06, + "loss": 1.8238, + "mean_token_accuracy": 0.5610179901123047, + "num_tokens": 8441078042.0, + "step": 16512 + }, + { + "epoch": 4.4653866955110875, + "grad_norm": 0.8534278273582458, + "learning_rate": 2.534826710461883e-06, + "loss": 1.841, + "mean_token_accuracy": 0.5704888105392456, + "num_tokens": 8441602232.0, + "step": 16513 + }, + { + "epoch": 4.465657111952407, + "grad_norm": 0.9390314221382141, + "learning_rate": 2.534291485798242e-06, + "loss": 1.6731, + "mean_token_accuracy": 0.6330063939094543, + "num_tokens": 8442126512.0, + "step": 16514 + }, + { + "epoch": 4.465927528393727, + "grad_norm": 1.4595770835876465, + "learning_rate": 2.533756520887432e-06, + "loss": 1.6727, + "mean_token_accuracy": 0.6412289142608643, + "num_tokens": 8442650758.0, + "step": 16515 + }, + { + "epoch": 4.466197944835046, + "grad_norm": 1.0792748928070068, + "learning_rate": 2.533221815745872e-06, + "loss": 1.8116, + "mean_token_accuracy": 0.5942432284355164, + "num_tokens": 8443174972.0, + "step": 16516 + }, + { + "epoch": 4.466468361276366, + "grad_norm": 1.1168452501296997, + "learning_rate": 2.532687370389964e-06, + "loss": 1.9608, + "mean_token_accuracy": 0.5658892393112183, + "num_tokens": 8443640156.0, + "step": 16517 + }, + { + "epoch": 4.466738777717685, + "grad_norm": 0.8890377283096313, + "learning_rate": 2.5321531848361093e-06, + "loss": 1.7796, + "mean_token_accuracy": 0.5811461806297302, + "num_tokens": 8444164335.0, + "step": 16518 + }, + { + "epoch": 4.467009194159004, + "grad_norm": 0.8752557039260864, + "learning_rate": 2.5316192591006975e-06, + "loss": 1.7475, + "mean_token_accuracy": 0.592797040939331, + "num_tokens": 8444655990.0, + "step": 16519 + }, + { + "epoch": 4.4672796106003245, + "grad_norm": 1.0305145978927612, + "learning_rate": 2.5310855932001094e-06, + "loss": 1.9395, + "mean_token_accuracy": 0.5847750902175903, + "num_tokens": 8445097624.0, + "step": 16520 + }, + { + "epoch": 4.467550027041644, + "grad_norm": 0.3448253273963928, + "learning_rate": 2.530552187150722e-06, + "loss": 1.1525, + "mean_token_accuracy": 0.6788937449455261, + "num_tokens": 8445621860.0, + "step": 16521 + }, + { + "epoch": 4.467820443482964, + "grad_norm": 1.1668611764907837, + "learning_rate": 2.5300190409689006e-06, + "loss": 1.9187, + "mean_token_accuracy": 0.5727680325508118, + "num_tokens": 8446116994.0, + "step": 16522 + }, + { + "epoch": 4.468090859924283, + "grad_norm": 1.0962283611297607, + "learning_rate": 2.5294861546710013e-06, + "loss": 1.8892, + "mean_token_accuracy": 0.576837420463562, + "num_tokens": 8446641254.0, + "step": 16523 + }, + { + "epoch": 4.468361276365603, + "grad_norm": 0.8322210907936096, + "learning_rate": 2.5289535282733797e-06, + "loss": 1.7659, + "mean_token_accuracy": 0.5843170881271362, + "num_tokens": 8447165383.0, + "step": 16524 + }, + { + "epoch": 4.468631692806922, + "grad_norm": 0.860548734664917, + "learning_rate": 2.528421161792374e-06, + "loss": 1.8345, + "mean_token_accuracy": 0.5784255862236023, + "num_tokens": 8447689606.0, + "step": 16525 + }, + { + "epoch": 4.468902109248242, + "grad_norm": 0.9434882998466492, + "learning_rate": 2.5278890552443196e-06, + "loss": 1.8892, + "mean_token_accuracy": 0.5466687679290771, + "num_tokens": 8448208744.0, + "step": 16526 + }, + { + "epoch": 4.469172525689562, + "grad_norm": 1.092864990234375, + "learning_rate": 2.527357208645546e-06, + "loss": 1.8876, + "mean_token_accuracy": 0.5684694051742554, + "num_tokens": 8448706557.0, + "step": 16527 + }, + { + "epoch": 4.469442942130882, + "grad_norm": 0.9275088310241699, + "learning_rate": 2.526825622012369e-06, + "loss": 1.7317, + "mean_token_accuracy": 0.574263334274292, + "num_tokens": 8449230646.0, + "step": 16528 + }, + { + "epoch": 4.469713358572201, + "grad_norm": 0.9317117929458618, + "learning_rate": 2.5262942953611e-06, + "loss": 1.8173, + "mean_token_accuracy": 0.5843890905380249, + "num_tokens": 8449754918.0, + "step": 16529 + }, + { + "epoch": 4.469983775013521, + "grad_norm": 0.8313539624214172, + "learning_rate": 2.5257632287080412e-06, + "loss": 1.8265, + "mean_token_accuracy": 0.5857232809066772, + "num_tokens": 8450255319.0, + "step": 16530 + }, + { + "epoch": 4.47025419145484, + "grad_norm": 0.7059081792831421, + "learning_rate": 2.525232422069488e-06, + "loss": 1.5077, + "mean_token_accuracy": 0.6415964365005493, + "num_tokens": 8450756234.0, + "step": 16531 + }, + { + "epoch": 4.47052460789616, + "grad_norm": 0.8942802548408508, + "learning_rate": 2.524701875461725e-06, + "loss": 1.7707, + "mean_token_accuracy": 0.5963256359100342, + "num_tokens": 8451250945.0, + "step": 16532 + }, + { + "epoch": 4.470795024337479, + "grad_norm": 0.8692130446434021, + "learning_rate": 2.524171588901036e-06, + "loss": 1.9221, + "mean_token_accuracy": 0.5543407797813416, + "num_tokens": 8451726818.0, + "step": 16533 + }, + { + "epoch": 4.4710654407787995, + "grad_norm": 0.9176825881004333, + "learning_rate": 2.523641562403687e-06, + "loss": 1.8425, + "mean_token_accuracy": 0.5734366178512573, + "num_tokens": 8452227467.0, + "step": 16534 + }, + { + "epoch": 4.471335857220119, + "grad_norm": 0.8608222007751465, + "learning_rate": 2.523111795985941e-06, + "loss": 1.7661, + "mean_token_accuracy": 0.5860471725463867, + "num_tokens": 8452751573.0, + "step": 16535 + }, + { + "epoch": 4.471606273661439, + "grad_norm": 0.7815428376197815, + "learning_rate": 2.522582289664055e-06, + "loss": 1.8328, + "mean_token_accuracy": 0.569848358631134, + "num_tokens": 8453275823.0, + "step": 16536 + }, + { + "epoch": 4.471876690102758, + "grad_norm": 0.8058736324310303, + "learning_rate": 2.522053043454274e-06, + "loss": 1.8317, + "mean_token_accuracy": 0.5764864683151245, + "num_tokens": 8453789568.0, + "step": 16537 + }, + { + "epoch": 4.472147106544078, + "grad_norm": 0.7449316382408142, + "learning_rate": 2.5215240573728383e-06, + "loss": 1.7624, + "mean_token_accuracy": 0.588080883026123, + "num_tokens": 8454313671.0, + "step": 16538 + }, + { + "epoch": 4.472417522985397, + "grad_norm": 0.8496463298797607, + "learning_rate": 2.5209953314359782e-06, + "loss": 1.7979, + "mean_token_accuracy": 0.5992204546928406, + "num_tokens": 8454815732.0, + "step": 16539 + }, + { + "epoch": 4.472687939426717, + "grad_norm": 0.8981430530548096, + "learning_rate": 2.520466865659914e-06, + "loss": 1.7967, + "mean_token_accuracy": 0.5761430859565735, + "num_tokens": 8455312916.0, + "step": 16540 + }, + { + "epoch": 4.472958355868037, + "grad_norm": 0.3306539058685303, + "learning_rate": 2.519938660060867e-06, + "loss": 1.1732, + "mean_token_accuracy": 0.6891268491744995, + "num_tokens": 8455811669.0, + "step": 16541 + }, + { + "epoch": 4.473228772309357, + "grad_norm": 0.9382945895195007, + "learning_rate": 2.519410714655039e-06, + "loss": 1.8121, + "mean_token_accuracy": 0.5819212198257446, + "num_tokens": 8456335868.0, + "step": 16542 + }, + { + "epoch": 4.473499188750676, + "grad_norm": 0.9124078750610352, + "learning_rate": 2.518883029458628e-06, + "loss": 1.7507, + "mean_token_accuracy": 0.5809460282325745, + "num_tokens": 8456859975.0, + "step": 16543 + }, + { + "epoch": 4.473769605191996, + "grad_norm": 0.8239511251449585, + "learning_rate": 2.5183556044878298e-06, + "loss": 1.8005, + "mean_token_accuracy": 0.5853759050369263, + "num_tokens": 8457384131.0, + "step": 16544 + }, + { + "epoch": 4.474040021633315, + "grad_norm": 0.7660414576530457, + "learning_rate": 2.517828439758823e-06, + "loss": 1.8143, + "mean_token_accuracy": 0.5760228633880615, + "num_tokens": 8457908251.0, + "step": 16545 + }, + { + "epoch": 4.474310438074635, + "grad_norm": 0.7773104310035706, + "learning_rate": 2.517301535287784e-06, + "loss": 1.7876, + "mean_token_accuracy": 0.575535237789154, + "num_tokens": 8458411064.0, + "step": 16546 + }, + { + "epoch": 4.474580854515954, + "grad_norm": 0.9078723192214966, + "learning_rate": 2.5167748910908806e-06, + "loss": 1.8415, + "mean_token_accuracy": 0.58347487449646, + "num_tokens": 8458935340.0, + "step": 16547 + }, + { + "epoch": 4.4748512709572745, + "grad_norm": 1.0707532167434692, + "learning_rate": 2.5162485071842716e-06, + "loss": 1.871, + "mean_token_accuracy": 0.5587284564971924, + "num_tokens": 8459459476.0, + "step": 16548 + }, + { + "epoch": 4.475121687398594, + "grad_norm": 0.8420475721359253, + "learning_rate": 2.5157223835841053e-06, + "loss": 1.8471, + "mean_token_accuracy": 0.5757670402526855, + "num_tokens": 8459952479.0, + "step": 16549 + }, + { + "epoch": 4.475392103839914, + "grad_norm": 1.0360243320465088, + "learning_rate": 2.51519652030653e-06, + "loss": 1.8496, + "mean_token_accuracy": 0.5962485074996948, + "num_tokens": 8460450188.0, + "step": 16550 + }, + { + "epoch": 4.475662520281233, + "grad_norm": 1.0474622249603271, + "learning_rate": 2.5146709173676755e-06, + "loss": 1.8983, + "mean_token_accuracy": 0.5876948833465576, + "num_tokens": 8460876248.0, + "step": 16551 + }, + { + "epoch": 4.475932936722553, + "grad_norm": 1.0001113414764404, + "learning_rate": 2.514145574783671e-06, + "loss": 1.7656, + "mean_token_accuracy": 0.5732457637786865, + "num_tokens": 8461400489.0, + "step": 16552 + }, + { + "epoch": 4.476203353163872, + "grad_norm": 0.883101224899292, + "learning_rate": 2.5136204925706353e-06, + "loss": 1.8152, + "mean_token_accuracy": 0.5848375558853149, + "num_tokens": 8461870296.0, + "step": 16553 + }, + { + "epoch": 4.476473769605192, + "grad_norm": 0.9879742860794067, + "learning_rate": 2.5130956707446803e-06, + "loss": 1.841, + "mean_token_accuracy": 0.5784138441085815, + "num_tokens": 8462379600.0, + "step": 16554 + }, + { + "epoch": 4.476744186046512, + "grad_norm": 0.831305205821991, + "learning_rate": 2.5125711093219085e-06, + "loss": 1.6825, + "mean_token_accuracy": 0.6269891858100891, + "num_tokens": 8462855121.0, + "step": 16555 + }, + { + "epoch": 4.477014602487832, + "grad_norm": 0.8877335786819458, + "learning_rate": 2.5120468083184156e-06, + "loss": 1.7922, + "mean_token_accuracy": 0.5819834470748901, + "num_tokens": 8463379399.0, + "step": 16556 + }, + { + "epoch": 4.477285018929151, + "grad_norm": 0.8355042338371277, + "learning_rate": 2.5115227677502867e-06, + "loss": 1.818, + "mean_token_accuracy": 0.5728743672370911, + "num_tokens": 8463903571.0, + "step": 16557 + }, + { + "epoch": 4.477555435370471, + "grad_norm": 1.0043145418167114, + "learning_rate": 2.5109989876336027e-06, + "loss": 1.8065, + "mean_token_accuracy": 0.5807052254676819, + "num_tokens": 8464420101.0, + "step": 16558 + }, + { + "epoch": 4.47782585181179, + "grad_norm": 0.8791120648384094, + "learning_rate": 2.5104754679844357e-06, + "loss": 1.7008, + "mean_token_accuracy": 0.6007957458496094, + "num_tokens": 8464944379.0, + "step": 16559 + }, + { + "epoch": 4.478096268253109, + "grad_norm": 0.8835245370864868, + "learning_rate": 2.5099522088188466e-06, + "loss": 1.8929, + "mean_token_accuracy": 0.5541541576385498, + "num_tokens": 8465468658.0, + "step": 16560 + }, + { + "epoch": 4.4783666846944294, + "grad_norm": 0.3594348430633545, + "learning_rate": 2.5094292101528906e-06, + "loss": 0.982, + "mean_token_accuracy": 0.727637767791748, + "num_tokens": 8465981226.0, + "step": 16561 + }, + { + "epoch": 4.478637101135749, + "grad_norm": 1.0925705432891846, + "learning_rate": 2.5089064720026156e-06, + "loss": 1.8111, + "mean_token_accuracy": 0.5812387466430664, + "num_tokens": 8466505499.0, + "step": 16562 + }, + { + "epoch": 4.478907517577069, + "grad_norm": 1.2372465133666992, + "learning_rate": 2.508383994384061e-06, + "loss": 1.8372, + "mean_token_accuracy": 0.5768581628799438, + "num_tokens": 8467029735.0, + "step": 16563 + }, + { + "epoch": 4.479177934018388, + "grad_norm": 0.9045315384864807, + "learning_rate": 2.5078617773132585e-06, + "loss": 1.7964, + "mean_token_accuracy": 0.591327428817749, + "num_tokens": 8467553915.0, + "step": 16564 + }, + { + "epoch": 4.479448350459708, + "grad_norm": 0.8575412034988403, + "learning_rate": 2.50733982080623e-06, + "loss": 1.9585, + "mean_token_accuracy": 0.5698947310447693, + "num_tokens": 8468024389.0, + "step": 16565 + }, + { + "epoch": 4.479718766901027, + "grad_norm": 0.8129515051841736, + "learning_rate": 2.5068181248789898e-06, + "loss": 1.8325, + "mean_token_accuracy": 0.5820307731628418, + "num_tokens": 8468534325.0, + "step": 16566 + }, + { + "epoch": 4.479989183342347, + "grad_norm": 0.8834531307220459, + "learning_rate": 2.5062966895475477e-06, + "loss": 1.7474, + "mean_token_accuracy": 0.58192378282547, + "num_tokens": 8469058449.0, + "step": 16567 + }, + { + "epoch": 4.4802595997836665, + "grad_norm": 0.8443557024002075, + "learning_rate": 2.5057755148279007e-06, + "loss": 1.7272, + "mean_token_accuracy": 0.5886517763137817, + "num_tokens": 8469582676.0, + "step": 16568 + }, + { + "epoch": 4.480530016224987, + "grad_norm": 0.8868511915206909, + "learning_rate": 2.505254600736041e-06, + "loss": 1.8489, + "mean_token_accuracy": 0.5737859010696411, + "num_tokens": 8470101578.0, + "step": 16569 + }, + { + "epoch": 4.480800432666306, + "grad_norm": 0.779953122138977, + "learning_rate": 2.504733947287952e-06, + "loss": 1.8241, + "mean_token_accuracy": 0.5819956064224243, + "num_tokens": 8470625833.0, + "step": 16570 + }, + { + "epoch": 4.481070849107626, + "grad_norm": 0.9321774840354919, + "learning_rate": 2.5042135544996088e-06, + "loss": 1.9541, + "mean_token_accuracy": 0.5646510124206543, + "num_tokens": 8471150093.0, + "step": 16571 + }, + { + "epoch": 4.481341265548945, + "grad_norm": 0.934360146522522, + "learning_rate": 2.5036934223869762e-06, + "loss": 1.7726, + "mean_token_accuracy": 0.5782805681228638, + "num_tokens": 8471674312.0, + "step": 16572 + }, + { + "epoch": 4.481611681990265, + "grad_norm": 0.7847343683242798, + "learning_rate": 2.5031735509660186e-06, + "loss": 1.8178, + "mean_token_accuracy": 0.578718364238739, + "num_tokens": 8472198456.0, + "step": 16573 + }, + { + "epoch": 4.481882098431584, + "grad_norm": 0.8555360436439514, + "learning_rate": 2.502653940252682e-06, + "loss": 1.8419, + "mean_token_accuracy": 0.5937121510505676, + "num_tokens": 8472674084.0, + "step": 16574 + }, + { + "epoch": 4.4821525148729044, + "grad_norm": 0.7978105545043945, + "learning_rate": 2.5021345902629115e-06, + "loss": 1.7845, + "mean_token_accuracy": 0.5953468084335327, + "num_tokens": 8473198329.0, + "step": 16575 + }, + { + "epoch": 4.482422931314224, + "grad_norm": 1.030866026878357, + "learning_rate": 2.5016155010126438e-06, + "loss": 1.6759, + "mean_token_accuracy": 0.6281394362449646, + "num_tokens": 8473657952.0, + "step": 16576 + }, + { + "epoch": 4.482693347755544, + "grad_norm": 1.068757176399231, + "learning_rate": 2.501096672517804e-06, + "loss": 1.8255, + "mean_token_accuracy": 0.5730005502700806, + "num_tokens": 8474182162.0, + "step": 16577 + }, + { + "epoch": 4.482963764196863, + "grad_norm": 0.780287504196167, + "learning_rate": 2.5005781047943102e-06, + "loss": 1.8055, + "mean_token_accuracy": 0.5904563665390015, + "num_tokens": 8474706369.0, + "step": 16578 + }, + { + "epoch": 4.483234180638183, + "grad_norm": 0.8033947348594666, + "learning_rate": 2.5000597978580776e-06, + "loss": 1.8232, + "mean_token_accuracy": 0.5749342441558838, + "num_tokens": 8475230507.0, + "step": 16579 + }, + { + "epoch": 4.483504597079502, + "grad_norm": 0.745956301689148, + "learning_rate": 2.4995417517250055e-06, + "loss": 1.7266, + "mean_token_accuracy": 0.5971724987030029, + "num_tokens": 8475754770.0, + "step": 16580 + }, + { + "epoch": 4.483775013520822, + "grad_norm": 0.3263762295246124, + "learning_rate": 2.499023966410992e-06, + "loss": 1.0859, + "mean_token_accuracy": 0.7143372297286987, + "num_tokens": 8476278979.0, + "step": 16581 + }, + { + "epoch": 4.4840454299621415, + "grad_norm": 1.0528035163879395, + "learning_rate": 2.4985064419319203e-06, + "loss": 1.8908, + "mean_token_accuracy": 0.564454197883606, + "num_tokens": 8476803074.0, + "step": 16582 + }, + { + "epoch": 4.484315846403462, + "grad_norm": 0.9539501667022705, + "learning_rate": 2.4979891783036753e-06, + "loss": 1.8384, + "mean_token_accuracy": 0.5609049797058105, + "num_tokens": 8477327282.0, + "step": 16583 + }, + { + "epoch": 4.484586262844781, + "grad_norm": 0.9488995671272278, + "learning_rate": 2.4974721755421233e-06, + "loss": 1.7599, + "mean_token_accuracy": 0.5808975696563721, + "num_tokens": 8477799970.0, + "step": 16584 + }, + { + "epoch": 4.484856679286101, + "grad_norm": 0.9268834590911865, + "learning_rate": 2.496955433663131e-06, + "loss": 1.8803, + "mean_token_accuracy": 0.5785151720046997, + "num_tokens": 8478324243.0, + "step": 16585 + }, + { + "epoch": 4.48512709572742, + "grad_norm": 0.9314447641372681, + "learning_rate": 2.4964389526825495e-06, + "loss": 1.7593, + "mean_token_accuracy": 0.5846657752990723, + "num_tokens": 8478848421.0, + "step": 16586 + }, + { + "epoch": 4.48539751216874, + "grad_norm": 0.920889675617218, + "learning_rate": 2.49592273261623e-06, + "loss": 1.7934, + "mean_token_accuracy": 0.5853490233421326, + "num_tokens": 8479372572.0, + "step": 16587 + }, + { + "epoch": 4.485667928610059, + "grad_norm": 0.932478129863739, + "learning_rate": 2.4954067734800104e-06, + "loss": 1.854, + "mean_token_accuracy": 0.5797873735427856, + "num_tokens": 8479847353.0, + "step": 16588 + }, + { + "epoch": 4.4859383450513795, + "grad_norm": 0.7797752022743225, + "learning_rate": 2.494891075289719e-06, + "loss": 1.9049, + "mean_token_accuracy": 0.5517456531524658, + "num_tokens": 8480371629.0, + "step": 16589 + }, + { + "epoch": 4.486208761492699, + "grad_norm": 0.8346155881881714, + "learning_rate": 2.4943756380611834e-06, + "loss": 1.8467, + "mean_token_accuracy": 0.5801418423652649, + "num_tokens": 8480895722.0, + "step": 16590 + }, + { + "epoch": 4.486479177934019, + "grad_norm": 0.8764788508415222, + "learning_rate": 2.4938604618102175e-06, + "loss": 1.7902, + "mean_token_accuracy": 0.5918271541595459, + "num_tokens": 8481382124.0, + "step": 16591 + }, + { + "epoch": 4.486749594375338, + "grad_norm": 0.8136436939239502, + "learning_rate": 2.4933455465526252e-06, + "loss": 1.8012, + "mean_token_accuracy": 0.5700440406799316, + "num_tokens": 8481906265.0, + "step": 16592 + }, + { + "epoch": 4.487020010816658, + "grad_norm": 0.9294732809066772, + "learning_rate": 2.4928308923042093e-06, + "loss": 1.8345, + "mean_token_accuracy": 0.5901408791542053, + "num_tokens": 8482412464.0, + "step": 16593 + }, + { + "epoch": 4.487290427257977, + "grad_norm": 0.8947635293006897, + "learning_rate": 2.4923164990807596e-06, + "loss": 1.7718, + "mean_token_accuracy": 0.5862816572189331, + "num_tokens": 8482896810.0, + "step": 16594 + }, + { + "epoch": 4.487560843699297, + "grad_norm": 0.8519256711006165, + "learning_rate": 2.4918023668980573e-06, + "loss": 1.7503, + "mean_token_accuracy": 0.5838989019393921, + "num_tokens": 8483420935.0, + "step": 16595 + }, + { + "epoch": 4.4878312601406165, + "grad_norm": 0.9050683379173279, + "learning_rate": 2.4912884957718804e-06, + "loss": 1.7828, + "mean_token_accuracy": 0.5940735340118408, + "num_tokens": 8483902060.0, + "step": 16596 + }, + { + "epoch": 4.488101676581937, + "grad_norm": 0.9719569683074951, + "learning_rate": 2.4907748857179953e-06, + "loss": 1.6994, + "mean_token_accuracy": 0.5768651962280273, + "num_tokens": 8484426095.0, + "step": 16597 + }, + { + "epoch": 4.488372093023256, + "grad_norm": 0.7696815729141235, + "learning_rate": 2.49026153675216e-06, + "loss": 1.8022, + "mean_token_accuracy": 0.5755555033683777, + "num_tokens": 8484950278.0, + "step": 16598 + }, + { + "epoch": 4.488642509464576, + "grad_norm": 0.9210919141769409, + "learning_rate": 2.489748448890128e-06, + "loss": 1.7075, + "mean_token_accuracy": 0.5907467007637024, + "num_tokens": 8485474534.0, + "step": 16599 + }, + { + "epoch": 4.488912925905895, + "grad_norm": 0.9373559355735779, + "learning_rate": 2.489235622147639e-06, + "loss": 1.8577, + "mean_token_accuracy": 0.5675269365310669, + "num_tokens": 8485998638.0, + "step": 16600 + }, + { + "epoch": 4.489183342347214, + "grad_norm": 0.3377472460269928, + "learning_rate": 2.48872305654043e-06, + "loss": 1.0617, + "mean_token_accuracy": 0.7201060056686401, + "num_tokens": 8486522797.0, + "step": 16601 + }, + { + "epoch": 4.489453758788534, + "grad_norm": 1.0466997623443604, + "learning_rate": 2.488210752084228e-06, + "loss": 1.8882, + "mean_token_accuracy": 0.5711021423339844, + "num_tokens": 8487047040.0, + "step": 16602 + }, + { + "epoch": 4.489724175229854, + "grad_norm": 1.018867015838623, + "learning_rate": 2.4876987087947523e-06, + "loss": 1.8269, + "mean_token_accuracy": 0.587500810623169, + "num_tokens": 8487571232.0, + "step": 16603 + }, + { + "epoch": 4.489994591671174, + "grad_norm": 1.015952229499817, + "learning_rate": 2.487186926687712e-06, + "loss": 1.8944, + "mean_token_accuracy": 0.572721540927887, + "num_tokens": 8488095310.0, + "step": 16604 + }, + { + "epoch": 4.490265008112493, + "grad_norm": 0.8036741018295288, + "learning_rate": 2.486675405778813e-06, + "loss": 1.8109, + "mean_token_accuracy": 0.6083754301071167, + "num_tokens": 8488556398.0, + "step": 16605 + }, + { + "epoch": 4.490535424553813, + "grad_norm": 1.0462322235107422, + "learning_rate": 2.486164146083747e-06, + "loss": 1.9826, + "mean_token_accuracy": 0.5576878190040588, + "num_tokens": 8489042028.0, + "step": 16606 + }, + { + "epoch": 4.490805840995132, + "grad_norm": 1.041428804397583, + "learning_rate": 2.4856531476182025e-06, + "loss": 1.7791, + "mean_token_accuracy": 0.577701210975647, + "num_tokens": 8489512824.0, + "step": 16607 + }, + { + "epoch": 4.491076257436452, + "grad_norm": 1.0240904092788696, + "learning_rate": 2.4851424103978595e-06, + "loss": 1.9067, + "mean_token_accuracy": 0.5723938941955566, + "num_tokens": 8490037034.0, + "step": 16608 + }, + { + "epoch": 4.491346673877771, + "grad_norm": 1.1520389318466187, + "learning_rate": 2.484631934438386e-06, + "loss": 1.8271, + "mean_token_accuracy": 0.5774316787719727, + "num_tokens": 8490561222.0, + "step": 16609 + }, + { + "epoch": 4.4916170903190915, + "grad_norm": 0.8513318300247192, + "learning_rate": 2.4841217197554485e-06, + "loss": 1.7758, + "mean_token_accuracy": 0.5823927521705627, + "num_tokens": 8491085494.0, + "step": 16610 + }, + { + "epoch": 4.491887506760411, + "grad_norm": 0.8712244629859924, + "learning_rate": 2.4836117663647006e-06, + "loss": 1.8231, + "mean_token_accuracy": 0.5849788784980774, + "num_tokens": 8491579251.0, + "step": 16611 + }, + { + "epoch": 4.492157923201731, + "grad_norm": 0.9251440167427063, + "learning_rate": 2.4831020742817872e-06, + "loss": 1.6748, + "mean_token_accuracy": 0.5999050140380859, + "num_tokens": 8492103518.0, + "step": 16612 + }, + { + "epoch": 4.49242833964305, + "grad_norm": 0.9672847986221313, + "learning_rate": 2.4825926435223494e-06, + "loss": 1.8263, + "mean_token_accuracy": 0.5799223184585571, + "num_tokens": 8492606742.0, + "step": 16613 + }, + { + "epoch": 4.49269875608437, + "grad_norm": 0.944303035736084, + "learning_rate": 2.4820834741020182e-06, + "loss": 1.907, + "mean_token_accuracy": 0.5611864328384399, + "num_tokens": 8493130890.0, + "step": 16614 + }, + { + "epoch": 4.492969172525689, + "grad_norm": 0.927708089351654, + "learning_rate": 2.4815745660364147e-06, + "loss": 1.9106, + "mean_token_accuracy": 0.5489962100982666, + "num_tokens": 8493655060.0, + "step": 16615 + }, + { + "epoch": 4.493239588967009, + "grad_norm": 1.0380789041519165, + "learning_rate": 2.4810659193411556e-06, + "loss": 1.9019, + "mean_token_accuracy": 0.5648388266563416, + "num_tokens": 8494177162.0, + "step": 16616 + }, + { + "epoch": 4.493510005408329, + "grad_norm": 0.9107304811477661, + "learning_rate": 2.4805575340318462e-06, + "loss": 1.7733, + "mean_token_accuracy": 0.5783509612083435, + "num_tokens": 8494701405.0, + "step": 16617 + }, + { + "epoch": 4.493780421849649, + "grad_norm": 1.0129318237304688, + "learning_rate": 2.4800494101240847e-06, + "loss": 1.6464, + "mean_token_accuracy": 0.6134203672409058, + "num_tokens": 8495120197.0, + "step": 16618 + }, + { + "epoch": 4.494050838290968, + "grad_norm": 0.890129029750824, + "learning_rate": 2.479541547633464e-06, + "loss": 1.6875, + "mean_token_accuracy": 0.5972049236297607, + "num_tokens": 8495644392.0, + "step": 16619 + }, + { + "epoch": 4.494321254732288, + "grad_norm": 0.9941761493682861, + "learning_rate": 2.4790339465755652e-06, + "loss": 1.8688, + "mean_token_accuracy": 0.569053053855896, + "num_tokens": 8496168587.0, + "step": 16620 + }, + { + "epoch": 4.494591671173607, + "grad_norm": 0.34653836488723755, + "learning_rate": 2.478526606965963e-06, + "loss": 1.0813, + "mean_token_accuracy": 0.7147205471992493, + "num_tokens": 8496692774.0, + "step": 16621 + }, + { + "epoch": 4.494862087614927, + "grad_norm": 1.1842714548110962, + "learning_rate": 2.478019528820225e-06, + "loss": 1.6535, + "mean_token_accuracy": 0.6201242208480835, + "num_tokens": 8497217030.0, + "step": 16622 + }, + { + "epoch": 4.495132504056246, + "grad_norm": 1.2029184103012085, + "learning_rate": 2.477512712153909e-06, + "loss": 1.8025, + "mean_token_accuracy": 0.5690740346908569, + "num_tokens": 8497741178.0, + "step": 16623 + }, + { + "epoch": 4.4954029204975665, + "grad_norm": 1.2610822916030884, + "learning_rate": 2.4770061569825663e-06, + "loss": 1.8073, + "mean_token_accuracy": 0.5771377086639404, + "num_tokens": 8498265393.0, + "step": 16624 + }, + { + "epoch": 4.495673336938886, + "grad_norm": 0.9202259182929993, + "learning_rate": 2.4764998633217386e-06, + "loss": 1.7058, + "mean_token_accuracy": 0.581787109375, + "num_tokens": 8498789629.0, + "step": 16625 + }, + { + "epoch": 4.495943753380206, + "grad_norm": 0.8756876587867737, + "learning_rate": 2.475993831186963e-06, + "loss": 1.8456, + "mean_token_accuracy": 0.567193865776062, + "num_tokens": 8499313841.0, + "step": 16626 + }, + { + "epoch": 4.496214169821525, + "grad_norm": 0.9485048055648804, + "learning_rate": 2.4754880605937614e-06, + "loss": 1.8517, + "mean_token_accuracy": 0.5716874599456787, + "num_tokens": 8499834494.0, + "step": 16627 + }, + { + "epoch": 4.496484586262845, + "grad_norm": 1.1341552734375, + "learning_rate": 2.4749825515576568e-06, + "loss": 1.7515, + "mean_token_accuracy": 0.5985524654388428, + "num_tokens": 8500358734.0, + "step": 16628 + }, + { + "epoch": 4.496755002704164, + "grad_norm": 1.1791889667510986, + "learning_rate": 2.4744773040941588e-06, + "loss": 1.8959, + "mean_token_accuracy": 0.5679906010627747, + "num_tokens": 8500824697.0, + "step": 16629 + }, + { + "epoch": 4.497025419145484, + "grad_norm": 1.1711746454238892, + "learning_rate": 2.473972318218767e-06, + "loss": 1.8086, + "mean_token_accuracy": 0.5649558901786804, + "num_tokens": 8501297330.0, + "step": 16630 + }, + { + "epoch": 4.497295835586804, + "grad_norm": 0.8071866631507874, + "learning_rate": 2.473467593946979e-06, + "loss": 1.8562, + "mean_token_accuracy": 0.573442816734314, + "num_tokens": 8501821518.0, + "step": 16631 + }, + { + "epoch": 4.497566252028124, + "grad_norm": 0.965559184551239, + "learning_rate": 2.4729631312942792e-06, + "loss": 1.8272, + "mean_token_accuracy": 0.5774987936019897, + "num_tokens": 8502345782.0, + "step": 16632 + }, + { + "epoch": 4.497836668469443, + "grad_norm": 0.8589224219322205, + "learning_rate": 2.4724589302761475e-06, + "loss": 1.9121, + "mean_token_accuracy": 0.5578051805496216, + "num_tokens": 8502869955.0, + "step": 16633 + }, + { + "epoch": 4.498107084910763, + "grad_norm": 1.0474299192428589, + "learning_rate": 2.4719549909080547e-06, + "loss": 1.8299, + "mean_token_accuracy": 0.5683640837669373, + "num_tokens": 8503394156.0, + "step": 16634 + }, + { + "epoch": 4.498377501352082, + "grad_norm": 0.9832822680473328, + "learning_rate": 2.471451313205461e-06, + "loss": 1.7872, + "mean_token_accuracy": 0.5830336809158325, + "num_tokens": 8503918433.0, + "step": 16635 + }, + { + "epoch": 4.498647917793402, + "grad_norm": 0.8571568131446838, + "learning_rate": 2.4709478971838217e-06, + "loss": 1.8903, + "mean_token_accuracy": 0.5673902034759521, + "num_tokens": 8504396299.0, + "step": 16636 + }, + { + "epoch": 4.498918334234721, + "grad_norm": 0.8417417407035828, + "learning_rate": 2.4704447428585843e-06, + "loss": 1.8353, + "mean_token_accuracy": 0.5775033235549927, + "num_tokens": 8504920583.0, + "step": 16637 + }, + { + "epoch": 4.4991887506760415, + "grad_norm": 0.8831584453582764, + "learning_rate": 2.4699418502451845e-06, + "loss": 1.8045, + "mean_token_accuracy": 0.5746723413467407, + "num_tokens": 8505444830.0, + "step": 16638 + }, + { + "epoch": 4.499459167117361, + "grad_norm": 0.8830652832984924, + "learning_rate": 2.4694392193590544e-06, + "loss": 1.8439, + "mean_token_accuracy": 0.5722835063934326, + "num_tokens": 8505968951.0, + "step": 16639 + }, + { + "epoch": 4.499729583558681, + "grad_norm": 0.8950808644294739, + "learning_rate": 2.4689368502156167e-06, + "loss": 1.8528, + "mean_token_accuracy": 0.575886607170105, + "num_tokens": 8506493145.0, + "step": 16640 + }, + { + "epoch": 4.5, + "grad_norm": 0.3807728886604309, + "learning_rate": 2.468434742830283e-06, + "loss": 1.1312, + "mean_token_accuracy": 0.6865019798278809, + "num_tokens": 8507017407.0, + "step": 16641 + }, + { + "epoch": 4.500270416441319, + "grad_norm": 0.8542137145996094, + "learning_rate": 2.4679328972184624e-06, + "loss": 1.7961, + "mean_token_accuracy": 0.5747371912002563, + "num_tokens": 8507541499.0, + "step": 16642 + }, + { + "epoch": 4.500540832882639, + "grad_norm": 0.8700713515281677, + "learning_rate": 2.4674313133955524e-06, + "loss": 1.7604, + "mean_token_accuracy": 0.5815765261650085, + "num_tokens": 8508023453.0, + "step": 16643 + }, + { + "epoch": 4.500811249323959, + "grad_norm": 0.8754160404205322, + "learning_rate": 2.4669299913769404e-06, + "loss": 1.9191, + "mean_token_accuracy": 0.5691168308258057, + "num_tokens": 8508547691.0, + "step": 16644 + }, + { + "epoch": 4.501081665765279, + "grad_norm": 0.8040366768836975, + "learning_rate": 2.4664289311780114e-06, + "loss": 1.762, + "mean_token_accuracy": 0.600823163986206, + "num_tokens": 8509048913.0, + "step": 16645 + }, + { + "epoch": 4.501352082206598, + "grad_norm": 0.9024525284767151, + "learning_rate": 2.4659281328141392e-06, + "loss": 1.8781, + "mean_token_accuracy": 0.5599225759506226, + "num_tokens": 8509572954.0, + "step": 16646 + }, + { + "epoch": 4.501622498647918, + "grad_norm": 0.8725950717926025, + "learning_rate": 2.4654275963006873e-06, + "loss": 1.9705, + "mean_token_accuracy": 0.5522525310516357, + "num_tokens": 8510097130.0, + "step": 16647 + }, + { + "epoch": 4.501892915089237, + "grad_norm": 0.8796356320381165, + "learning_rate": 2.4649273216530165e-06, + "loss": 1.8647, + "mean_token_accuracy": 0.5588353872299194, + "num_tokens": 8510621187.0, + "step": 16648 + }, + { + "epoch": 4.502163331530557, + "grad_norm": 0.7139866352081299, + "learning_rate": 2.4644273088864763e-06, + "loss": 1.5029, + "mean_token_accuracy": 0.6239200830459595, + "num_tokens": 8511145280.0, + "step": 16649 + }, + { + "epoch": 4.502433747971876, + "grad_norm": 0.8921833038330078, + "learning_rate": 2.463927558016406e-06, + "loss": 1.7767, + "mean_token_accuracy": 0.5878461599349976, + "num_tokens": 8511669293.0, + "step": 16650 + }, + { + "epoch": 4.502704164413196, + "grad_norm": 0.7214999198913574, + "learning_rate": 2.4634280690581413e-06, + "loss": 1.8392, + "mean_token_accuracy": 0.5661306977272034, + "num_tokens": 8512193492.0, + "step": 16651 + }, + { + "epoch": 4.502974580854516, + "grad_norm": 0.8349948525428772, + "learning_rate": 2.462928842027008e-06, + "loss": 1.8898, + "mean_token_accuracy": 0.5664967894554138, + "num_tokens": 8512680520.0, + "step": 16652 + }, + { + "epoch": 4.503244997295836, + "grad_norm": 0.8341516256332397, + "learning_rate": 2.4624298769383235e-06, + "loss": 1.741, + "mean_token_accuracy": 0.6002669334411621, + "num_tokens": 8513204701.0, + "step": 16653 + }, + { + "epoch": 4.503515413737155, + "grad_norm": 0.8024539947509766, + "learning_rate": 2.461931173807398e-06, + "loss": 1.7245, + "mean_token_accuracy": 0.6106629967689514, + "num_tokens": 8513728928.0, + "step": 16654 + }, + { + "epoch": 4.503785830178475, + "grad_norm": 0.8403781056404114, + "learning_rate": 2.4614327326495332e-06, + "loss": 1.973, + "mean_token_accuracy": 0.5562975406646729, + "num_tokens": 8514253054.0, + "step": 16655 + }, + { + "epoch": 4.504056246619794, + "grad_norm": 0.8506031632423401, + "learning_rate": 2.4609345534800224e-06, + "loss": 1.7836, + "mean_token_accuracy": 0.591099202632904, + "num_tokens": 8514777195.0, + "step": 16656 + }, + { + "epoch": 4.504326663061114, + "grad_norm": 0.904333233833313, + "learning_rate": 2.4604366363141523e-06, + "loss": 1.9794, + "mean_token_accuracy": 0.5715571641921997, + "num_tokens": 8515255876.0, + "step": 16657 + }, + { + "epoch": 4.5045970795024335, + "grad_norm": 0.8173636794090271, + "learning_rate": 2.4599389811671977e-06, + "loss": 1.8203, + "mean_token_accuracy": 0.5821566581726074, + "num_tokens": 8515736399.0, + "step": 16658 + }, + { + "epoch": 4.504867495943754, + "grad_norm": 0.7848243117332458, + "learning_rate": 2.4594415880544305e-06, + "loss": 1.793, + "mean_token_accuracy": 0.5730446577072144, + "num_tokens": 8516260647.0, + "step": 16659 + }, + { + "epoch": 4.505137912385073, + "grad_norm": 0.8064472675323486, + "learning_rate": 2.4589444569911126e-06, + "loss": 1.8963, + "mean_token_accuracy": 0.569610059261322, + "num_tokens": 8516784850.0, + "step": 16660 + }, + { + "epoch": 4.505408328826393, + "grad_norm": 0.3673621416091919, + "learning_rate": 2.4584475879924943e-06, + "loss": 0.9717, + "mean_token_accuracy": 0.7350223064422607, + "num_tokens": 8517309091.0, + "step": 16661 + }, + { + "epoch": 4.505678745267712, + "grad_norm": 1.0152807235717773, + "learning_rate": 2.457950981073824e-06, + "loss": 1.7394, + "mean_token_accuracy": 0.5733111500740051, + "num_tokens": 8517833238.0, + "step": 16662 + }, + { + "epoch": 4.505949161709032, + "grad_norm": 0.8025550842285156, + "learning_rate": 2.4574546362503393e-06, + "loss": 1.7904, + "mean_token_accuracy": 0.5822767019271851, + "num_tokens": 8518357486.0, + "step": 16663 + }, + { + "epoch": 4.506219578150351, + "grad_norm": 0.9131855964660645, + "learning_rate": 2.456958553537267e-06, + "loss": 1.7524, + "mean_token_accuracy": 0.596480131149292, + "num_tokens": 8518881611.0, + "step": 16664 + }, + { + "epoch": 4.506489994591671, + "grad_norm": 0.9201784729957581, + "learning_rate": 2.4564627329498293e-06, + "loss": 1.8805, + "mean_token_accuracy": 0.5543603897094727, + "num_tokens": 8519405872.0, + "step": 16665 + }, + { + "epoch": 4.506760411032991, + "grad_norm": 1.0177425146102905, + "learning_rate": 2.4559671745032405e-06, + "loss": 1.6941, + "mean_token_accuracy": 0.6040338277816772, + "num_tokens": 8519917310.0, + "step": 16666 + }, + { + "epoch": 4.507030827474311, + "grad_norm": 0.8357783555984497, + "learning_rate": 2.455471878212706e-06, + "loss": 1.8317, + "mean_token_accuracy": 0.5757554769515991, + "num_tokens": 8520441556.0, + "step": 16667 + }, + { + "epoch": 4.50730124391563, + "grad_norm": 0.9503999352455139, + "learning_rate": 2.45497684409342e-06, + "loss": 1.8127, + "mean_token_accuracy": 0.5789207220077515, + "num_tokens": 8520965671.0, + "step": 16668 + }, + { + "epoch": 4.50757166035695, + "grad_norm": 1.0021353960037231, + "learning_rate": 2.454482072160575e-06, + "loss": 1.723, + "mean_token_accuracy": 0.6028314828872681, + "num_tokens": 8521441204.0, + "step": 16669 + }, + { + "epoch": 4.507842076798269, + "grad_norm": 1.0359716415405273, + "learning_rate": 2.4539875624293505e-06, + "loss": 1.685, + "mean_token_accuracy": 0.5961734056472778, + "num_tokens": 8521965387.0, + "step": 16670 + }, + { + "epoch": 4.508112493239589, + "grad_norm": 1.0442603826522827, + "learning_rate": 2.4534933149149214e-06, + "loss": 1.77, + "mean_token_accuracy": 0.5833204984664917, + "num_tokens": 8522489626.0, + "step": 16671 + }, + { + "epoch": 4.5083829096809085, + "grad_norm": 1.049467921257019, + "learning_rate": 2.45299932963245e-06, + "loss": 1.9073, + "mean_token_accuracy": 0.5590853095054626, + "num_tokens": 8523013902.0, + "step": 16672 + }, + { + "epoch": 4.508653326122229, + "grad_norm": 0.9621168375015259, + "learning_rate": 2.4525056065970926e-06, + "loss": 1.9116, + "mean_token_accuracy": 0.5566445589065552, + "num_tokens": 8523538145.0, + "step": 16673 + }, + { + "epoch": 4.508923742563548, + "grad_norm": 0.7773208022117615, + "learning_rate": 2.4520121458240016e-06, + "loss": 1.832, + "mean_token_accuracy": 0.5632977485656738, + "num_tokens": 8524062407.0, + "step": 16674 + }, + { + "epoch": 4.509194159004868, + "grad_norm": 0.8535948395729065, + "learning_rate": 2.451518947328316e-06, + "loss": 1.9329, + "mean_token_accuracy": 0.5577386617660522, + "num_tokens": 8524586650.0, + "step": 16675 + }, + { + "epoch": 4.509464575446187, + "grad_norm": 0.9707388281822205, + "learning_rate": 2.4510260111251673e-06, + "loss": 1.7939, + "mean_token_accuracy": 0.5835743546485901, + "num_tokens": 8525110929.0, + "step": 16676 + }, + { + "epoch": 4.509734991887507, + "grad_norm": 0.897756814956665, + "learning_rate": 2.450533337229683e-06, + "loss": 1.7902, + "mean_token_accuracy": 0.5798582434654236, + "num_tokens": 8525635200.0, + "step": 16677 + }, + { + "epoch": 4.510005408328826, + "grad_norm": 1.1121280193328857, + "learning_rate": 2.450040925656976e-06, + "loss": 1.8714, + "mean_token_accuracy": 0.6070869565010071, + "num_tokens": 8525991944.0, + "step": 16678 + }, + { + "epoch": 4.5102758247701455, + "grad_norm": 0.9086874723434448, + "learning_rate": 2.4495487764221586e-06, + "loss": 1.6791, + "mean_token_accuracy": 0.6143139600753784, + "num_tokens": 8526453277.0, + "step": 16679 + }, + { + "epoch": 4.510546241211466, + "grad_norm": 0.7846758961677551, + "learning_rate": 2.4490568895403297e-06, + "loss": 1.8103, + "mean_token_accuracy": 0.5777378082275391, + "num_tokens": 8526977448.0, + "step": 16680 + }, + { + "epoch": 4.510816657652786, + "grad_norm": 0.3384982943534851, + "learning_rate": 2.4485652650265827e-06, + "loss": 1.1235, + "mean_token_accuracy": 0.6951918601989746, + "num_tokens": 8527501654.0, + "step": 16681 + }, + { + "epoch": 4.511087074094105, + "grad_norm": 0.9953994750976562, + "learning_rate": 2.4480739028960002e-06, + "loss": 1.8747, + "mean_token_accuracy": 0.5768742561340332, + "num_tokens": 8527982595.0, + "step": 16682 + }, + { + "epoch": 4.511357490535424, + "grad_norm": 0.9288262724876404, + "learning_rate": 2.447582803163662e-06, + "loss": 1.8378, + "mean_token_accuracy": 0.5716601610183716, + "num_tokens": 8528494091.0, + "step": 16683 + }, + { + "epoch": 4.511627906976744, + "grad_norm": 0.8911736607551575, + "learning_rate": 2.4470919658446327e-06, + "loss": 1.7876, + "mean_token_accuracy": 0.5670677423477173, + "num_tokens": 8529018261.0, + "step": 16684 + }, + { + "epoch": 4.511898323418064, + "grad_norm": 0.8417951464653015, + "learning_rate": 2.4466013909539753e-06, + "loss": 1.853, + "mean_token_accuracy": 0.5826047658920288, + "num_tokens": 8529542431.0, + "step": 16685 + }, + { + "epoch": 4.5121687398593835, + "grad_norm": 0.7700568437576294, + "learning_rate": 2.446111078506741e-06, + "loss": 1.7939, + "mean_token_accuracy": 0.5743877291679382, + "num_tokens": 8530066713.0, + "step": 16686 + }, + { + "epoch": 4.512439156300703, + "grad_norm": 0.8400915861129761, + "learning_rate": 2.4456210285179738e-06, + "loss": 1.7879, + "mean_token_accuracy": 0.5891051888465881, + "num_tokens": 8530527339.0, + "step": 16687 + }, + { + "epoch": 4.512709572742023, + "grad_norm": 1.0121853351593018, + "learning_rate": 2.4451312410027103e-06, + "loss": 1.7839, + "mean_token_accuracy": 0.5860042572021484, + "num_tokens": 8531051619.0, + "step": 16688 + }, + { + "epoch": 4.512979989183342, + "grad_norm": 1.0945907831192017, + "learning_rate": 2.44464171597598e-06, + "loss": 1.7374, + "mean_token_accuracy": 0.5953247547149658, + "num_tokens": 8531537231.0, + "step": 16689 + }, + { + "epoch": 4.513250405624662, + "grad_norm": 1.112535834312439, + "learning_rate": 2.4441524534527993e-06, + "loss": 1.8101, + "mean_token_accuracy": 0.5811479687690735, + "num_tokens": 8532061354.0, + "step": 16690 + }, + { + "epoch": 4.513520822065981, + "grad_norm": 0.839073896408081, + "learning_rate": 2.4436634534481836e-06, + "loss": 1.8427, + "mean_token_accuracy": 0.5817519426345825, + "num_tokens": 8532528277.0, + "step": 16691 + }, + { + "epoch": 4.513791238507301, + "grad_norm": 0.864391028881073, + "learning_rate": 2.4431747159771356e-06, + "loss": 1.8393, + "mean_token_accuracy": 0.5866594314575195, + "num_tokens": 8533011157.0, + "step": 16692 + }, + { + "epoch": 4.5140616549486205, + "grad_norm": 1.0399987697601318, + "learning_rate": 2.442686241054651e-06, + "loss": 1.8138, + "mean_token_accuracy": 0.5856378078460693, + "num_tokens": 8533535414.0, + "step": 16693 + }, + { + "epoch": 4.514332071389941, + "grad_norm": 1.0628621578216553, + "learning_rate": 2.442198028695718e-06, + "loss": 1.8389, + "mean_token_accuracy": 0.5885967016220093, + "num_tokens": 8534019680.0, + "step": 16694 + }, + { + "epoch": 4.51460248783126, + "grad_norm": 1.0443577766418457, + "learning_rate": 2.4417100789153154e-06, + "loss": 1.8099, + "mean_token_accuracy": 0.5759134292602539, + "num_tokens": 8534540376.0, + "step": 16695 + }, + { + "epoch": 4.51487290427258, + "grad_norm": 0.9753978848457336, + "learning_rate": 2.4412223917284166e-06, + "loss": 1.8092, + "mean_token_accuracy": 0.5829908847808838, + "num_tokens": 8535064655.0, + "step": 16696 + }, + { + "epoch": 4.515143320713899, + "grad_norm": 0.9601731300354004, + "learning_rate": 2.440734967149985e-06, + "loss": 1.7112, + "mean_token_accuracy": 0.5917202234268188, + "num_tokens": 8535588760.0, + "step": 16697 + }, + { + "epoch": 4.515413737155219, + "grad_norm": 0.8140610456466675, + "learning_rate": 2.440247805194976e-06, + "loss": 1.7742, + "mean_token_accuracy": 0.575498104095459, + "num_tokens": 8536112956.0, + "step": 16698 + }, + { + "epoch": 4.515684153596538, + "grad_norm": 0.9991709589958191, + "learning_rate": 2.4397609058783358e-06, + "loss": 1.804, + "mean_token_accuracy": 0.5984127521514893, + "num_tokens": 8536588056.0, + "step": 16699 + }, + { + "epoch": 4.5159545700378585, + "grad_norm": 0.9479324221611023, + "learning_rate": 2.439274269215007e-06, + "loss": 1.8003, + "mean_token_accuracy": 0.5812535285949707, + "num_tokens": 8537112293.0, + "step": 16700 + }, + { + "epoch": 4.516224986479178, + "grad_norm": 0.32784515619277954, + "learning_rate": 2.438787895219916e-06, + "loss": 1.0985, + "mean_token_accuracy": 0.7087775468826294, + "num_tokens": 8537636505.0, + "step": 16701 + }, + { + "epoch": 4.516495402920498, + "grad_norm": 0.929516077041626, + "learning_rate": 2.4383017839079925e-06, + "loss": 1.845, + "mean_token_accuracy": 0.5749744772911072, + "num_tokens": 8538123165.0, + "step": 16702 + }, + { + "epoch": 4.516765819361817, + "grad_norm": 0.8854724764823914, + "learning_rate": 2.437815935294147e-06, + "loss": 1.8235, + "mean_token_accuracy": 0.5709325671195984, + "num_tokens": 8538647301.0, + "step": 16703 + }, + { + "epoch": 4.517036235803137, + "grad_norm": 0.8590342998504639, + "learning_rate": 2.437330349393288e-06, + "loss": 1.6184, + "mean_token_accuracy": 0.6238713264465332, + "num_tokens": 8539171508.0, + "step": 16704 + }, + { + "epoch": 4.517306652244456, + "grad_norm": 0.8639314770698547, + "learning_rate": 2.4368450262203164e-06, + "loss": 1.7488, + "mean_token_accuracy": 0.5791290998458862, + "num_tokens": 8539695761.0, + "step": 16705 + }, + { + "epoch": 4.517577068685776, + "grad_norm": 0.9174240231513977, + "learning_rate": 2.4363599657901212e-06, + "loss": 1.8278, + "mean_token_accuracy": 0.6128062009811401, + "num_tokens": 8540155525.0, + "step": 16706 + }, + { + "epoch": 4.5178474851270956, + "grad_norm": 1.0924081802368164, + "learning_rate": 2.435875168117586e-06, + "loss": 1.8043, + "mean_token_accuracy": 0.5608329772949219, + "num_tokens": 8540679649.0, + "step": 16707 + }, + { + "epoch": 4.518117901568416, + "grad_norm": 0.8856711983680725, + "learning_rate": 2.4353906332175865e-06, + "loss": 1.8777, + "mean_token_accuracy": 0.5660480856895447, + "num_tokens": 8541203923.0, + "step": 16708 + }, + { + "epoch": 4.518388318009735, + "grad_norm": 0.8771141171455383, + "learning_rate": 2.4349063611049883e-06, + "loss": 1.7459, + "mean_token_accuracy": 0.5759531855583191, + "num_tokens": 8541722728.0, + "step": 16709 + }, + { + "epoch": 4.518658734451055, + "grad_norm": 0.9607645273208618, + "learning_rate": 2.434422351794652e-06, + "loss": 1.8222, + "mean_token_accuracy": 0.5873259902000427, + "num_tokens": 8542219851.0, + "step": 16710 + }, + { + "epoch": 4.518929150892374, + "grad_norm": 0.9333683252334595, + "learning_rate": 2.4339386053014267e-06, + "loss": 1.9346, + "mean_token_accuracy": 0.5748926401138306, + "num_tokens": 8542717955.0, + "step": 16711 + }, + { + "epoch": 4.519199567333694, + "grad_norm": 0.8951353430747986, + "learning_rate": 2.433455121640158e-06, + "loss": 1.7931, + "mean_token_accuracy": 0.5873794555664062, + "num_tokens": 8543242154.0, + "step": 16712 + }, + { + "epoch": 4.519469983775013, + "grad_norm": 0.8889811038970947, + "learning_rate": 2.432971900825677e-06, + "loss": 1.7373, + "mean_token_accuracy": 0.593246579170227, + "num_tokens": 8543766257.0, + "step": 16713 + }, + { + "epoch": 4.5197404002163335, + "grad_norm": 0.8254697322845459, + "learning_rate": 2.432488942872814e-06, + "loss": 1.8652, + "mean_token_accuracy": 0.5576695203781128, + "num_tokens": 8544290501.0, + "step": 16714 + }, + { + "epoch": 4.520010816657653, + "grad_norm": 1.0005327463150024, + "learning_rate": 2.4320062477963847e-06, + "loss": 1.7003, + "mean_token_accuracy": 0.6136133670806885, + "num_tokens": 8544814586.0, + "step": 16715 + }, + { + "epoch": 4.520281233098973, + "grad_norm": 0.9405715465545654, + "learning_rate": 2.4315238156111995e-06, + "loss": 1.8251, + "mean_token_accuracy": 0.5645248293876648, + "num_tokens": 8545338806.0, + "step": 16716 + }, + { + "epoch": 4.520551649540292, + "grad_norm": 1.0086917877197266, + "learning_rate": 2.431041646332063e-06, + "loss": 1.8317, + "mean_token_accuracy": 0.5699583888053894, + "num_tokens": 8545863079.0, + "step": 16717 + }, + { + "epoch": 4.520822065981612, + "grad_norm": 0.836311936378479, + "learning_rate": 2.430559739973769e-06, + "loss": 1.7348, + "mean_token_accuracy": 0.5960805416107178, + "num_tokens": 8546387340.0, + "step": 16718 + }, + { + "epoch": 4.521092482422931, + "grad_norm": 1.034814715385437, + "learning_rate": 2.4300780965511016e-06, + "loss": 1.9332, + "mean_token_accuracy": 0.5508958101272583, + "num_tokens": 8546911545.0, + "step": 16719 + }, + { + "epoch": 4.5213628988642505, + "grad_norm": 0.95281583070755, + "learning_rate": 2.4295967160788418e-06, + "loss": 1.7196, + "mean_token_accuracy": 0.5967532396316528, + "num_tokens": 8547435720.0, + "step": 16720 + }, + { + "epoch": 4.5216333153055706, + "grad_norm": 0.33273008465766907, + "learning_rate": 2.4291155985717597e-06, + "loss": 1.1273, + "mean_token_accuracy": 0.6995182037353516, + "num_tokens": 8547959953.0, + "step": 16721 + }, + { + "epoch": 4.521903731746891, + "grad_norm": 1.0268932580947876, + "learning_rate": 2.428634744044614e-06, + "loss": 1.8398, + "mean_token_accuracy": 0.5837730169296265, + "num_tokens": 8548484102.0, + "step": 16722 + }, + { + "epoch": 4.52217414818821, + "grad_norm": 0.9507642388343811, + "learning_rate": 2.4281541525121615e-06, + "loss": 1.9021, + "mean_token_accuracy": 0.5579521656036377, + "num_tokens": 8549008244.0, + "step": 16723 + }, + { + "epoch": 4.522444564629529, + "grad_norm": 0.8041397929191589, + "learning_rate": 2.4276738239891495e-06, + "loss": 1.7845, + "mean_token_accuracy": 0.5963289737701416, + "num_tokens": 8549520356.0, + "step": 16724 + }, + { + "epoch": 4.522714981070849, + "grad_norm": 0.7316538691520691, + "learning_rate": 2.427193758490312e-06, + "loss": 1.7076, + "mean_token_accuracy": 0.5872448086738586, + "num_tokens": 8550044503.0, + "step": 16725 + }, + { + "epoch": 4.522985397512169, + "grad_norm": 0.7785599827766418, + "learning_rate": 2.4267139560303824e-06, + "loss": 1.8284, + "mean_token_accuracy": 0.5785486698150635, + "num_tokens": 8550568739.0, + "step": 16726 + }, + { + "epoch": 4.523255813953488, + "grad_norm": 0.8661019206047058, + "learning_rate": 2.4262344166240804e-06, + "loss": 1.8434, + "mean_token_accuracy": 0.5772829055786133, + "num_tokens": 8551092911.0, + "step": 16727 + }, + { + "epoch": 4.523526230394808, + "grad_norm": 0.8149844408035278, + "learning_rate": 2.4257551402861206e-06, + "loss": 1.8692, + "mean_token_accuracy": 0.5718887448310852, + "num_tokens": 8551617038.0, + "step": 16728 + }, + { + "epoch": 4.523796646836128, + "grad_norm": 0.8154280185699463, + "learning_rate": 2.4252761270312087e-06, + "loss": 1.7763, + "mean_token_accuracy": 0.6143078207969666, + "num_tokens": 8552079604.0, + "step": 16729 + }, + { + "epoch": 4.524067063277447, + "grad_norm": 0.8185290694236755, + "learning_rate": 2.424797376874041e-06, + "loss": 1.7962, + "mean_token_accuracy": 0.5918423533439636, + "num_tokens": 8552594997.0, + "step": 16730 + }, + { + "epoch": 4.524337479718767, + "grad_norm": 0.8444012403488159, + "learning_rate": 2.4243188898293092e-06, + "loss": 1.8487, + "mean_token_accuracy": 0.5770257711410522, + "num_tokens": 8553119237.0, + "step": 16731 + }, + { + "epoch": 4.524607896160086, + "grad_norm": 0.894556999206543, + "learning_rate": 2.423840665911693e-06, + "loss": 1.7428, + "mean_token_accuracy": 0.5998680591583252, + "num_tokens": 8553626104.0, + "step": 16732 + }, + { + "epoch": 4.524878312601406, + "grad_norm": 0.859830379486084, + "learning_rate": 2.423362705135864e-06, + "loss": 1.6603, + "mean_token_accuracy": 0.6077401041984558, + "num_tokens": 8554150242.0, + "step": 16733 + }, + { + "epoch": 4.5251487290427255, + "grad_norm": 0.814749002456665, + "learning_rate": 2.4228850075164915e-06, + "loss": 1.8819, + "mean_token_accuracy": 0.5760139226913452, + "num_tokens": 8554662356.0, + "step": 16734 + }, + { + "epoch": 4.525419145484046, + "grad_norm": 0.8361923098564148, + "learning_rate": 2.4224075730682303e-06, + "loss": 1.7736, + "mean_token_accuracy": 0.5938856601715088, + "num_tokens": 8555186624.0, + "step": 16735 + }, + { + "epoch": 4.525689561925365, + "grad_norm": 0.7724778652191162, + "learning_rate": 2.421930401805729e-06, + "loss": 1.7717, + "mean_token_accuracy": 0.5637712478637695, + "num_tokens": 8555710891.0, + "step": 16736 + }, + { + "epoch": 4.525959978366685, + "grad_norm": 0.9769037365913391, + "learning_rate": 2.42145349374363e-06, + "loss": 1.7286, + "mean_token_accuracy": 0.5685552954673767, + "num_tokens": 8556235013.0, + "step": 16737 + }, + { + "epoch": 4.526230394808004, + "grad_norm": 0.9780118465423584, + "learning_rate": 2.420976848896565e-06, + "loss": 1.7773, + "mean_token_accuracy": 0.5736578702926636, + "num_tokens": 8556759085.0, + "step": 16738 + }, + { + "epoch": 4.526500811249324, + "grad_norm": 0.9643204212188721, + "learning_rate": 2.4205004672791597e-06, + "loss": 1.5655, + "mean_token_accuracy": 0.6379523277282715, + "num_tokens": 8557211583.0, + "step": 16739 + }, + { + "epoch": 4.526771227690643, + "grad_norm": 0.8811064958572388, + "learning_rate": 2.4200243489060317e-06, + "loss": 1.9308, + "mean_token_accuracy": 0.5708558559417725, + "num_tokens": 8557735800.0, + "step": 16740 + }, + { + "epoch": 4.527041644131963, + "grad_norm": 0.35355785489082336, + "learning_rate": 2.419548493791788e-06, + "loss": 1.0166, + "mean_token_accuracy": 0.7240835428237915, + "num_tokens": 8558216795.0, + "step": 16741 + }, + { + "epoch": 4.527312060573283, + "grad_norm": 1.025946021080017, + "learning_rate": 2.4190729019510297e-06, + "loss": 1.9021, + "mean_token_accuracy": 0.5497995018959045, + "num_tokens": 8558725935.0, + "step": 16742 + }, + { + "epoch": 4.527582477014603, + "grad_norm": 0.9001969695091248, + "learning_rate": 2.4185975733983503e-06, + "loss": 1.7808, + "mean_token_accuracy": 0.5965030789375305, + "num_tokens": 8559250121.0, + "step": 16743 + }, + { + "epoch": 4.527852893455922, + "grad_norm": 0.9445177912712097, + "learning_rate": 2.4181225081483346e-06, + "loss": 1.863, + "mean_token_accuracy": 0.5690222978591919, + "num_tokens": 8559728274.0, + "step": 16744 + }, + { + "epoch": 4.528123309897242, + "grad_norm": 0.8204106688499451, + "learning_rate": 2.417647706215555e-06, + "loss": 1.7139, + "mean_token_accuracy": 0.5977733731269836, + "num_tokens": 8560252544.0, + "step": 16745 + }, + { + "epoch": 4.528393726338561, + "grad_norm": 0.8907406330108643, + "learning_rate": 2.417173167614586e-06, + "loss": 1.8326, + "mean_token_accuracy": 0.5863857269287109, + "num_tokens": 8560776820.0, + "step": 16746 + }, + { + "epoch": 4.528664142779881, + "grad_norm": 0.8634737730026245, + "learning_rate": 2.4166988923599825e-06, + "loss": 1.8603, + "mean_token_accuracy": 0.5718089938163757, + "num_tokens": 8561263423.0, + "step": 16747 + }, + { + "epoch": 4.5289345592212005, + "grad_norm": 0.9295963048934937, + "learning_rate": 2.4162248804663004e-06, + "loss": 1.9218, + "mean_token_accuracy": 0.5614640712738037, + "num_tokens": 8561787610.0, + "step": 16748 + }, + { + "epoch": 4.529204975662521, + "grad_norm": 0.8077812194824219, + "learning_rate": 2.4157511319480815e-06, + "loss": 1.9635, + "mean_token_accuracy": 0.5375446677207947, + "num_tokens": 8562311718.0, + "step": 16749 + }, + { + "epoch": 4.52947539210384, + "grad_norm": 0.8095196485519409, + "learning_rate": 2.4152776468198614e-06, + "loss": 1.856, + "mean_token_accuracy": 0.579155445098877, + "num_tokens": 8562835932.0, + "step": 16750 + }, + { + "epoch": 4.52974580854516, + "grad_norm": 0.8212645053863525, + "learning_rate": 2.4148044250961704e-06, + "loss": 1.6699, + "mean_token_accuracy": 0.6034200191497803, + "num_tokens": 8563297156.0, + "step": 16751 + }, + { + "epoch": 4.530016224986479, + "grad_norm": 0.9986596703529358, + "learning_rate": 2.4143314667915272e-06, + "loss": 1.8976, + "mean_token_accuracy": 0.5452648997306824, + "num_tokens": 8563816091.0, + "step": 16752 + }, + { + "epoch": 4.530286641427799, + "grad_norm": 0.8774611949920654, + "learning_rate": 2.413858771920442e-06, + "loss": 1.703, + "mean_token_accuracy": 0.5869935750961304, + "num_tokens": 8564340351.0, + "step": 16753 + }, + { + "epoch": 4.530557057869118, + "grad_norm": 0.9325518012046814, + "learning_rate": 2.4133863404974206e-06, + "loss": 1.8645, + "mean_token_accuracy": 0.5707416534423828, + "num_tokens": 8564864542.0, + "step": 16754 + }, + { + "epoch": 4.530827474310438, + "grad_norm": 0.8063117265701294, + "learning_rate": 2.4129141725369596e-06, + "loss": 1.7274, + "mean_token_accuracy": 0.5952452421188354, + "num_tokens": 8565388633.0, + "step": 16755 + }, + { + "epoch": 4.531097890751758, + "grad_norm": 0.8717279434204102, + "learning_rate": 2.4124422680535426e-06, + "loss": 1.7703, + "mean_token_accuracy": 0.5980606079101562, + "num_tokens": 8565912806.0, + "step": 16756 + }, + { + "epoch": 4.531368307193078, + "grad_norm": 0.973611056804657, + "learning_rate": 2.411970627061652e-06, + "loss": 1.8181, + "mean_token_accuracy": 0.5792424082756042, + "num_tokens": 8566436851.0, + "step": 16757 + }, + { + "epoch": 4.531638723634397, + "grad_norm": 0.8079237937927246, + "learning_rate": 2.4114992495757586e-06, + "loss": 1.7609, + "mean_token_accuracy": 0.5966547727584839, + "num_tokens": 8566945078.0, + "step": 16758 + }, + { + "epoch": 4.531909140075717, + "grad_norm": 0.9178565144538879, + "learning_rate": 2.4110281356103236e-06, + "loss": 1.7489, + "mean_token_accuracy": 0.578770637512207, + "num_tokens": 8567469359.0, + "step": 16759 + }, + { + "epoch": 4.532179556517036, + "grad_norm": 0.8067790865898132, + "learning_rate": 2.4105572851798067e-06, + "loss": 1.7987, + "mean_token_accuracy": 0.5924404859542847, + "num_tokens": 8567993539.0, + "step": 16760 + }, + { + "epoch": 4.532449972958355, + "grad_norm": 0.3585556745529175, + "learning_rate": 2.4100866982986505e-06, + "loss": 1.0135, + "mean_token_accuracy": 0.7177995443344116, + "num_tokens": 8568517599.0, + "step": 16761 + }, + { + "epoch": 4.5327203893996755, + "grad_norm": 0.9296022057533264, + "learning_rate": 2.409616374981296e-06, + "loss": 1.5785, + "mean_token_accuracy": 0.6301122903823853, + "num_tokens": 8569041774.0, + "step": 16762 + }, + { + "epoch": 4.532990805840996, + "grad_norm": 1.1403162479400635, + "learning_rate": 2.409146315242174e-06, + "loss": 1.8428, + "mean_token_accuracy": 0.5796453356742859, + "num_tokens": 8569548176.0, + "step": 16763 + }, + { + "epoch": 4.533261222282315, + "grad_norm": 1.1580008268356323, + "learning_rate": 2.408676519095707e-06, + "loss": 1.9541, + "mean_token_accuracy": 0.5631805658340454, + "num_tokens": 8570072439.0, + "step": 16764 + }, + { + "epoch": 4.533531638723634, + "grad_norm": 0.8959111571311951, + "learning_rate": 2.408206986556309e-06, + "loss": 1.848, + "mean_token_accuracy": 0.5741012096405029, + "num_tokens": 8570575761.0, + "step": 16765 + }, + { + "epoch": 4.533802055164954, + "grad_norm": 0.7838420867919922, + "learning_rate": 2.407737717638387e-06, + "loss": 1.6327, + "mean_token_accuracy": 0.62890625, + "num_tokens": 8571080282.0, + "step": 16766 + }, + { + "epoch": 4.534072471606274, + "grad_norm": 1.0124191045761108, + "learning_rate": 2.407268712356342e-06, + "loss": 1.7704, + "mean_token_accuracy": 0.6043709516525269, + "num_tokens": 8571557797.0, + "step": 16767 + }, + { + "epoch": 4.534342888047593, + "grad_norm": 0.9053703546524048, + "learning_rate": 2.4067999707245605e-06, + "loss": 1.8554, + "mean_token_accuracy": 0.5610170960426331, + "num_tokens": 8572082070.0, + "step": 16768 + }, + { + "epoch": 4.5346133044889125, + "grad_norm": 0.9711825251579285, + "learning_rate": 2.406331492757429e-06, + "loss": 1.8507, + "mean_token_accuracy": 0.5785017013549805, + "num_tokens": 8572606265.0, + "step": 16769 + }, + { + "epoch": 4.534883720930233, + "grad_norm": 0.8623877167701721, + "learning_rate": 2.4058632784693173e-06, + "loss": 1.8298, + "mean_token_accuracy": 0.5706219673156738, + "num_tokens": 8573130422.0, + "step": 16770 + }, + { + "epoch": 4.535154137371552, + "grad_norm": 0.9391494989395142, + "learning_rate": 2.405395327874596e-06, + "loss": 1.8099, + "mean_token_accuracy": 0.5847070217132568, + "num_tokens": 8573632245.0, + "step": 16771 + }, + { + "epoch": 4.535424553812872, + "grad_norm": 1.00554621219635, + "learning_rate": 2.404927640987621e-06, + "loss": 1.8923, + "mean_token_accuracy": 0.55824875831604, + "num_tokens": 8574156398.0, + "step": 16772 + }, + { + "epoch": 4.535694970254191, + "grad_norm": 0.9911496639251709, + "learning_rate": 2.404460217822741e-06, + "loss": 1.8294, + "mean_token_accuracy": 0.5756321549415588, + "num_tokens": 8574663908.0, + "step": 16773 + }, + { + "epoch": 4.535965386695511, + "grad_norm": 0.8001202344894409, + "learning_rate": 2.403993058394301e-06, + "loss": 1.7893, + "mean_token_accuracy": 0.5851892828941345, + "num_tokens": 8575188191.0, + "step": 16774 + }, + { + "epoch": 4.53623580313683, + "grad_norm": 0.8445307612419128, + "learning_rate": 2.403526162716633e-06, + "loss": 1.7947, + "mean_token_accuracy": 0.5833855867385864, + "num_tokens": 8575690768.0, + "step": 16775 + }, + { + "epoch": 4.5365062195781505, + "grad_norm": 0.885920524597168, + "learning_rate": 2.4030595308040607e-06, + "loss": 1.8339, + "mean_token_accuracy": 0.5593312382698059, + "num_tokens": 8576215054.0, + "step": 16776 + }, + { + "epoch": 4.53677663601947, + "grad_norm": 0.8225685954093933, + "learning_rate": 2.402593162670906e-06, + "loss": 1.7293, + "mean_token_accuracy": 0.5729436874389648, + "num_tokens": 8576739172.0, + "step": 16777 + }, + { + "epoch": 4.53704705246079, + "grad_norm": 0.9175531268119812, + "learning_rate": 2.402127058331477e-06, + "loss": 1.774, + "mean_token_accuracy": 0.5743162631988525, + "num_tokens": 8577263322.0, + "step": 16778 + }, + { + "epoch": 4.537317468902109, + "grad_norm": 0.9063973426818848, + "learning_rate": 2.4016612178000726e-06, + "loss": 1.8206, + "mean_token_accuracy": 0.578654408454895, + "num_tokens": 8577776408.0, + "step": 16779 + }, + { + "epoch": 4.537587885343429, + "grad_norm": 0.8591316938400269, + "learning_rate": 2.401195641090988e-06, + "loss": 1.8717, + "mean_token_accuracy": 0.5767313241958618, + "num_tokens": 8578247512.0, + "step": 16780 + }, + { + "epoch": 4.537858301784748, + "grad_norm": 0.31499502062797546, + "learning_rate": 2.40073032821851e-06, + "loss": 1.1095, + "mean_token_accuracy": 0.7035381197929382, + "num_tokens": 8578771776.0, + "step": 16781 + }, + { + "epoch": 4.538128718226068, + "grad_norm": 0.9006611108779907, + "learning_rate": 2.400265279196913e-06, + "loss": 1.8658, + "mean_token_accuracy": 0.5763422846794128, + "num_tokens": 8579296048.0, + "step": 16782 + }, + { + "epoch": 4.5383991346673875, + "grad_norm": 0.9822134971618652, + "learning_rate": 2.399800494040469e-06, + "loss": 1.8951, + "mean_token_accuracy": 0.5614348649978638, + "num_tokens": 8579812481.0, + "step": 16783 + }, + { + "epoch": 4.538669551108708, + "grad_norm": 0.8403090238571167, + "learning_rate": 2.3993359727634372e-06, + "loss": 1.8616, + "mean_token_accuracy": 0.5721071362495422, + "num_tokens": 8580322301.0, + "step": 16784 + }, + { + "epoch": 4.538939967550027, + "grad_norm": 0.8871883749961853, + "learning_rate": 2.3988717153800696e-06, + "loss": 1.7747, + "mean_token_accuracy": 0.6073023080825806, + "num_tokens": 8580756896.0, + "step": 16785 + }, + { + "epoch": 4.539210383991347, + "grad_norm": 0.9100080132484436, + "learning_rate": 2.398407721904613e-06, + "loss": 1.7759, + "mean_token_accuracy": 0.5888679623603821, + "num_tokens": 8581281135.0, + "step": 16786 + }, + { + "epoch": 4.539480800432666, + "grad_norm": 0.7701008319854736, + "learning_rate": 2.397943992351303e-06, + "loss": 1.8073, + "mean_token_accuracy": 0.582153856754303, + "num_tokens": 8581805399.0, + "step": 16787 + }, + { + "epoch": 4.539751216873986, + "grad_norm": 0.9454638957977295, + "learning_rate": 2.3974805267343676e-06, + "loss": 1.8311, + "mean_token_accuracy": 0.5608140230178833, + "num_tokens": 8582329504.0, + "step": 16788 + }, + { + "epoch": 4.540021633315305, + "grad_norm": 0.9626410007476807, + "learning_rate": 2.3970173250680283e-06, + "loss": 1.9189, + "mean_token_accuracy": 0.5596654415130615, + "num_tokens": 8582853766.0, + "step": 16789 + }, + { + "epoch": 4.5402920497566255, + "grad_norm": 0.8534425497055054, + "learning_rate": 2.3965543873664975e-06, + "loss": 1.8705, + "mean_token_accuracy": 0.5663090944290161, + "num_tokens": 8583348879.0, + "step": 16790 + }, + { + "epoch": 4.540562466197945, + "grad_norm": 0.9068966507911682, + "learning_rate": 2.396091713643979e-06, + "loss": 1.9361, + "mean_token_accuracy": 0.5633764266967773, + "num_tokens": 8583873127.0, + "step": 16791 + }, + { + "epoch": 4.540832882639265, + "grad_norm": 0.8749712705612183, + "learning_rate": 2.395629303914669e-06, + "loss": 1.8639, + "mean_token_accuracy": 0.5735214948654175, + "num_tokens": 8584397355.0, + "step": 16792 + }, + { + "epoch": 4.541103299080584, + "grad_norm": 0.7772769331932068, + "learning_rate": 2.395167158192756e-06, + "loss": 1.852, + "mean_token_accuracy": 0.5730605125427246, + "num_tokens": 8584921520.0, + "step": 16793 + }, + { + "epoch": 4.541373715521904, + "grad_norm": 0.8221361637115479, + "learning_rate": 2.394705276492419e-06, + "loss": 1.8486, + "mean_token_accuracy": 0.5680505037307739, + "num_tokens": 8585445777.0, + "step": 16794 + }, + { + "epoch": 4.541644131963223, + "grad_norm": 0.9546669125556946, + "learning_rate": 2.3942436588278308e-06, + "loss": 1.7694, + "mean_token_accuracy": 0.591050386428833, + "num_tokens": 8585934728.0, + "step": 16795 + }, + { + "epoch": 4.541914548404543, + "grad_norm": 0.7415429949760437, + "learning_rate": 2.3937823052131553e-06, + "loss": 1.7538, + "mean_token_accuracy": 0.5833135843276978, + "num_tokens": 8586458917.0, + "step": 16796 + }, + { + "epoch": 4.5421849648458625, + "grad_norm": 0.7799370884895325, + "learning_rate": 2.3933212156625488e-06, + "loss": 1.7847, + "mean_token_accuracy": 0.578253448009491, + "num_tokens": 8586983168.0, + "step": 16797 + }, + { + "epoch": 4.542455381287183, + "grad_norm": 0.7844311594963074, + "learning_rate": 2.3928603901901573e-06, + "loss": 1.8161, + "mean_token_accuracy": 0.588427722454071, + "num_tokens": 8587507427.0, + "step": 16798 + }, + { + "epoch": 4.542725797728502, + "grad_norm": 0.8479924201965332, + "learning_rate": 2.392399828810121e-06, + "loss": 1.8871, + "mean_token_accuracy": 0.5691864490509033, + "num_tokens": 8587995804.0, + "step": 16799 + }, + { + "epoch": 4.542996214169822, + "grad_norm": 0.8229315876960754, + "learning_rate": 2.391939531536571e-06, + "loss": 1.8332, + "mean_token_accuracy": 0.5837924480438232, + "num_tokens": 8588519961.0, + "step": 16800 + }, + { + "epoch": 4.543266630611141, + "grad_norm": 0.31863805651664734, + "learning_rate": 2.3914794983836317e-06, + "loss": 1.0787, + "mean_token_accuracy": 0.71031653881073, + "num_tokens": 8589044225.0, + "step": 16801 + }, + { + "epoch": 4.54353704705246, + "grad_norm": 0.8869648575782776, + "learning_rate": 2.3910197293654163e-06, + "loss": 1.8676, + "mean_token_accuracy": 0.5646716356277466, + "num_tokens": 8589568389.0, + "step": 16802 + }, + { + "epoch": 4.54380746349378, + "grad_norm": 0.9378745555877686, + "learning_rate": 2.3905602244960337e-06, + "loss": 1.8374, + "mean_token_accuracy": 0.5834107398986816, + "num_tokens": 8590075800.0, + "step": 16803 + }, + { + "epoch": 4.5440778799351005, + "grad_norm": 0.9157971739768982, + "learning_rate": 2.3901009837895817e-06, + "loss": 1.7494, + "mean_token_accuracy": 0.6044089794158936, + "num_tokens": 8590599961.0, + "step": 16804 + }, + { + "epoch": 4.54434829637642, + "grad_norm": 0.9740874171257019, + "learning_rate": 2.3896420072601513e-06, + "loss": 1.7601, + "mean_token_accuracy": 0.568863034248352, + "num_tokens": 8591124186.0, + "step": 16805 + }, + { + "epoch": 4.544618712817739, + "grad_norm": 0.7392487525939941, + "learning_rate": 2.3891832949218265e-06, + "loss": 1.8417, + "mean_token_accuracy": 0.5845965147018433, + "num_tokens": 8591648327.0, + "step": 16806 + }, + { + "epoch": 4.544889129259059, + "grad_norm": 0.754852294921875, + "learning_rate": 2.3887248467886815e-06, + "loss": 1.6343, + "mean_token_accuracy": 0.591720700263977, + "num_tokens": 8592172587.0, + "step": 16807 + }, + { + "epoch": 4.545159545700379, + "grad_norm": 0.8329135179519653, + "learning_rate": 2.3882666628747797e-06, + "loss": 1.7593, + "mean_token_accuracy": 0.5970344543457031, + "num_tokens": 8592641746.0, + "step": 16808 + }, + { + "epoch": 4.545429962141698, + "grad_norm": 0.8181179761886597, + "learning_rate": 2.3878087431941835e-06, + "loss": 1.8658, + "mean_token_accuracy": 0.5781579613685608, + "num_tokens": 8593165909.0, + "step": 16809 + }, + { + "epoch": 4.5457003785830175, + "grad_norm": 0.8870779871940613, + "learning_rate": 2.3873510877609426e-06, + "loss": 1.8089, + "mean_token_accuracy": 0.5918024182319641, + "num_tokens": 8593627547.0, + "step": 16810 + }, + { + "epoch": 4.5459707950243375, + "grad_norm": 0.8151482939720154, + "learning_rate": 2.386893696589097e-06, + "loss": 1.8812, + "mean_token_accuracy": 0.574657142162323, + "num_tokens": 8594151706.0, + "step": 16811 + }, + { + "epoch": 4.546241211465657, + "grad_norm": 0.8686349987983704, + "learning_rate": 2.3864365696926838e-06, + "loss": 1.8501, + "mean_token_accuracy": 0.5622522830963135, + "num_tokens": 8594625944.0, + "step": 16812 + }, + { + "epoch": 4.546511627906977, + "grad_norm": 0.9336299300193787, + "learning_rate": 2.3859797070857272e-06, + "loss": 1.793, + "mean_token_accuracy": 0.5937061309814453, + "num_tokens": 8595150162.0, + "step": 16813 + }, + { + "epoch": 4.546782044348296, + "grad_norm": 0.9153165817260742, + "learning_rate": 2.3855231087822445e-06, + "loss": 1.7806, + "mean_token_accuracy": 0.5732294917106628, + "num_tokens": 8595674358.0, + "step": 16814 + }, + { + "epoch": 4.547052460789616, + "grad_norm": 1.0335599184036255, + "learning_rate": 2.385066774796248e-06, + "loss": 1.8954, + "mean_token_accuracy": 0.5658539533615112, + "num_tokens": 8596177659.0, + "step": 16815 + }, + { + "epoch": 4.547322877230935, + "grad_norm": 0.9444605708122253, + "learning_rate": 2.384610705141735e-06, + "loss": 1.889, + "mean_token_accuracy": 0.5664457082748413, + "num_tokens": 8596701878.0, + "step": 16816 + }, + { + "epoch": 4.547593293672255, + "grad_norm": 0.7976294755935669, + "learning_rate": 2.3841548998327045e-06, + "loss": 1.5042, + "mean_token_accuracy": 0.6603143215179443, + "num_tokens": 8597168839.0, + "step": 16817 + }, + { + "epoch": 4.547863710113575, + "grad_norm": 0.9469597339630127, + "learning_rate": 2.3836993588831383e-06, + "loss": 1.8868, + "mean_token_accuracy": 0.5810592770576477, + "num_tokens": 8597678557.0, + "step": 16818 + }, + { + "epoch": 4.548134126554895, + "grad_norm": 0.8428452610969543, + "learning_rate": 2.3832440823070134e-06, + "loss": 1.7887, + "mean_token_accuracy": 0.5789521336555481, + "num_tokens": 8598202675.0, + "step": 16819 + }, + { + "epoch": 4.548404542996214, + "grad_norm": 0.7928521037101746, + "learning_rate": 2.3827890701183024e-06, + "loss": 1.8422, + "mean_token_accuracy": 0.5808354020118713, + "num_tokens": 8598726799.0, + "step": 16820 + }, + { + "epoch": 4.548674959437534, + "grad_norm": 0.31065818667411804, + "learning_rate": 2.382334322330964e-06, + "loss": 1.0334, + "mean_token_accuracy": 0.7183167934417725, + "num_tokens": 8599250868.0, + "step": 16821 + }, + { + "epoch": 4.548945375878853, + "grad_norm": 0.8681315779685974, + "learning_rate": 2.3818798389589496e-06, + "loss": 1.8557, + "mean_token_accuracy": 0.5725980401039124, + "num_tokens": 8599769755.0, + "step": 16822 + }, + { + "epoch": 4.549215792320173, + "grad_norm": 0.8727952837944031, + "learning_rate": 2.3814256200162062e-06, + "loss": 1.8591, + "mean_token_accuracy": 0.5624794960021973, + "num_tokens": 8600293975.0, + "step": 16823 + }, + { + "epoch": 4.5494862087614925, + "grad_norm": 0.8112176060676575, + "learning_rate": 2.3809716655166725e-06, + "loss": 1.8585, + "mean_token_accuracy": 0.575760006904602, + "num_tokens": 8600818116.0, + "step": 16824 + }, + { + "epoch": 4.5497566252028125, + "grad_norm": 0.7838990688323975, + "learning_rate": 2.380517975474273e-06, + "loss": 1.7219, + "mean_token_accuracy": 0.6093105673789978, + "num_tokens": 8601342342.0, + "step": 16825 + }, + { + "epoch": 4.550027041644132, + "grad_norm": 0.7676767706871033, + "learning_rate": 2.3800645499029315e-06, + "loss": 1.9071, + "mean_token_accuracy": 0.5599567890167236, + "num_tokens": 8601866567.0, + "step": 16826 + }, + { + "epoch": 4.550297458085452, + "grad_norm": 0.7527949213981628, + "learning_rate": 2.379611388816559e-06, + "loss": 1.8651, + "mean_token_accuracy": 0.5549832582473755, + "num_tokens": 8602390796.0, + "step": 16827 + }, + { + "epoch": 4.550567874526771, + "grad_norm": 0.9779722094535828, + "learning_rate": 2.379158492229059e-06, + "loss": 1.8276, + "mean_token_accuracy": 0.5952040553092957, + "num_tokens": 8602914974.0, + "step": 16828 + }, + { + "epoch": 4.550838290968091, + "grad_norm": 0.8373638987541199, + "learning_rate": 2.37870586015433e-06, + "loss": 1.7277, + "mean_token_accuracy": 0.603854238986969, + "num_tokens": 8603383480.0, + "step": 16829 + }, + { + "epoch": 4.55110870740941, + "grad_norm": 0.7663862109184265, + "learning_rate": 2.3782534926062583e-06, + "loss": 1.8687, + "mean_token_accuracy": 0.5756329894065857, + "num_tokens": 8603907630.0, + "step": 16830 + }, + { + "epoch": 4.55137912385073, + "grad_norm": 0.8615061044692993, + "learning_rate": 2.3778013895987246e-06, + "loss": 1.8523, + "mean_token_accuracy": 0.5658299922943115, + "num_tokens": 8604404639.0, + "step": 16831 + }, + { + "epoch": 4.55164954029205, + "grad_norm": 0.7622641921043396, + "learning_rate": 2.3773495511456e-06, + "loss": 1.6192, + "mean_token_accuracy": 0.6350172758102417, + "num_tokens": 8604928866.0, + "step": 16832 + }, + { + "epoch": 4.55191995673337, + "grad_norm": 0.8357557654380798, + "learning_rate": 2.376897977260749e-06, + "loss": 1.8035, + "mean_token_accuracy": 0.5723086595535278, + "num_tokens": 8605452959.0, + "step": 16833 + }, + { + "epoch": 4.552190373174689, + "grad_norm": 0.8585945963859558, + "learning_rate": 2.376446667958027e-06, + "loss": 1.9153, + "mean_token_accuracy": 0.5594931840896606, + "num_tokens": 8605969719.0, + "step": 16834 + }, + { + "epoch": 4.552460789616009, + "grad_norm": 0.9626337885856628, + "learning_rate": 2.375995623251282e-06, + "loss": 1.8361, + "mean_token_accuracy": 0.5860323905944824, + "num_tokens": 8606493980.0, + "step": 16835 + }, + { + "epoch": 4.552731206057328, + "grad_norm": 0.8075684309005737, + "learning_rate": 2.375544843154351e-06, + "loss": 1.8229, + "mean_token_accuracy": 0.5851923227310181, + "num_tokens": 8607018237.0, + "step": 16836 + }, + { + "epoch": 4.553001622498648, + "grad_norm": 0.8287628889083862, + "learning_rate": 2.375094327681069e-06, + "loss": 1.8948, + "mean_token_accuracy": 0.5775855779647827, + "num_tokens": 8607542509.0, + "step": 16837 + }, + { + "epoch": 4.5532720389399675, + "grad_norm": 1.001367449760437, + "learning_rate": 2.374644076845256e-06, + "loss": 1.7341, + "mean_token_accuracy": 0.5942363739013672, + "num_tokens": 8608066625.0, + "step": 16838 + }, + { + "epoch": 4.5535424553812875, + "grad_norm": 0.8847835659980774, + "learning_rate": 2.3741940906607276e-06, + "loss": 1.747, + "mean_token_accuracy": 0.5904284715652466, + "num_tokens": 8608590784.0, + "step": 16839 + }, + { + "epoch": 4.553812871822607, + "grad_norm": 0.871610164642334, + "learning_rate": 2.3737443691412924e-06, + "loss": 1.7436, + "mean_token_accuracy": 0.6124692559242249, + "num_tokens": 8609086777.0, + "step": 16840 + }, + { + "epoch": 4.554083288263927, + "grad_norm": 0.3595159351825714, + "learning_rate": 2.373294912300749e-06, + "loss": 1.0396, + "mean_token_accuracy": 0.7196173667907715, + "num_tokens": 8609611015.0, + "step": 16841 + }, + { + "epoch": 4.554353704705246, + "grad_norm": 0.9358869194984436, + "learning_rate": 2.3728457201528863e-06, + "loss": 1.8325, + "mean_token_accuracy": 0.5730156898498535, + "num_tokens": 8610135204.0, + "step": 16842 + }, + { + "epoch": 4.554624121146565, + "grad_norm": 0.9175968766212463, + "learning_rate": 2.372396792711487e-06, + "loss": 1.8152, + "mean_token_accuracy": 0.5645980834960938, + "num_tokens": 8610659398.0, + "step": 16843 + }, + { + "epoch": 4.554894537587885, + "grad_norm": 0.872894287109375, + "learning_rate": 2.371948129990328e-06, + "loss": 1.8378, + "mean_token_accuracy": 0.5583492517471313, + "num_tokens": 8611143362.0, + "step": 16844 + }, + { + "epoch": 4.555164954029205, + "grad_norm": 0.8519068956375122, + "learning_rate": 2.3714997320031725e-06, + "loss": 1.8348, + "mean_token_accuracy": 0.5761090517044067, + "num_tokens": 8611667637.0, + "step": 16845 + }, + { + "epoch": 4.555435370470525, + "grad_norm": 0.8705888986587524, + "learning_rate": 2.3710515987637806e-06, + "loss": 1.8286, + "mean_token_accuracy": 0.5788267850875854, + "num_tokens": 8612191768.0, + "step": 16846 + }, + { + "epoch": 4.555705786911844, + "grad_norm": 0.8523511290550232, + "learning_rate": 2.3706037302859016e-06, + "loss": 1.7706, + "mean_token_accuracy": 0.5889726281166077, + "num_tokens": 8612715974.0, + "step": 16847 + }, + { + "epoch": 4.555976203353164, + "grad_norm": 0.878427267074585, + "learning_rate": 2.3701561265832767e-06, + "loss": 1.8443, + "mean_token_accuracy": 0.5807710289955139, + "num_tokens": 8613240203.0, + "step": 16848 + }, + { + "epoch": 4.556246619794484, + "grad_norm": 0.8709642291069031, + "learning_rate": 2.3697087876696424e-06, + "loss": 1.8214, + "mean_token_accuracy": 0.5703381896018982, + "num_tokens": 8613764470.0, + "step": 16849 + }, + { + "epoch": 4.556517036235803, + "grad_norm": 0.9140332341194153, + "learning_rate": 2.3692617135587206e-06, + "loss": 1.83, + "mean_token_accuracy": 0.5653303861618042, + "num_tokens": 8614288745.0, + "step": 16850 + }, + { + "epoch": 4.556787452677122, + "grad_norm": 0.8925608992576599, + "learning_rate": 2.368814904264231e-06, + "loss": 1.8533, + "mean_token_accuracy": 0.5734416246414185, + "num_tokens": 8614812776.0, + "step": 16851 + }, + { + "epoch": 4.5570578691184425, + "grad_norm": 1.010492205619812, + "learning_rate": 2.3683683597998826e-06, + "loss": 1.8399, + "mean_token_accuracy": 0.5593528151512146, + "num_tokens": 8615337054.0, + "step": 16852 + }, + { + "epoch": 4.557328285559762, + "grad_norm": 0.9498529434204102, + "learning_rate": 2.3679220801793786e-06, + "loss": 1.8133, + "mean_token_accuracy": 0.562862753868103, + "num_tokens": 8615861126.0, + "step": 16853 + }, + { + "epoch": 4.557598702001082, + "grad_norm": 0.9242454171180725, + "learning_rate": 2.3674760654164087e-06, + "loss": 1.7503, + "mean_token_accuracy": 0.5885477066040039, + "num_tokens": 8616385310.0, + "step": 16854 + }, + { + "epoch": 4.557869118442401, + "grad_norm": 0.8130989670753479, + "learning_rate": 2.367030315524661e-06, + "loss": 1.8127, + "mean_token_accuracy": 0.5792140960693359, + "num_tokens": 8616909498.0, + "step": 16855 + }, + { + "epoch": 4.558139534883721, + "grad_norm": 1.086971640586853, + "learning_rate": 2.3665848305178117e-06, + "loss": 1.901, + "mean_token_accuracy": 0.5556337833404541, + "num_tokens": 8617388432.0, + "step": 16856 + }, + { + "epoch": 4.55840995132504, + "grad_norm": 0.8630415201187134, + "learning_rate": 2.366139610409527e-06, + "loss": 1.8122, + "mean_token_accuracy": 0.561981737613678, + "num_tokens": 8617912663.0, + "step": 16857 + }, + { + "epoch": 4.55868036776636, + "grad_norm": 0.8520923852920532, + "learning_rate": 2.3656946552134708e-06, + "loss": 1.8981, + "mean_token_accuracy": 0.5683441162109375, + "num_tokens": 8618421444.0, + "step": 16858 + }, + { + "epoch": 4.5589507842076795, + "grad_norm": 0.7785071730613708, + "learning_rate": 2.3652499649432947e-06, + "loss": 1.8444, + "mean_token_accuracy": 0.564562201499939, + "num_tokens": 8618924455.0, + "step": 16859 + }, + { + "epoch": 4.559221200649, + "grad_norm": 0.8986153602600098, + "learning_rate": 2.3648055396126416e-06, + "loss": 1.9269, + "mean_token_accuracy": 0.5868530869483948, + "num_tokens": 8619384200.0, + "step": 16860 + }, + { + "epoch": 4.559491617090319, + "grad_norm": 0.36536699533462524, + "learning_rate": 2.3643613792351503e-06, + "loss": 1.0299, + "mean_token_accuracy": 0.7189100980758667, + "num_tokens": 8619850392.0, + "step": 16861 + }, + { + "epoch": 4.559762033531639, + "grad_norm": 0.8387473225593567, + "learning_rate": 2.3639174838244463e-06, + "loss": 1.8164, + "mean_token_accuracy": 0.5813307762145996, + "num_tokens": 8620374669.0, + "step": 16862 + }, + { + "epoch": 4.560032449972958, + "grad_norm": 0.9065870046615601, + "learning_rate": 2.363473853394152e-06, + "loss": 1.7515, + "mean_token_accuracy": 0.6136783957481384, + "num_tokens": 8620810179.0, + "step": 16863 + }, + { + "epoch": 4.560302866414278, + "grad_norm": 0.9601135849952698, + "learning_rate": 2.3630304879578785e-06, + "loss": 1.8544, + "mean_token_accuracy": 0.5696645975112915, + "num_tokens": 8621334447.0, + "step": 16864 + }, + { + "epoch": 4.560573282855597, + "grad_norm": 0.8144505023956299, + "learning_rate": 2.362587387529228e-06, + "loss": 1.7701, + "mean_token_accuracy": 0.5934851169586182, + "num_tokens": 8621858705.0, + "step": 16865 + }, + { + "epoch": 4.5608436992969175, + "grad_norm": 0.8936178088188171, + "learning_rate": 2.3621445521217975e-06, + "loss": 1.8346, + "mean_token_accuracy": 0.5877267718315125, + "num_tokens": 8622372600.0, + "step": 16866 + }, + { + "epoch": 4.561114115738237, + "grad_norm": 1.0870405435562134, + "learning_rate": 2.3617019817491753e-06, + "loss": 1.7865, + "mean_token_accuracy": 0.5834928750991821, + "num_tokens": 8622834276.0, + "step": 16867 + }, + { + "epoch": 4.561384532179557, + "grad_norm": 1.2497532367706299, + "learning_rate": 2.361259676424939e-06, + "loss": 1.8046, + "mean_token_accuracy": 0.5753927230834961, + "num_tokens": 8623358539.0, + "step": 16868 + }, + { + "epoch": 4.561654948620876, + "grad_norm": 0.9740030169487, + "learning_rate": 2.3608176361626624e-06, + "loss": 1.7613, + "mean_token_accuracy": 0.5877034664154053, + "num_tokens": 8623829353.0, + "step": 16869 + }, + { + "epoch": 4.561925365062196, + "grad_norm": 0.9001164436340332, + "learning_rate": 2.3603758609759056e-06, + "loss": 1.738, + "mean_token_accuracy": 0.595260500907898, + "num_tokens": 8624353582.0, + "step": 16870 + }, + { + "epoch": 4.562195781503515, + "grad_norm": 1.0603821277618408, + "learning_rate": 2.359934350878225e-06, + "loss": 1.8239, + "mean_token_accuracy": 0.5724985599517822, + "num_tokens": 8624877751.0, + "step": 16871 + }, + { + "epoch": 4.562466197944835, + "grad_norm": 1.1180840730667114, + "learning_rate": 2.359493105883167e-06, + "loss": 1.8961, + "mean_token_accuracy": 0.560638427734375, + "num_tokens": 8625401966.0, + "step": 16872 + }, + { + "epoch": 4.5627366143861545, + "grad_norm": 0.9538993239402771, + "learning_rate": 2.359052126004272e-06, + "loss": 1.5825, + "mean_token_accuracy": 0.6162533760070801, + "num_tokens": 8625926085.0, + "step": 16873 + }, + { + "epoch": 4.563007030827475, + "grad_norm": 0.8487831950187683, + "learning_rate": 2.358611411255067e-06, + "loss": 1.8905, + "mean_token_accuracy": 0.5640268921852112, + "num_tokens": 8626449688.0, + "step": 16874 + }, + { + "epoch": 4.563277447268794, + "grad_norm": 1.0315544605255127, + "learning_rate": 2.358170961649078e-06, + "loss": 1.7037, + "mean_token_accuracy": 0.6195625066757202, + "num_tokens": 8626973959.0, + "step": 16875 + }, + { + "epoch": 4.563547863710114, + "grad_norm": 0.9509431719779968, + "learning_rate": 2.357730777199818e-06, + "loss": 1.7928, + "mean_token_accuracy": 0.5964680314064026, + "num_tokens": 8627458513.0, + "step": 16876 + }, + { + "epoch": 4.563818280151433, + "grad_norm": 1.092602014541626, + "learning_rate": 2.357290857920792e-06, + "loss": 1.8176, + "mean_token_accuracy": 0.5710640549659729, + "num_tokens": 8627982783.0, + "step": 16877 + }, + { + "epoch": 4.564088696592753, + "grad_norm": 1.0795249938964844, + "learning_rate": 2.3568512038255005e-06, + "loss": 1.8342, + "mean_token_accuracy": 0.5875952243804932, + "num_tokens": 8628507064.0, + "step": 16878 + }, + { + "epoch": 4.564359113034072, + "grad_norm": 0.848041296005249, + "learning_rate": 2.3564118149274302e-06, + "loss": 1.7581, + "mean_token_accuracy": 0.5922986268997192, + "num_tokens": 8629031221.0, + "step": 16879 + }, + { + "epoch": 4.5646295294753925, + "grad_norm": 0.8414391875267029, + "learning_rate": 2.355972691240065e-06, + "loss": 1.8674, + "mean_token_accuracy": 0.5662463307380676, + "num_tokens": 8629555431.0, + "step": 16880 + }, + { + "epoch": 4.564899945916712, + "grad_norm": 0.35870465636253357, + "learning_rate": 2.355533832776879e-06, + "loss": 1.1185, + "mean_token_accuracy": 0.7060391902923584, + "num_tokens": 8630079627.0, + "step": 16881 + }, + { + "epoch": 4.565170362358032, + "grad_norm": 1.0010712146759033, + "learning_rate": 2.355095239551336e-06, + "loss": 1.7472, + "mean_token_accuracy": 0.5781759023666382, + "num_tokens": 8630603891.0, + "step": 16882 + }, + { + "epoch": 4.565440778799351, + "grad_norm": 0.9688768982887268, + "learning_rate": 2.3546569115768944e-06, + "loss": 1.749, + "mean_token_accuracy": 0.594355583190918, + "num_tokens": 8631128038.0, + "step": 16883 + }, + { + "epoch": 4.56571119524067, + "grad_norm": 1.188058853149414, + "learning_rate": 2.354218848867003e-06, + "loss": 1.9358, + "mean_token_accuracy": 0.5773416757583618, + "num_tokens": 8631593120.0, + "step": 16884 + }, + { + "epoch": 4.56598161168199, + "grad_norm": 1.0805827379226685, + "learning_rate": 2.3537810514351016e-06, + "loss": 1.8724, + "mean_token_accuracy": 0.554233193397522, + "num_tokens": 8632117252.0, + "step": 16885 + }, + { + "epoch": 4.56625202812331, + "grad_norm": 0.8104338049888611, + "learning_rate": 2.3533435192946264e-06, + "loss": 1.695, + "mean_token_accuracy": 0.6165725588798523, + "num_tokens": 8632619062.0, + "step": 16886 + }, + { + "epoch": 4.5665224445646295, + "grad_norm": 0.853655219078064, + "learning_rate": 2.3529062524589997e-06, + "loss": 1.7865, + "mean_token_accuracy": 0.59051114320755, + "num_tokens": 8633143347.0, + "step": 16887 + }, + { + "epoch": 4.566792861005949, + "grad_norm": 1.0470670461654663, + "learning_rate": 2.352469250941638e-06, + "loss": 2.0038, + "mean_token_accuracy": 0.5465845465660095, + "num_tokens": 8633667462.0, + "step": 16888 + }, + { + "epoch": 4.567063277447269, + "grad_norm": 0.9142959713935852, + "learning_rate": 2.3520325147559507e-06, + "loss": 1.7451, + "mean_token_accuracy": 0.5939508676528931, + "num_tokens": 8634191734.0, + "step": 16889 + }, + { + "epoch": 4.567333693888589, + "grad_norm": 0.9053276181221008, + "learning_rate": 2.3515960439153387e-06, + "loss": 1.9105, + "mean_token_accuracy": 0.567080020904541, + "num_tokens": 8634715907.0, + "step": 16890 + }, + { + "epoch": 4.567604110329908, + "grad_norm": 0.8449053764343262, + "learning_rate": 2.351159838433192e-06, + "loss": 1.7795, + "mean_token_accuracy": 0.5834976434707642, + "num_tokens": 8635240167.0, + "step": 16891 + }, + { + "epoch": 4.567874526771227, + "grad_norm": 0.7640904188156128, + "learning_rate": 2.3507238983228973e-06, + "loss": 1.8367, + "mean_token_accuracy": 0.5665974617004395, + "num_tokens": 8635764192.0, + "step": 16892 + }, + { + "epoch": 4.568144943212547, + "grad_norm": 0.9238986372947693, + "learning_rate": 2.3502882235978273e-06, + "loss": 1.8668, + "mean_token_accuracy": 0.5610449314117432, + "num_tokens": 8636288284.0, + "step": 16893 + }, + { + "epoch": 4.568415359653867, + "grad_norm": 1.3455100059509277, + "learning_rate": 2.349852814271354e-06, + "loss": 1.8834, + "mean_token_accuracy": 0.5853407382965088, + "num_tokens": 8636750365.0, + "step": 16894 + }, + { + "epoch": 4.568685776095187, + "grad_norm": 0.7927342057228088, + "learning_rate": 2.349417670356833e-06, + "loss": 1.7348, + "mean_token_accuracy": 0.5979070663452148, + "num_tokens": 8637274611.0, + "step": 16895 + }, + { + "epoch": 4.568956192536506, + "grad_norm": 0.8693670034408569, + "learning_rate": 2.348982791867618e-06, + "loss": 1.9104, + "mean_token_accuracy": 0.5661697387695312, + "num_tokens": 8637788542.0, + "step": 16896 + }, + { + "epoch": 4.569226608977826, + "grad_norm": 0.930347204208374, + "learning_rate": 2.3485481788170523e-06, + "loss": 1.7694, + "mean_token_accuracy": 0.6019273996353149, + "num_tokens": 8638267833.0, + "step": 16897 + }, + { + "epoch": 4.569497025419145, + "grad_norm": 0.735284686088562, + "learning_rate": 2.3481138312184713e-06, + "loss": 1.6558, + "mean_token_accuracy": 0.6236435174942017, + "num_tokens": 8638763688.0, + "step": 16898 + }, + { + "epoch": 4.569767441860465, + "grad_norm": 0.8684348464012146, + "learning_rate": 2.347679749085201e-06, + "loss": 1.9503, + "mean_token_accuracy": 0.5646413564682007, + "num_tokens": 8639287935.0, + "step": 16899 + }, + { + "epoch": 4.570037858301784, + "grad_norm": 0.839653491973877, + "learning_rate": 2.3472459324305607e-06, + "loss": 1.8438, + "mean_token_accuracy": 0.5751278400421143, + "num_tokens": 8639812216.0, + "step": 16900 + }, + { + "epoch": 4.5703082747431045, + "grad_norm": 0.35831883549690247, + "learning_rate": 2.346812381267861e-06, + "loss": 1.0284, + "mean_token_accuracy": 0.7194700241088867, + "num_tokens": 8640273170.0, + "step": 16901 + }, + { + "epoch": 4.570578691184424, + "grad_norm": 0.9284432530403137, + "learning_rate": 2.346379095610406e-06, + "loss": 1.8187, + "mean_token_accuracy": 0.5752383470535278, + "num_tokens": 8640797371.0, + "step": 16902 + }, + { + "epoch": 4.570849107625744, + "grad_norm": 0.9998489022254944, + "learning_rate": 2.3459460754714876e-06, + "loss": 1.7781, + "mean_token_accuracy": 0.589559018611908, + "num_tokens": 8641274181.0, + "step": 16903 + }, + { + "epoch": 4.571119524067063, + "grad_norm": 0.8354844450950623, + "learning_rate": 2.345513320864394e-06, + "loss": 1.9293, + "mean_token_accuracy": 0.5499958992004395, + "num_tokens": 8641798426.0, + "step": 16904 + }, + { + "epoch": 4.571389940508383, + "grad_norm": 0.8446055054664612, + "learning_rate": 2.345080831802404e-06, + "loss": 1.7329, + "mean_token_accuracy": 0.5990185737609863, + "num_tokens": 8642290133.0, + "step": 16905 + }, + { + "epoch": 4.571660356949702, + "grad_norm": 0.6782953143119812, + "learning_rate": 2.3446486082987846e-06, + "loss": 1.7346, + "mean_token_accuracy": 0.5968799591064453, + "num_tokens": 8642814406.0, + "step": 16906 + }, + { + "epoch": 4.571930773391022, + "grad_norm": 0.8788003325462341, + "learning_rate": 2.3442166503668012e-06, + "loss": 1.7547, + "mean_token_accuracy": 0.5795190930366516, + "num_tokens": 8643338528.0, + "step": 16907 + }, + { + "epoch": 4.572201189832342, + "grad_norm": 1.0470138788223267, + "learning_rate": 2.3437849580197043e-06, + "loss": 1.9807, + "mean_token_accuracy": 0.5582658052444458, + "num_tokens": 8643810407.0, + "step": 16908 + }, + { + "epoch": 4.572471606273662, + "grad_norm": 0.8400087952613831, + "learning_rate": 2.3433535312707413e-06, + "loss": 1.737, + "mean_token_accuracy": 0.6029033660888672, + "num_tokens": 8644334677.0, + "step": 16909 + }, + { + "epoch": 4.572742022714981, + "grad_norm": 0.7472385168075562, + "learning_rate": 2.342922370133151e-06, + "loss": 1.8735, + "mean_token_accuracy": 0.5780599117279053, + "num_tokens": 8644858793.0, + "step": 16910 + }, + { + "epoch": 4.573012439156301, + "grad_norm": 0.7032033205032349, + "learning_rate": 2.34249147462016e-06, + "loss": 1.8061, + "mean_token_accuracy": 0.5818735957145691, + "num_tokens": 8645383030.0, + "step": 16911 + }, + { + "epoch": 4.57328285559762, + "grad_norm": 0.7660267949104309, + "learning_rate": 2.342060844744992e-06, + "loss": 1.8832, + "mean_token_accuracy": 0.5695874691009521, + "num_tokens": 8645907305.0, + "step": 16912 + }, + { + "epoch": 4.57355327203894, + "grad_norm": 0.787283182144165, + "learning_rate": 2.3416304805208573e-06, + "loss": 1.7853, + "mean_token_accuracy": 0.5865347385406494, + "num_tokens": 8646431523.0, + "step": 16913 + }, + { + "epoch": 4.573823688480259, + "grad_norm": 0.7942098379135132, + "learning_rate": 2.341200381960962e-06, + "loss": 1.7874, + "mean_token_accuracy": 0.5906670093536377, + "num_tokens": 8646955775.0, + "step": 16914 + }, + { + "epoch": 4.5740941049215795, + "grad_norm": 0.9249874949455261, + "learning_rate": 2.340770549078504e-06, + "loss": 1.6746, + "mean_token_accuracy": 0.5949810147285461, + "num_tokens": 8647479916.0, + "step": 16915 + }, + { + "epoch": 4.574364521362899, + "grad_norm": 0.9082019329071045, + "learning_rate": 2.340340981886671e-06, + "loss": 1.8308, + "mean_token_accuracy": 0.5737850666046143, + "num_tokens": 8647959906.0, + "step": 16916 + }, + { + "epoch": 4.574634937804219, + "grad_norm": 0.824685275554657, + "learning_rate": 2.339911680398641e-06, + "loss": 1.8545, + "mean_token_accuracy": 0.5812276601791382, + "num_tokens": 8648484137.0, + "step": 16917 + }, + { + "epoch": 4.574905354245538, + "grad_norm": 0.8344585299491882, + "learning_rate": 2.33948264462759e-06, + "loss": 1.8317, + "mean_token_accuracy": 0.5742802619934082, + "num_tokens": 8648958506.0, + "step": 16918 + }, + { + "epoch": 4.575175770686858, + "grad_norm": 0.957102358341217, + "learning_rate": 2.339053874586681e-06, + "loss": 1.8289, + "mean_token_accuracy": 0.5813291072845459, + "num_tokens": 8649449871.0, + "step": 16919 + }, + { + "epoch": 4.575446187128177, + "grad_norm": 0.8285776376724243, + "learning_rate": 2.3386253702890668e-06, + "loss": 1.718, + "mean_token_accuracy": 0.5766997337341309, + "num_tokens": 8649974099.0, + "step": 16920 + }, + { + "epoch": 4.575716603569497, + "grad_norm": 0.3508078455924988, + "learning_rate": 2.3381971317479e-06, + "loss": 1.0735, + "mean_token_accuracy": 0.7193211913108826, + "num_tokens": 8650488821.0, + "step": 16921 + }, + { + "epoch": 4.575987020010817, + "grad_norm": 1.0674482583999634, + "learning_rate": 2.3377691589763165e-06, + "loss": 1.5943, + "mean_token_accuracy": 0.6241618990898132, + "num_tokens": 8650917486.0, + "step": 16922 + }, + { + "epoch": 4.576257436452137, + "grad_norm": 0.8173161149024963, + "learning_rate": 2.3373414519874483e-06, + "loss": 1.7333, + "mean_token_accuracy": 0.5922096967697144, + "num_tokens": 8651441600.0, + "step": 16923 + }, + { + "epoch": 4.576527852893456, + "grad_norm": 0.8884282112121582, + "learning_rate": 2.336914010794422e-06, + "loss": 1.8314, + "mean_token_accuracy": 0.5723124742507935, + "num_tokens": 8651965769.0, + "step": 16924 + }, + { + "epoch": 4.576798269334775, + "grad_norm": 0.9468609690666199, + "learning_rate": 2.336486835410349e-06, + "loss": 2.0503, + "mean_token_accuracy": 0.5308105945587158, + "num_tokens": 8652489937.0, + "step": 16925 + }, + { + "epoch": 4.577068685776095, + "grad_norm": 0.778573751449585, + "learning_rate": 2.3360599258483367e-06, + "loss": 1.828, + "mean_token_accuracy": 0.5755712985992432, + "num_tokens": 8652991280.0, + "step": 16926 + }, + { + "epoch": 4.577339102217415, + "grad_norm": 0.8151898384094238, + "learning_rate": 2.3356332821214873e-06, + "loss": 1.8094, + "mean_token_accuracy": 0.564956545829773, + "num_tokens": 8653515531.0, + "step": 16927 + }, + { + "epoch": 4.5776095186587344, + "grad_norm": 0.8441987633705139, + "learning_rate": 2.3352069042428873e-06, + "loss": 1.7651, + "mean_token_accuracy": 0.5996717810630798, + "num_tokens": 8654039683.0, + "step": 16928 + }, + { + "epoch": 4.577879935100054, + "grad_norm": 0.9080830812454224, + "learning_rate": 2.3347807922256215e-06, + "loss": 1.9573, + "mean_token_accuracy": 0.5580737590789795, + "num_tokens": 8654563961.0, + "step": 16929 + }, + { + "epoch": 4.578150351541374, + "grad_norm": 0.7979499697685242, + "learning_rate": 2.334354946082764e-06, + "loss": 1.8156, + "mean_token_accuracy": 0.588326096534729, + "num_tokens": 8655088141.0, + "step": 16930 + }, + { + "epoch": 4.578420767982694, + "grad_norm": 0.776414155960083, + "learning_rate": 2.333929365827379e-06, + "loss": 1.8275, + "mean_token_accuracy": 0.5763964653015137, + "num_tokens": 8655612420.0, + "step": 16931 + }, + { + "epoch": 4.578691184424013, + "grad_norm": 0.7472061514854431, + "learning_rate": 2.333504051472528e-06, + "loss": 1.8592, + "mean_token_accuracy": 0.5823032855987549, + "num_tokens": 8656136583.0, + "step": 16932 + }, + { + "epoch": 4.578961600865332, + "grad_norm": 0.8564395904541016, + "learning_rate": 2.3330790030312605e-06, + "loss": 1.8616, + "mean_token_accuracy": 0.5601599812507629, + "num_tokens": 8656660676.0, + "step": 16933 + }, + { + "epoch": 4.579232017306652, + "grad_norm": 0.7733900547027588, + "learning_rate": 2.3326542205166144e-06, + "loss": 1.7137, + "mean_token_accuracy": 0.59717857837677, + "num_tokens": 8657121391.0, + "step": 16934 + }, + { + "epoch": 4.5795024337479715, + "grad_norm": 0.8374825716018677, + "learning_rate": 2.3322297039416268e-06, + "loss": 1.8347, + "mean_token_accuracy": 0.5714957118034363, + "num_tokens": 8657645532.0, + "step": 16935 + }, + { + "epoch": 4.579772850189292, + "grad_norm": 0.8147302865982056, + "learning_rate": 2.3318054533193217e-06, + "loss": 1.8063, + "mean_token_accuracy": 0.5658847689628601, + "num_tokens": 8658169721.0, + "step": 16936 + }, + { + "epoch": 4.580043266630611, + "grad_norm": 0.8231140971183777, + "learning_rate": 2.3313814686627172e-06, + "loss": 1.8692, + "mean_token_accuracy": 0.5712298154830933, + "num_tokens": 8658693923.0, + "step": 16937 + }, + { + "epoch": 4.580313683071931, + "grad_norm": 0.9064784646034241, + "learning_rate": 2.330957749984822e-06, + "loss": 1.9987, + "mean_token_accuracy": 0.5437716841697693, + "num_tokens": 8659166890.0, + "step": 16938 + }, + { + "epoch": 4.58058409951325, + "grad_norm": 0.7524129152297974, + "learning_rate": 2.3305342972986373e-06, + "loss": 1.7392, + "mean_token_accuracy": 0.5956240892410278, + "num_tokens": 8659691161.0, + "step": 16939 + }, + { + "epoch": 4.58085451595457, + "grad_norm": 0.9493122100830078, + "learning_rate": 2.3301111106171548e-06, + "loss": 1.9024, + "mean_token_accuracy": 0.5720895528793335, + "num_tokens": 8660172270.0, + "step": 16940 + }, + { + "epoch": 4.581124932395889, + "grad_norm": 0.35843804478645325, + "learning_rate": 2.329688189953361e-06, + "loss": 0.9702, + "mean_token_accuracy": 0.7226455211639404, + "num_tokens": 8660696508.0, + "step": 16941 + }, + { + "epoch": 4.5813953488372094, + "grad_norm": 0.8237316608428955, + "learning_rate": 2.3292655353202313e-06, + "loss": 1.7854, + "mean_token_accuracy": 0.5850383043289185, + "num_tokens": 8661156050.0, + "step": 16942 + }, + { + "epoch": 4.581665765278529, + "grad_norm": 0.8570128679275513, + "learning_rate": 2.3288431467307324e-06, + "loss": 1.9131, + "mean_token_accuracy": 0.5572454929351807, + "num_tokens": 8661680193.0, + "step": 16943 + }, + { + "epoch": 4.581936181719849, + "grad_norm": 0.8949848413467407, + "learning_rate": 2.328421024197827e-06, + "loss": 1.8658, + "mean_token_accuracy": 0.5607041120529175, + "num_tokens": 8662204317.0, + "step": 16944 + }, + { + "epoch": 4.582206598161168, + "grad_norm": 0.9010518193244934, + "learning_rate": 2.327999167734466e-06, + "loss": 1.8351, + "mean_token_accuracy": 0.5637935400009155, + "num_tokens": 8662722669.0, + "step": 16945 + }, + { + "epoch": 4.582477014602488, + "grad_norm": 0.926632285118103, + "learning_rate": 2.3275775773535937e-06, + "loss": 1.8645, + "mean_token_accuracy": 0.5748025178909302, + "num_tokens": 8663202394.0, + "step": 16946 + }, + { + "epoch": 4.582747431043807, + "grad_norm": 0.8026039004325867, + "learning_rate": 2.3271562530681446e-06, + "loss": 1.7931, + "mean_token_accuracy": 0.5773858428001404, + "num_tokens": 8663726636.0, + "step": 16947 + }, + { + "epoch": 4.583017847485127, + "grad_norm": 0.8785645365715027, + "learning_rate": 2.326735194891048e-06, + "loss": 1.9001, + "mean_token_accuracy": 0.5692219734191895, + "num_tokens": 8664250877.0, + "step": 16948 + }, + { + "epoch": 4.5832882639264465, + "grad_norm": 0.6981019377708435, + "learning_rate": 2.3263144028352202e-06, + "loss": 1.7849, + "mean_token_accuracy": 0.5724343061447144, + "num_tokens": 8664754348.0, + "step": 16949 + }, + { + "epoch": 4.583558680367767, + "grad_norm": 0.7888602018356323, + "learning_rate": 2.325893876913574e-06, + "loss": 1.825, + "mean_token_accuracy": 0.5754642486572266, + "num_tokens": 8665278621.0, + "step": 16950 + }, + { + "epoch": 4.583829096809086, + "grad_norm": 0.7831394076347351, + "learning_rate": 2.3254736171390137e-06, + "loss": 1.7549, + "mean_token_accuracy": 0.5922949314117432, + "num_tokens": 8665802734.0, + "step": 16951 + }, + { + "epoch": 4.584099513250406, + "grad_norm": 0.9354718327522278, + "learning_rate": 2.325053623524432e-06, + "loss": 1.8324, + "mean_token_accuracy": 0.5844455361366272, + "num_tokens": 8666326936.0, + "step": 16952 + }, + { + "epoch": 4.584369929691725, + "grad_norm": 0.8214250802993774, + "learning_rate": 2.324633896082718e-06, + "loss": 1.89, + "mean_token_accuracy": 0.5542280077934265, + "num_tokens": 8666851178.0, + "step": 16953 + }, + { + "epoch": 4.584640346133045, + "grad_norm": 0.7873017191886902, + "learning_rate": 2.324214434826748e-06, + "loss": 1.8962, + "mean_token_accuracy": 0.5811946392059326, + "num_tokens": 8667336476.0, + "step": 16954 + }, + { + "epoch": 4.584910762574364, + "grad_norm": 0.7690603137016296, + "learning_rate": 2.3237952397693925e-06, + "loss": 1.8321, + "mean_token_accuracy": 0.5810734033584595, + "num_tokens": 8667860738.0, + "step": 16955 + }, + { + "epoch": 4.5851811790156844, + "grad_norm": 0.7758432030677795, + "learning_rate": 2.3233763109235153e-06, + "loss": 1.8989, + "mean_token_accuracy": 0.5712118148803711, + "num_tokens": 8668384934.0, + "step": 16956 + }, + { + "epoch": 4.585451595457004, + "grad_norm": 0.8837285041809082, + "learning_rate": 2.3229576483019674e-06, + "loss": 1.8177, + "mean_token_accuracy": 0.5761094093322754, + "num_tokens": 8668909195.0, + "step": 16957 + }, + { + "epoch": 4.585722011898324, + "grad_norm": 0.8431158661842346, + "learning_rate": 2.322539251917597e-06, + "loss": 1.6461, + "mean_token_accuracy": 0.6032734513282776, + "num_tokens": 8669387369.0, + "step": 16958 + }, + { + "epoch": 4.585992428339643, + "grad_norm": 0.8714032173156738, + "learning_rate": 2.3221211217832425e-06, + "loss": 1.6965, + "mean_token_accuracy": 0.5794398784637451, + "num_tokens": 8669911534.0, + "step": 16959 + }, + { + "epoch": 4.586262844780963, + "grad_norm": 0.8704379200935364, + "learning_rate": 2.3217032579117297e-06, + "loss": 1.9146, + "mean_token_accuracy": 0.5740624666213989, + "num_tokens": 8670435803.0, + "step": 16960 + }, + { + "epoch": 4.586533261222282, + "grad_norm": 0.37053802609443665, + "learning_rate": 2.3212856603158838e-06, + "loss": 1.0535, + "mean_token_accuracy": 0.7048482894897461, + "num_tokens": 8670960057.0, + "step": 16961 + }, + { + "epoch": 4.586803677663602, + "grad_norm": 0.8874877095222473, + "learning_rate": 2.320868329008516e-06, + "loss": 1.7516, + "mean_token_accuracy": 0.5820837020874023, + "num_tokens": 8671484232.0, + "step": 16962 + }, + { + "epoch": 4.5870740941049215, + "grad_norm": 0.9348554015159607, + "learning_rate": 2.320451264002431e-06, + "loss": 1.8137, + "mean_token_accuracy": 0.5729242563247681, + "num_tokens": 8672008512.0, + "step": 16963 + }, + { + "epoch": 4.587344510546242, + "grad_norm": 0.9032049775123596, + "learning_rate": 2.320034465310427e-06, + "loss": 1.8751, + "mean_token_accuracy": 0.5785525441169739, + "num_tokens": 8672481351.0, + "step": 16964 + }, + { + "epoch": 4.587614926987561, + "grad_norm": 0.7481198310852051, + "learning_rate": 2.3196179329452905e-06, + "loss": 1.7959, + "mean_token_accuracy": 0.5716278553009033, + "num_tokens": 8673005585.0, + "step": 16965 + }, + { + "epoch": 4.58788534342888, + "grad_norm": 0.8251271843910217, + "learning_rate": 2.3192016669198033e-06, + "loss": 1.7778, + "mean_token_accuracy": 0.6001039147377014, + "num_tokens": 8673468188.0, + "step": 16966 + }, + { + "epoch": 4.5881557598702, + "grad_norm": 0.866506814956665, + "learning_rate": 2.318785667246738e-06, + "loss": 1.8326, + "mean_token_accuracy": 0.5705225467681885, + "num_tokens": 8673992402.0, + "step": 16967 + }, + { + "epoch": 4.58842617631152, + "grad_norm": 0.7405506372451782, + "learning_rate": 2.318369933938859e-06, + "loss": 1.7611, + "mean_token_accuracy": 0.5961579084396362, + "num_tokens": 8674516577.0, + "step": 16968 + }, + { + "epoch": 4.588696592752839, + "grad_norm": 0.7991982698440552, + "learning_rate": 2.3179544670089194e-06, + "loss": 1.7284, + "mean_token_accuracy": 0.5961869955062866, + "num_tokens": 8675040856.0, + "step": 16969 + }, + { + "epoch": 4.588967009194159, + "grad_norm": 0.7915910482406616, + "learning_rate": 2.3175392664696706e-06, + "loss": 1.9251, + "mean_token_accuracy": 0.5664932727813721, + "num_tokens": 8675565035.0, + "step": 16970 + }, + { + "epoch": 4.589237425635479, + "grad_norm": 0.8236741423606873, + "learning_rate": 2.3171243323338512e-06, + "loss": 1.8535, + "mean_token_accuracy": 0.5820907950401306, + "num_tokens": 8676088963.0, + "step": 16971 + }, + { + "epoch": 4.589507842076799, + "grad_norm": 0.8535026907920837, + "learning_rate": 2.3167096646141905e-06, + "loss": 1.8193, + "mean_token_accuracy": 0.5948096513748169, + "num_tokens": 8676560782.0, + "step": 16972 + }, + { + "epoch": 4.589778258518118, + "grad_norm": 0.7980737686157227, + "learning_rate": 2.316295263323414e-06, + "loss": 1.819, + "mean_token_accuracy": 0.586223840713501, + "num_tokens": 8677023647.0, + "step": 16973 + }, + { + "epoch": 4.590048674959437, + "grad_norm": 0.8227489590644836, + "learning_rate": 2.3158811284742354e-06, + "loss": 1.8367, + "mean_token_accuracy": 0.5693967938423157, + "num_tokens": 8677547645.0, + "step": 16974 + }, + { + "epoch": 4.590319091400757, + "grad_norm": 0.8519886136054993, + "learning_rate": 2.3154672600793623e-06, + "loss": 1.7131, + "mean_token_accuracy": 0.6053980588912964, + "num_tokens": 8678071918.0, + "step": 16975 + }, + { + "epoch": 4.590589507842076, + "grad_norm": 0.9488861560821533, + "learning_rate": 2.315053658151494e-06, + "loss": 1.8989, + "mean_token_accuracy": 0.5683059692382812, + "num_tokens": 8678535988.0, + "step": 16976 + }, + { + "epoch": 4.5908599242833965, + "grad_norm": 0.818937361240387, + "learning_rate": 2.314640322703319e-06, + "loss": 1.6973, + "mean_token_accuracy": 0.5825399160385132, + "num_tokens": 8679060188.0, + "step": 16977 + }, + { + "epoch": 4.591130340724716, + "grad_norm": 0.8033977746963501, + "learning_rate": 2.314227253747521e-06, + "loss": 1.8611, + "mean_token_accuracy": 0.5803571939468384, + "num_tokens": 8679584377.0, + "step": 16978 + }, + { + "epoch": 4.591400757166036, + "grad_norm": 0.778813898563385, + "learning_rate": 2.313814451296774e-06, + "loss": 1.7468, + "mean_token_accuracy": 0.5767934322357178, + "num_tokens": 8680108555.0, + "step": 16979 + }, + { + "epoch": 4.591671173607355, + "grad_norm": 0.895378828048706, + "learning_rate": 2.3134019153637448e-06, + "loss": 1.812, + "mean_token_accuracy": 0.5728548169136047, + "num_tokens": 8680632824.0, + "step": 16980 + }, + { + "epoch": 4.591941590048675, + "grad_norm": 0.3777411878108978, + "learning_rate": 2.3129896459610895e-06, + "loss": 1.0367, + "mean_token_accuracy": 0.7249035835266113, + "num_tokens": 8681157012.0, + "step": 16981 + }, + { + "epoch": 4.592212006489994, + "grad_norm": 0.8642489314079285, + "learning_rate": 2.3125776431014593e-06, + "loss": 1.8048, + "mean_token_accuracy": 0.5804237127304077, + "num_tokens": 8681615901.0, + "step": 16982 + }, + { + "epoch": 4.592482422931314, + "grad_norm": 0.9165647029876709, + "learning_rate": 2.312165906797494e-06, + "loss": 1.8122, + "mean_token_accuracy": 0.5828893780708313, + "num_tokens": 8682140168.0, + "step": 16983 + }, + { + "epoch": 4.592752839372634, + "grad_norm": 0.8257402777671814, + "learning_rate": 2.3117544370618303e-06, + "loss": 1.7744, + "mean_token_accuracy": 0.57584148645401, + "num_tokens": 8682664366.0, + "step": 16984 + }, + { + "epoch": 4.593023255813954, + "grad_norm": 0.9969569444656372, + "learning_rate": 2.3113432339070895e-06, + "loss": 1.9442, + "mean_token_accuracy": 0.5817028880119324, + "num_tokens": 8683029313.0, + "step": 16985 + }, + { + "epoch": 4.593293672255273, + "grad_norm": 0.7588747143745422, + "learning_rate": 2.3109322973458896e-06, + "loss": 1.7811, + "mean_token_accuracy": 0.5957651138305664, + "num_tokens": 8683553372.0, + "step": 16986 + }, + { + "epoch": 4.593564088696593, + "grad_norm": 0.8764251470565796, + "learning_rate": 2.3105216273908404e-06, + "loss": 1.8705, + "mean_token_accuracy": 0.5701850652694702, + "num_tokens": 8684077603.0, + "step": 16987 + }, + { + "epoch": 4.593834505137912, + "grad_norm": 0.7991173267364502, + "learning_rate": 2.3101112240545423e-06, + "loss": 1.7839, + "mean_token_accuracy": 0.5859208106994629, + "num_tokens": 8684601868.0, + "step": 16988 + }, + { + "epoch": 4.594104921579232, + "grad_norm": 0.8544787764549255, + "learning_rate": 2.3097010873495852e-06, + "loss": 1.6727, + "mean_token_accuracy": 0.6025753617286682, + "num_tokens": 8685125893.0, + "step": 16989 + }, + { + "epoch": 4.594375338020551, + "grad_norm": 0.884781002998352, + "learning_rate": 2.309291217288557e-06, + "loss": 1.9318, + "mean_token_accuracy": 0.5553327202796936, + "num_tokens": 8685650072.0, + "step": 16990 + }, + { + "epoch": 4.5946457544618715, + "grad_norm": 0.938885509967804, + "learning_rate": 2.308881613884032e-06, + "loss": 1.7752, + "mean_token_accuracy": 0.5994927287101746, + "num_tokens": 8686174121.0, + "step": 16991 + }, + { + "epoch": 4.594916170903191, + "grad_norm": 0.758450448513031, + "learning_rate": 2.3084722771485765e-06, + "loss": 1.8936, + "mean_token_accuracy": 0.5614482164382935, + "num_tokens": 8686698258.0, + "step": 16992 + }, + { + "epoch": 4.595186587344511, + "grad_norm": 0.9779729843139648, + "learning_rate": 2.3080632070947517e-06, + "loss": 1.7984, + "mean_token_accuracy": 0.5960640907287598, + "num_tokens": 8687179042.0, + "step": 16993 + }, + { + "epoch": 4.59545700378583, + "grad_norm": 1.1284679174423218, + "learning_rate": 2.30765440373511e-06, + "loss": 1.847, + "mean_token_accuracy": 0.5895935297012329, + "num_tokens": 8687703320.0, + "step": 16994 + }, + { + "epoch": 4.59572742022715, + "grad_norm": 0.8393312096595764, + "learning_rate": 2.307245867082193e-06, + "loss": 1.8731, + "mean_token_accuracy": 0.5616470575332642, + "num_tokens": 8688227592.0, + "step": 16995 + }, + { + "epoch": 4.595997836668469, + "grad_norm": 0.8710100650787354, + "learning_rate": 2.3068375971485374e-06, + "loss": 1.7901, + "mean_token_accuracy": 0.5797982811927795, + "num_tokens": 8688751833.0, + "step": 16996 + }, + { + "epoch": 4.596268253109789, + "grad_norm": 0.8476276993751526, + "learning_rate": 2.3064295939466696e-06, + "loss": 1.8606, + "mean_token_accuracy": 0.5646300315856934, + "num_tokens": 8689276090.0, + "step": 16997 + }, + { + "epoch": 4.596538669551109, + "grad_norm": 0.7826051712036133, + "learning_rate": 2.3060218574891065e-06, + "loss": 1.9657, + "mean_token_accuracy": 0.5567432641983032, + "num_tokens": 8689800238.0, + "step": 16998 + }, + { + "epoch": 4.596809085992429, + "grad_norm": 0.8441804647445679, + "learning_rate": 2.3056143877883617e-06, + "loss": 1.7925, + "mean_token_accuracy": 0.6190729141235352, + "num_tokens": 8690259862.0, + "step": 16999 + }, + { + "epoch": 4.597079502433748, + "grad_norm": 1.1802077293395996, + "learning_rate": 2.305207184856936e-06, + "loss": 1.8532, + "mean_token_accuracy": 0.5797924399375916, + "num_tokens": 8690718882.0, + "step": 17000 + }, + { + "epoch": 4.597349918875068, + "grad_norm": 0.3213929831981659, + "learning_rate": 2.3048002487073216e-06, + "loss": 1.1136, + "mean_token_accuracy": 0.6990257501602173, + "num_tokens": 8691243103.0, + "step": 17001 + }, + { + "epoch": 4.597620335316387, + "grad_norm": 1.1932902336120605, + "learning_rate": 2.304393579352008e-06, + "loss": 1.7671, + "mean_token_accuracy": 0.5987293720245361, + "num_tokens": 8691760980.0, + "step": 17002 + }, + { + "epoch": 4.597890751757707, + "grad_norm": 0.9373973608016968, + "learning_rate": 2.303987176803471e-06, + "loss": 1.7706, + "mean_token_accuracy": 0.5938013792037964, + "num_tokens": 8692285226.0, + "step": 17003 + }, + { + "epoch": 4.598161168199026, + "grad_norm": 0.9283604025840759, + "learning_rate": 2.3035810410741815e-06, + "loss": 1.8348, + "mean_token_accuracy": 0.5753538608551025, + "num_tokens": 8692809422.0, + "step": 17004 + }, + { + "epoch": 4.5984315846403465, + "grad_norm": 1.266012191772461, + "learning_rate": 2.3031751721766e-06, + "loss": 1.9039, + "mean_token_accuracy": 0.5881837606430054, + "num_tokens": 8693171951.0, + "step": 17005 + }, + { + "epoch": 4.598702001081666, + "grad_norm": 0.8771986961364746, + "learning_rate": 2.3027695701231782e-06, + "loss": 1.9246, + "mean_token_accuracy": 0.5623118281364441, + "num_tokens": 8693696228.0, + "step": 17006 + }, + { + "epoch": 4.598972417522985, + "grad_norm": 0.9024639129638672, + "learning_rate": 2.3023642349263655e-06, + "loss": 1.7454, + "mean_token_accuracy": 0.558733344078064, + "num_tokens": 8694194780.0, + "step": 17007 + }, + { + "epoch": 4.599242833964305, + "grad_norm": 0.9244751930236816, + "learning_rate": 2.301959166598593e-06, + "loss": 1.674, + "mean_token_accuracy": 0.6127843856811523, + "num_tokens": 8694718927.0, + "step": 17008 + }, + { + "epoch": 4.599513250405625, + "grad_norm": 0.9739463925361633, + "learning_rate": 2.301554365152294e-06, + "loss": 1.8586, + "mean_token_accuracy": 0.5721613764762878, + "num_tokens": 8695243064.0, + "step": 17009 + }, + { + "epoch": 4.599783666846944, + "grad_norm": 1.1541342735290527, + "learning_rate": 2.301149830599889e-06, + "loss": 1.8791, + "mean_token_accuracy": 0.5725786685943604, + "num_tokens": 8695767339.0, + "step": 17010 + }, + { + "epoch": 4.6000540832882635, + "grad_norm": 0.9071625471115112, + "learning_rate": 2.3007455629537876e-06, + "loss": 1.8239, + "mean_token_accuracy": 0.5896816253662109, + "num_tokens": 8696291576.0, + "step": 17011 + }, + { + "epoch": 4.600324499729584, + "grad_norm": 0.9901993870735168, + "learning_rate": 2.3003415622263946e-06, + "loss": 1.8609, + "mean_token_accuracy": 0.5921852588653564, + "num_tokens": 8696716204.0, + "step": 17012 + }, + { + "epoch": 4.600594916170904, + "grad_norm": 0.773942768573761, + "learning_rate": 2.299937828430108e-06, + "loss": 1.7263, + "mean_token_accuracy": 0.5887590646743774, + "num_tokens": 8697240364.0, + "step": 17013 + }, + { + "epoch": 4.600865332612223, + "grad_norm": 0.9245792627334595, + "learning_rate": 2.2995343615773137e-06, + "loss": 1.8548, + "mean_token_accuracy": 0.5732636451721191, + "num_tokens": 8697764525.0, + "step": 17014 + }, + { + "epoch": 4.601135749053542, + "grad_norm": 1.0145947933197021, + "learning_rate": 2.2991311616803906e-06, + "loss": 2.0156, + "mean_token_accuracy": 0.5366119146347046, + "num_tokens": 8698288756.0, + "step": 17015 + }, + { + "epoch": 4.601406165494862, + "grad_norm": 0.9558149576187134, + "learning_rate": 2.2987282287517117e-06, + "loss": 1.8226, + "mean_token_accuracy": 0.5940293073654175, + "num_tokens": 8698789219.0, + "step": 17016 + }, + { + "epoch": 4.601676581936181, + "grad_norm": 0.894386887550354, + "learning_rate": 2.29832556280364e-06, + "loss": 1.8348, + "mean_token_accuracy": 0.5776270031929016, + "num_tokens": 8699313340.0, + "step": 17017 + }, + { + "epoch": 4.601946998377501, + "grad_norm": 0.9319589734077454, + "learning_rate": 2.2979231638485287e-06, + "loss": 1.7282, + "mean_token_accuracy": 0.6019723415374756, + "num_tokens": 8699837419.0, + "step": 17018 + }, + { + "epoch": 4.602217414818821, + "grad_norm": 0.8784768581390381, + "learning_rate": 2.297521031898727e-06, + "loss": 1.7767, + "mean_token_accuracy": 0.5880164504051208, + "num_tokens": 8700361629.0, + "step": 17019 + }, + { + "epoch": 4.602487831260141, + "grad_norm": 0.7792714834213257, + "learning_rate": 2.297119166966572e-06, + "loss": 1.7781, + "mean_token_accuracy": 0.5924568176269531, + "num_tokens": 8700865757.0, + "step": 17020 + }, + { + "epoch": 4.60275824770146, + "grad_norm": 0.31567952036857605, + "learning_rate": 2.296717569064395e-06, + "loss": 1.0972, + "mean_token_accuracy": 0.7021546363830566, + "num_tokens": 8701389963.0, + "step": 17021 + }, + { + "epoch": 4.60302866414278, + "grad_norm": 0.9980143308639526, + "learning_rate": 2.296316238204516e-06, + "loss": 1.8481, + "mean_token_accuracy": 0.5909391641616821, + "num_tokens": 8701914195.0, + "step": 17022 + }, + { + "epoch": 4.603299080584099, + "grad_norm": 0.9217724204063416, + "learning_rate": 2.2959151743992525e-06, + "loss": 1.8937, + "mean_token_accuracy": 0.5691567063331604, + "num_tokens": 8702378925.0, + "step": 17023 + }, + { + "epoch": 4.603569497025419, + "grad_norm": 0.8614047765731812, + "learning_rate": 2.2955143776609066e-06, + "loss": 1.8939, + "mean_token_accuracy": 0.5806838274002075, + "num_tokens": 8702881485.0, + "step": 17024 + }, + { + "epoch": 4.6038399134667385, + "grad_norm": 0.9521775245666504, + "learning_rate": 2.2951138480017793e-06, + "loss": 2.0079, + "mean_token_accuracy": 0.5288617610931396, + "num_tokens": 8703405708.0, + "step": 17025 + }, + { + "epoch": 4.604110329908059, + "grad_norm": 0.918862521648407, + "learning_rate": 2.294713585434158e-06, + "loss": 1.789, + "mean_token_accuracy": 0.5831184387207031, + "num_tokens": 8703929957.0, + "step": 17026 + }, + { + "epoch": 4.604380746349378, + "grad_norm": 0.8633571267127991, + "learning_rate": 2.2943135899703252e-06, + "loss": 1.7855, + "mean_token_accuracy": 0.5903176069259644, + "num_tokens": 8704454235.0, + "step": 17027 + }, + { + "epoch": 4.604651162790698, + "grad_norm": 0.8996145129203796, + "learning_rate": 2.293913861622553e-06, + "loss": 1.941, + "mean_token_accuracy": 0.5508524179458618, + "num_tokens": 8704978487.0, + "step": 17028 + }, + { + "epoch": 4.604921579232017, + "grad_norm": 0.7916663289070129, + "learning_rate": 2.293514400403106e-06, + "loss": 1.8767, + "mean_token_accuracy": 0.5702520608901978, + "num_tokens": 8705502689.0, + "step": 17029 + }, + { + "epoch": 4.605191995673337, + "grad_norm": 0.8747709393501282, + "learning_rate": 2.2931152063242417e-06, + "loss": 1.7435, + "mean_token_accuracy": 0.6076192855834961, + "num_tokens": 8706026945.0, + "step": 17030 + }, + { + "epoch": 4.605462412114656, + "grad_norm": 0.9073701500892639, + "learning_rate": 2.292716279398208e-06, + "loss": 1.8564, + "mean_token_accuracy": 0.5828783512115479, + "num_tokens": 8706551220.0, + "step": 17031 + }, + { + "epoch": 4.605732828555976, + "grad_norm": 0.7460679411888123, + "learning_rate": 2.292317619637245e-06, + "loss": 1.8411, + "mean_token_accuracy": 0.5834046006202698, + "num_tokens": 8707075453.0, + "step": 17032 + }, + { + "epoch": 4.606003244997296, + "grad_norm": 0.8760863542556763, + "learning_rate": 2.2919192270535854e-06, + "loss": 1.7601, + "mean_token_accuracy": 0.5908387899398804, + "num_tokens": 8707587246.0, + "step": 17033 + }, + { + "epoch": 4.606273661438616, + "grad_norm": 0.8460512757301331, + "learning_rate": 2.291521101659453e-06, + "loss": 1.7987, + "mean_token_accuracy": 0.5971501469612122, + "num_tokens": 8708111462.0, + "step": 17034 + }, + { + "epoch": 4.606544077879935, + "grad_norm": 0.7632831931114197, + "learning_rate": 2.2911232434670617e-06, + "loss": 1.9209, + "mean_token_accuracy": 0.5572165250778198, + "num_tokens": 8708635737.0, + "step": 17035 + }, + { + "epoch": 4.606814494321255, + "grad_norm": 0.8544431924819946, + "learning_rate": 2.2907256524886214e-06, + "loss": 1.8121, + "mean_token_accuracy": 0.5782564282417297, + "num_tokens": 8709159995.0, + "step": 17036 + }, + { + "epoch": 4.607084910762574, + "grad_norm": 0.7141490578651428, + "learning_rate": 2.2903283287363306e-06, + "loss": 1.7856, + "mean_token_accuracy": 0.5864911079406738, + "num_tokens": 8709684274.0, + "step": 17037 + }, + { + "epoch": 4.607355327203894, + "grad_norm": 0.9545763731002808, + "learning_rate": 2.2899312722223794e-06, + "loss": 1.8387, + "mean_token_accuracy": 0.5867233872413635, + "num_tokens": 8710154154.0, + "step": 17038 + }, + { + "epoch": 4.6076257436452135, + "grad_norm": 0.7710409760475159, + "learning_rate": 2.289534482958953e-06, + "loss": 1.7294, + "mean_token_accuracy": 0.6068300604820251, + "num_tokens": 8710678356.0, + "step": 17039 + }, + { + "epoch": 4.607896160086534, + "grad_norm": 0.754200279712677, + "learning_rate": 2.2891379609582237e-06, + "loss": 1.7928, + "mean_token_accuracy": 0.599562406539917, + "num_tokens": 8711202521.0, + "step": 17040 + }, + { + "epoch": 4.608166576527853, + "grad_norm": 0.3944671154022217, + "learning_rate": 2.2887417062323577e-06, + "loss": 1.1047, + "mean_token_accuracy": 0.7002239227294922, + "num_tokens": 8711726760.0, + "step": 17041 + }, + { + "epoch": 4.608436992969173, + "grad_norm": 0.995381772518158, + "learning_rate": 2.2883457187935155e-06, + "loss": 1.8724, + "mean_token_accuracy": 0.579598069190979, + "num_tokens": 8712250967.0, + "step": 17042 + }, + { + "epoch": 4.608707409410492, + "grad_norm": 0.997460126876831, + "learning_rate": 2.287949998653846e-06, + "loss": 1.8911, + "mean_token_accuracy": 0.5624487400054932, + "num_tokens": 8712775207.0, + "step": 17043 + }, + { + "epoch": 4.608977825851812, + "grad_norm": 0.7578293681144714, + "learning_rate": 2.2875545458254896e-06, + "loss": 1.859, + "mean_token_accuracy": 0.5711420774459839, + "num_tokens": 8713299336.0, + "step": 17044 + }, + { + "epoch": 4.609248242293131, + "grad_norm": 0.8277826905250549, + "learning_rate": 2.2871593603205836e-06, + "loss": 1.793, + "mean_token_accuracy": 0.5760461091995239, + "num_tokens": 8713823512.0, + "step": 17045 + }, + { + "epoch": 4.609518658734451, + "grad_norm": 0.7768989205360413, + "learning_rate": 2.2867644421512503e-06, + "loss": 1.7807, + "mean_token_accuracy": 0.5733934640884399, + "num_tokens": 8714347697.0, + "step": 17046 + }, + { + "epoch": 4.609789075175771, + "grad_norm": 0.8348965644836426, + "learning_rate": 2.286369791329607e-06, + "loss": 1.7576, + "mean_token_accuracy": 0.5682609677314758, + "num_tokens": 8714854381.0, + "step": 17047 + }, + { + "epoch": 4.61005949161709, + "grad_norm": 0.8067528009414673, + "learning_rate": 2.2859754078677637e-06, + "loss": 1.7495, + "mean_token_accuracy": 0.5902003049850464, + "num_tokens": 8715378536.0, + "step": 17048 + }, + { + "epoch": 4.61032990805841, + "grad_norm": 0.8476431965827942, + "learning_rate": 2.2855812917778212e-06, + "loss": 1.7972, + "mean_token_accuracy": 0.5648151636123657, + "num_tokens": 8715902792.0, + "step": 17049 + }, + { + "epoch": 4.61060032449973, + "grad_norm": 0.8954124450683594, + "learning_rate": 2.285187443071873e-06, + "loss": 1.8424, + "mean_token_accuracy": 0.5612768530845642, + "num_tokens": 8716426367.0, + "step": 17050 + }, + { + "epoch": 4.610870740941049, + "grad_norm": 1.0100977420806885, + "learning_rate": 2.2847938617620016e-06, + "loss": 1.973, + "mean_token_accuracy": 0.5425629615783691, + "num_tokens": 8716950584.0, + "step": 17051 + }, + { + "epoch": 4.611141157382368, + "grad_norm": 0.8738529086112976, + "learning_rate": 2.284400547860284e-06, + "loss": 1.7838, + "mean_token_accuracy": 0.6013903617858887, + "num_tokens": 8717413704.0, + "step": 17052 + }, + { + "epoch": 4.6114115738236885, + "grad_norm": 0.8490097522735596, + "learning_rate": 2.2840075013787907e-06, + "loss": 1.8994, + "mean_token_accuracy": 0.5719132423400879, + "num_tokens": 8717933567.0, + "step": 17053 + }, + { + "epoch": 4.611681990265009, + "grad_norm": 0.8538036942481995, + "learning_rate": 2.2836147223295783e-06, + "loss": 1.7378, + "mean_token_accuracy": 0.588346004486084, + "num_tokens": 8718398140.0, + "step": 17054 + }, + { + "epoch": 4.611952406706328, + "grad_norm": 0.8996002674102783, + "learning_rate": 2.2832222107246984e-06, + "loss": 1.7834, + "mean_token_accuracy": 0.5953174233436584, + "num_tokens": 8718839941.0, + "step": 17055 + }, + { + "epoch": 4.612222823147647, + "grad_norm": 0.882956326007843, + "learning_rate": 2.282829966576196e-06, + "loss": 1.8807, + "mean_token_accuracy": 0.5664853453636169, + "num_tokens": 8719364204.0, + "step": 17056 + }, + { + "epoch": 4.612493239588967, + "grad_norm": 0.9219801425933838, + "learning_rate": 2.282437989896107e-06, + "loss": 1.8301, + "mean_token_accuracy": 0.5769243240356445, + "num_tokens": 8719888381.0, + "step": 17057 + }, + { + "epoch": 4.612763656030286, + "grad_norm": 0.9379281401634216, + "learning_rate": 2.2820462806964556e-06, + "loss": 1.7208, + "mean_token_accuracy": 0.5896167755126953, + "num_tokens": 8720387992.0, + "step": 17058 + }, + { + "epoch": 4.613034072471606, + "grad_norm": 0.9100501537322998, + "learning_rate": 2.2816548389892625e-06, + "loss": 1.8321, + "mean_token_accuracy": 0.5690436363220215, + "num_tokens": 8720860865.0, + "step": 17059 + }, + { + "epoch": 4.6133044889129255, + "grad_norm": 0.9027646780014038, + "learning_rate": 2.2812636647865387e-06, + "loss": 1.8848, + "mean_token_accuracy": 0.5619481205940247, + "num_tokens": 8721385110.0, + "step": 17060 + }, + { + "epoch": 4.613574905354246, + "grad_norm": 0.37839260697364807, + "learning_rate": 2.280872758100284e-06, + "loss": 1.057, + "mean_token_accuracy": 0.7152268290519714, + "num_tokens": 8721909281.0, + "step": 17061 + }, + { + "epoch": 4.613845321795565, + "grad_norm": 0.7946535348892212, + "learning_rate": 2.280482118942496e-06, + "loss": 1.7485, + "mean_token_accuracy": 0.5885909795761108, + "num_tokens": 8722433542.0, + "step": 17062 + }, + { + "epoch": 4.614115738236885, + "grad_norm": 0.9168603420257568, + "learning_rate": 2.2800917473251584e-06, + "loss": 1.9218, + "mean_token_accuracy": 0.5678153038024902, + "num_tokens": 8722957822.0, + "step": 17063 + }, + { + "epoch": 4.614386154678204, + "grad_norm": 0.8529369831085205, + "learning_rate": 2.27970164326025e-06, + "loss": 1.691, + "mean_token_accuracy": 0.6023910641670227, + "num_tokens": 8723482024.0, + "step": 17064 + }, + { + "epoch": 4.614656571119524, + "grad_norm": 0.841951310634613, + "learning_rate": 2.2793118067597383e-06, + "loss": 1.7388, + "mean_token_accuracy": 0.5884362459182739, + "num_tokens": 8723961692.0, + "step": 17065 + }, + { + "epoch": 4.614926987560843, + "grad_norm": 0.7938239574432373, + "learning_rate": 2.2789222378355887e-06, + "loss": 1.9458, + "mean_token_accuracy": 0.5555210113525391, + "num_tokens": 8724485914.0, + "step": 17066 + }, + { + "epoch": 4.6151974040021635, + "grad_norm": 0.8253304362297058, + "learning_rate": 2.2785329364997498e-06, + "loss": 1.8353, + "mean_token_accuracy": 0.5886580944061279, + "num_tokens": 8724931264.0, + "step": 17067 + }, + { + "epoch": 4.615467820443483, + "grad_norm": 0.8737529516220093, + "learning_rate": 2.27814390276417e-06, + "loss": 1.7172, + "mean_token_accuracy": 0.6014553904533386, + "num_tokens": 8725415935.0, + "step": 17068 + }, + { + "epoch": 4.615738236884803, + "grad_norm": 0.8386703729629517, + "learning_rate": 2.277755136640784e-06, + "loss": 1.8613, + "mean_token_accuracy": 0.5739821791648865, + "num_tokens": 8725940215.0, + "step": 17069 + }, + { + "epoch": 4.616008653326122, + "grad_norm": 0.8606948852539062, + "learning_rate": 2.27736663814152e-06, + "loss": 1.8656, + "mean_token_accuracy": 0.582187294960022, + "num_tokens": 8726381506.0, + "step": 17070 + }, + { + "epoch": 4.616279069767442, + "grad_norm": 0.9413778185844421, + "learning_rate": 2.276978407278301e-06, + "loss": 1.927, + "mean_token_accuracy": 0.5555045008659363, + "num_tokens": 8726902495.0, + "step": 17071 + }, + { + "epoch": 4.616549486208761, + "grad_norm": 0.8957047462463379, + "learning_rate": 2.2765904440630358e-06, + "loss": 1.6429, + "mean_token_accuracy": 0.6204559206962585, + "num_tokens": 8727426720.0, + "step": 17072 + }, + { + "epoch": 4.616819902650081, + "grad_norm": 1.1239120960235596, + "learning_rate": 2.276202748507631e-06, + "loss": 1.4128, + "mean_token_accuracy": 0.6686078310012817, + "num_tokens": 8727951001.0, + "step": 17073 + }, + { + "epoch": 4.6170903190914006, + "grad_norm": 0.880216658115387, + "learning_rate": 2.27581532062398e-06, + "loss": 1.7913, + "mean_token_accuracy": 0.5909625291824341, + "num_tokens": 8728379767.0, + "step": 17074 + }, + { + "epoch": 4.617360735532721, + "grad_norm": 0.8690861463546753, + "learning_rate": 2.2754281604239704e-06, + "loss": 1.7421, + "mean_token_accuracy": 0.582283616065979, + "num_tokens": 8728839655.0, + "step": 17075 + }, + { + "epoch": 4.61763115197404, + "grad_norm": 0.8474832773208618, + "learning_rate": 2.275041267919483e-06, + "loss": 1.9523, + "mean_token_accuracy": 0.5631293058395386, + "num_tokens": 8729363806.0, + "step": 17076 + }, + { + "epoch": 4.61790156841536, + "grad_norm": 0.9960585236549377, + "learning_rate": 2.2746546431223885e-06, + "loss": 1.8587, + "mean_token_accuracy": 0.5891518592834473, + "num_tokens": 8729822605.0, + "step": 17077 + }, + { + "epoch": 4.618171984856679, + "grad_norm": 0.7961797714233398, + "learning_rate": 2.274268286044548e-06, + "loss": 1.9091, + "mean_token_accuracy": 0.5580750703811646, + "num_tokens": 8730346856.0, + "step": 17078 + }, + { + "epoch": 4.618442401297999, + "grad_norm": 0.7948321104049683, + "learning_rate": 2.273882196697817e-06, + "loss": 1.7071, + "mean_token_accuracy": 0.5937620401382446, + "num_tokens": 8730818998.0, + "step": 17079 + }, + { + "epoch": 4.618712817739318, + "grad_norm": 0.8454614877700806, + "learning_rate": 2.273496375094043e-06, + "loss": 1.9234, + "mean_token_accuracy": 0.5617249011993408, + "num_tokens": 8731343202.0, + "step": 17080 + }, + { + "epoch": 4.6189832341806385, + "grad_norm": 0.35813719034194946, + "learning_rate": 2.273110821245062e-06, + "loss": 1.0291, + "mean_token_accuracy": 0.7292195558547974, + "num_tokens": 8731817968.0, + "step": 17081 + }, + { + "epoch": 4.619253650621958, + "grad_norm": 0.9030600786209106, + "learning_rate": 2.2727255351627064e-06, + "loss": 1.7253, + "mean_token_accuracy": 0.5936102271080017, + "num_tokens": 8732342171.0, + "step": 17082 + }, + { + "epoch": 4.619524067063278, + "grad_norm": 0.7977579236030579, + "learning_rate": 2.272340516858797e-06, + "loss": 1.7965, + "mean_token_accuracy": 0.5822563767433167, + "num_tokens": 8732866405.0, + "step": 17083 + }, + { + "epoch": 4.619794483504597, + "grad_norm": 0.8487864136695862, + "learning_rate": 2.2719557663451446e-06, + "loss": 1.7033, + "mean_token_accuracy": 0.5869673490524292, + "num_tokens": 8733390241.0, + "step": 17084 + }, + { + "epoch": 4.620064899945917, + "grad_norm": 0.915501058101654, + "learning_rate": 2.271571283633559e-06, + "loss": 1.8629, + "mean_token_accuracy": 0.5465975403785706, + "num_tokens": 8733914439.0, + "step": 17085 + }, + { + "epoch": 4.620335316387236, + "grad_norm": 0.7685235738754272, + "learning_rate": 2.271187068735835e-06, + "loss": 1.9025, + "mean_token_accuracy": 0.5595870614051819, + "num_tokens": 8734438619.0, + "step": 17086 + }, + { + "epoch": 4.620605732828556, + "grad_norm": 0.8080111742019653, + "learning_rate": 2.2708031216637593e-06, + "loss": 1.7134, + "mean_token_accuracy": 0.602594792842865, + "num_tokens": 8734914267.0, + "step": 17087 + }, + { + "epoch": 4.6208761492698756, + "grad_norm": 0.9933121204376221, + "learning_rate": 2.270419442429117e-06, + "loss": 1.8625, + "mean_token_accuracy": 0.5814926028251648, + "num_tokens": 8735409399.0, + "step": 17088 + }, + { + "epoch": 4.621146565711195, + "grad_norm": 0.9123358726501465, + "learning_rate": 2.270036031043678e-06, + "loss": 1.842, + "mean_token_accuracy": 0.5858700275421143, + "num_tokens": 8735933638.0, + "step": 17089 + }, + { + "epoch": 4.621416982152515, + "grad_norm": 0.9294096231460571, + "learning_rate": 2.269652887519205e-06, + "loss": 1.8116, + "mean_token_accuracy": 0.5873157978057861, + "num_tokens": 8736429980.0, + "step": 17090 + }, + { + "epoch": 4.621687398593835, + "grad_norm": 0.8664866089820862, + "learning_rate": 2.2692700118674576e-06, + "loss": 1.8073, + "mean_token_accuracy": 0.5696889162063599, + "num_tokens": 8736954062.0, + "step": 17091 + }, + { + "epoch": 4.621957815035154, + "grad_norm": 0.7694582343101501, + "learning_rate": 2.2688874041001803e-06, + "loss": 1.9188, + "mean_token_accuracy": 0.5498312711715698, + "num_tokens": 8737478240.0, + "step": 17092 + }, + { + "epoch": 4.622228231476473, + "grad_norm": 0.7784567475318909, + "learning_rate": 2.2685050642291136e-06, + "loss": 1.8712, + "mean_token_accuracy": 0.5605549812316895, + "num_tokens": 8738002389.0, + "step": 17093 + }, + { + "epoch": 4.622498647917793, + "grad_norm": 0.9509612917900085, + "learning_rate": 2.268122992265991e-06, + "loss": 1.8561, + "mean_token_accuracy": 0.5917273759841919, + "num_tokens": 8738526629.0, + "step": 17094 + }, + { + "epoch": 4.6227690643591135, + "grad_norm": 0.8422954082489014, + "learning_rate": 2.267741188222532e-06, + "loss": 1.8279, + "mean_token_accuracy": 0.5748326778411865, + "num_tokens": 8739050809.0, + "step": 17095 + }, + { + "epoch": 4.623039480800433, + "grad_norm": 0.833888828754425, + "learning_rate": 2.267359652110455e-06, + "loss": 1.7575, + "mean_token_accuracy": 0.5954342484474182, + "num_tokens": 8739574940.0, + "step": 17096 + }, + { + "epoch": 4.623309897241752, + "grad_norm": 0.8448900580406189, + "learning_rate": 2.2669783839414645e-06, + "loss": 1.8599, + "mean_token_accuracy": 0.5675106644630432, + "num_tokens": 8740099209.0, + "step": 17097 + }, + { + "epoch": 4.623580313683072, + "grad_norm": 0.8836487531661987, + "learning_rate": 2.2665973837272586e-06, + "loss": 1.8768, + "mean_token_accuracy": 0.5695300102233887, + "num_tokens": 8740623482.0, + "step": 17098 + }, + { + "epoch": 4.623850730124391, + "grad_norm": 0.8901658058166504, + "learning_rate": 2.266216651479529e-06, + "loss": 1.7089, + "mean_token_accuracy": 0.591826319694519, + "num_tokens": 8741147652.0, + "step": 17099 + }, + { + "epoch": 4.624121146565711, + "grad_norm": 0.7374269366264343, + "learning_rate": 2.2658361872099567e-06, + "loss": 1.8124, + "mean_token_accuracy": 0.5699625015258789, + "num_tokens": 8741671821.0, + "step": 17100 + }, + { + "epoch": 4.6243915630070305, + "grad_norm": 0.3362453877925873, + "learning_rate": 2.2654559909302145e-06, + "loss": 1.0697, + "mean_token_accuracy": 0.7123329639434814, + "num_tokens": 8742159252.0, + "step": 17101 + }, + { + "epoch": 4.624661979448351, + "grad_norm": 0.7962809801101685, + "learning_rate": 2.265076062651971e-06, + "loss": 1.8441, + "mean_token_accuracy": 0.5766936540603638, + "num_tokens": 8742683467.0, + "step": 17102 + }, + { + "epoch": 4.62493239588967, + "grad_norm": 0.8175825476646423, + "learning_rate": 2.26469640238688e-06, + "loss": 1.7239, + "mean_token_accuracy": 0.6001389622688293, + "num_tokens": 8743188180.0, + "step": 17103 + }, + { + "epoch": 4.62520281233099, + "grad_norm": 0.7801482677459717, + "learning_rate": 2.2643170101465923e-06, + "loss": 1.7536, + "mean_token_accuracy": 0.6005823016166687, + "num_tokens": 8743680307.0, + "step": 17104 + }, + { + "epoch": 4.625473228772309, + "grad_norm": 0.8035078048706055, + "learning_rate": 2.2639378859427504e-06, + "loss": 1.8501, + "mean_token_accuracy": 0.5848196744918823, + "num_tokens": 8744204570.0, + "step": 17105 + }, + { + "epoch": 4.625743645213629, + "grad_norm": 0.7676151394844055, + "learning_rate": 2.263559029786983e-06, + "loss": 1.8668, + "mean_token_accuracy": 0.5612114071846008, + "num_tokens": 8744728714.0, + "step": 17106 + }, + { + "epoch": 4.626014061654948, + "grad_norm": 0.9587094187736511, + "learning_rate": 2.2631804416909187e-06, + "loss": 1.9394, + "mean_token_accuracy": 0.5592865347862244, + "num_tokens": 8745185911.0, + "step": 17107 + }, + { + "epoch": 4.626284478096268, + "grad_norm": 0.7236048579216003, + "learning_rate": 2.26280212166617e-06, + "loss": 1.7854, + "mean_token_accuracy": 0.5908327102661133, + "num_tokens": 8745710196.0, + "step": 17108 + }, + { + "epoch": 4.626554894537588, + "grad_norm": 0.8356081247329712, + "learning_rate": 2.2624240697243484e-06, + "loss": 1.7441, + "mean_token_accuracy": 0.5973576307296753, + "num_tokens": 8746140617.0, + "step": 17109 + }, + { + "epoch": 4.626825310978908, + "grad_norm": 0.7697744369506836, + "learning_rate": 2.2620462858770503e-06, + "loss": 1.7808, + "mean_token_accuracy": 0.5764628648757935, + "num_tokens": 8746664874.0, + "step": 17110 + }, + { + "epoch": 4.627095727420227, + "grad_norm": 0.7599928379058838, + "learning_rate": 2.2616687701358703e-06, + "loss": 1.7759, + "mean_token_accuracy": 0.5937728881835938, + "num_tokens": 8747189033.0, + "step": 17111 + }, + { + "epoch": 4.627366143861547, + "grad_norm": 0.8373153209686279, + "learning_rate": 2.2612915225123895e-06, + "loss": 1.7893, + "mean_token_accuracy": 0.5709436535835266, + "num_tokens": 8747713312.0, + "step": 17112 + }, + { + "epoch": 4.627636560302866, + "grad_norm": 0.7914724946022034, + "learning_rate": 2.260914543018184e-06, + "loss": 1.7725, + "mean_token_accuracy": 0.5826683044433594, + "num_tokens": 8748237555.0, + "step": 17113 + }, + { + "epoch": 4.627906976744186, + "grad_norm": 0.8642527461051941, + "learning_rate": 2.2605378316648204e-06, + "loss": 1.9194, + "mean_token_accuracy": 0.5667564868927002, + "num_tokens": 8748699855.0, + "step": 17114 + }, + { + "epoch": 4.6281773931855055, + "grad_norm": 0.8120209574699402, + "learning_rate": 2.260161388463856e-06, + "loss": 1.808, + "mean_token_accuracy": 0.5916539430618286, + "num_tokens": 8749224086.0, + "step": 17115 + }, + { + "epoch": 4.628447809626826, + "grad_norm": 0.7775019407272339, + "learning_rate": 2.2597852134268428e-06, + "loss": 1.8162, + "mean_token_accuracy": 0.5840600728988647, + "num_tokens": 8749748260.0, + "step": 17116 + }, + { + "epoch": 4.628718226068145, + "grad_norm": 0.8022938370704651, + "learning_rate": 2.2594093065653233e-06, + "loss": 1.8688, + "mean_token_accuracy": 0.5735067129135132, + "num_tokens": 8750272459.0, + "step": 17117 + }, + { + "epoch": 4.628988642509465, + "grad_norm": 0.7855842709541321, + "learning_rate": 2.2590336678908298e-06, + "loss": 1.7774, + "mean_token_accuracy": 0.5725094079971313, + "num_tokens": 8750777575.0, + "step": 17118 + }, + { + "epoch": 4.629259058950784, + "grad_norm": 0.8208913803100586, + "learning_rate": 2.2586582974148893e-06, + "loss": 1.8108, + "mean_token_accuracy": 0.5702924728393555, + "num_tokens": 8751301798.0, + "step": 17119 + }, + { + "epoch": 4.629529475392104, + "grad_norm": 0.8049347400665283, + "learning_rate": 2.258283195149019e-06, + "loss": 1.7641, + "mean_token_accuracy": 0.5785530805587769, + "num_tokens": 8751825935.0, + "step": 17120 + }, + { + "epoch": 4.629799891833423, + "grad_norm": 0.35906392335891724, + "learning_rate": 2.2579083611047277e-06, + "loss": 1.0119, + "mean_token_accuracy": 0.7363874316215515, + "num_tokens": 8752350164.0, + "step": 17121 + }, + { + "epoch": 4.630070308274743, + "grad_norm": 0.7318272590637207, + "learning_rate": 2.2575337952935163e-06, + "loss": 1.5944, + "mean_token_accuracy": 0.6503514051437378, + "num_tokens": 8752850785.0, + "step": 17122 + }, + { + "epoch": 4.630340724716063, + "grad_norm": 0.8421204686164856, + "learning_rate": 2.2571594977268793e-06, + "loss": 1.8324, + "mean_token_accuracy": 0.5806841850280762, + "num_tokens": 8753374994.0, + "step": 17123 + }, + { + "epoch": 4.630611141157383, + "grad_norm": 0.811809241771698, + "learning_rate": 2.256785468416299e-06, + "loss": 1.8863, + "mean_token_accuracy": 0.5687901377677917, + "num_tokens": 8753899187.0, + "step": 17124 + }, + { + "epoch": 4.630881557598702, + "grad_norm": 0.7606868147850037, + "learning_rate": 2.256411707373252e-06, + "loss": 1.8832, + "mean_token_accuracy": 0.5718507766723633, + "num_tokens": 8754423449.0, + "step": 17125 + }, + { + "epoch": 4.631151974040022, + "grad_norm": 0.8044995069503784, + "learning_rate": 2.256038214609209e-06, + "loss": 1.8477, + "mean_token_accuracy": 0.5720430016517639, + "num_tokens": 8754947626.0, + "step": 17126 + }, + { + "epoch": 4.631422390481341, + "grad_norm": 0.9429145455360413, + "learning_rate": 2.255664990135626e-06, + "loss": 1.8481, + "mean_token_accuracy": 0.5881150960922241, + "num_tokens": 8755418368.0, + "step": 17127 + }, + { + "epoch": 4.631692806922661, + "grad_norm": 0.7745662331581116, + "learning_rate": 2.2552920339639574e-06, + "loss": 1.7724, + "mean_token_accuracy": 0.5869478583335876, + "num_tokens": 8755942607.0, + "step": 17128 + }, + { + "epoch": 4.6319632233639805, + "grad_norm": 0.7571633458137512, + "learning_rate": 2.254919346105646e-06, + "loss": 1.678, + "mean_token_accuracy": 0.6015604138374329, + "num_tokens": 8756466787.0, + "step": 17129 + }, + { + "epoch": 4.6322336398053, + "grad_norm": 1.0318046808242798, + "learning_rate": 2.254546926572126e-06, + "loss": 1.5283, + "mean_token_accuracy": 0.63739413022995, + "num_tokens": 8756943027.0, + "step": 17130 + }, + { + "epoch": 4.63250405624662, + "grad_norm": 0.7753101587295532, + "learning_rate": 2.2541747753748252e-06, + "loss": 1.8481, + "mean_token_accuracy": 0.5743535757064819, + "num_tokens": 8757467259.0, + "step": 17131 + }, + { + "epoch": 4.63277447268794, + "grad_norm": 0.8320646286010742, + "learning_rate": 2.253802892525163e-06, + "loss": 1.7789, + "mean_token_accuracy": 0.5965892672538757, + "num_tokens": 8757979410.0, + "step": 17132 + }, + { + "epoch": 4.633044889129259, + "grad_norm": 0.8597398400306702, + "learning_rate": 2.253431278034547e-06, + "loss": 1.8479, + "mean_token_accuracy": 0.5822770595550537, + "num_tokens": 8758433123.0, + "step": 17133 + }, + { + "epoch": 4.633315305570578, + "grad_norm": 0.7947077751159668, + "learning_rate": 2.253059931914383e-06, + "loss": 1.5823, + "mean_token_accuracy": 0.6398220062255859, + "num_tokens": 8758893806.0, + "step": 17134 + }, + { + "epoch": 4.633585722011898, + "grad_norm": 0.8488785624504089, + "learning_rate": 2.2526888541760622e-06, + "loss": 1.84, + "mean_token_accuracy": 0.57663893699646, + "num_tokens": 8759418017.0, + "step": 17135 + }, + { + "epoch": 4.633856138453218, + "grad_norm": 0.9112587571144104, + "learning_rate": 2.252318044830972e-06, + "loss": 1.7962, + "mean_token_accuracy": 0.5776004791259766, + "num_tokens": 8759856513.0, + "step": 17136 + }, + { + "epoch": 4.634126554894538, + "grad_norm": 0.8673698902130127, + "learning_rate": 2.25194750389049e-06, + "loss": 1.8673, + "mean_token_accuracy": 0.5763348340988159, + "num_tokens": 8760380757.0, + "step": 17137 + }, + { + "epoch": 4.634396971335857, + "grad_norm": 0.7480072379112244, + "learning_rate": 2.2515772313659846e-06, + "loss": 1.7706, + "mean_token_accuracy": 0.590430498123169, + "num_tokens": 8760904983.0, + "step": 17138 + }, + { + "epoch": 4.634667387777177, + "grad_norm": 0.8290330767631531, + "learning_rate": 2.2512072272688167e-06, + "loss": 1.7759, + "mean_token_accuracy": 0.5947412252426147, + "num_tokens": 8761429185.0, + "step": 17139 + }, + { + "epoch": 4.634937804218496, + "grad_norm": 0.9365748763084412, + "learning_rate": 2.25083749161034e-06, + "loss": 1.8908, + "mean_token_accuracy": 0.5697214603424072, + "num_tokens": 8761953262.0, + "step": 17140 + }, + { + "epoch": 4.635208220659816, + "grad_norm": 0.39936205744743347, + "learning_rate": 2.2504680244018984e-06, + "loss": 1.1517, + "mean_token_accuracy": 0.698472797870636, + "num_tokens": 8762386239.0, + "step": 17141 + }, + { + "epoch": 4.635478637101135, + "grad_norm": 1.0381910800933838, + "learning_rate": 2.250098825654829e-06, + "loss": 1.8504, + "mean_token_accuracy": 0.5576778650283813, + "num_tokens": 8762910484.0, + "step": 17142 + }, + { + "epoch": 4.6357490535424555, + "grad_norm": 0.9566667079925537, + "learning_rate": 2.2497298953804596e-06, + "loss": 1.8235, + "mean_token_accuracy": 0.5841966867446899, + "num_tokens": 8763434716.0, + "step": 17143 + }, + { + "epoch": 4.636019469983775, + "grad_norm": 0.9198089241981506, + "learning_rate": 2.2493612335901083e-06, + "loss": 1.7634, + "mean_token_accuracy": 0.5940437316894531, + "num_tokens": 8763958814.0, + "step": 17144 + }, + { + "epoch": 4.636289886425095, + "grad_norm": 0.8413243889808655, + "learning_rate": 2.24899284029509e-06, + "loss": 1.8352, + "mean_token_accuracy": 0.5651045441627502, + "num_tokens": 8764483083.0, + "step": 17145 + }, + { + "epoch": 4.636560302866414, + "grad_norm": 0.8078871369361877, + "learning_rate": 2.2486247155067065e-06, + "loss": 1.7978, + "mean_token_accuracy": 0.5789554119110107, + "num_tokens": 8765007168.0, + "step": 17146 + }, + { + "epoch": 4.636830719307734, + "grad_norm": 1.0698790550231934, + "learning_rate": 2.248256859236252e-06, + "loss": 1.7806, + "mean_token_accuracy": 0.5969605445861816, + "num_tokens": 8765485243.0, + "step": 17147 + }, + { + "epoch": 4.637101135749053, + "grad_norm": 0.8938563466072083, + "learning_rate": 2.2478892714950144e-06, + "loss": 1.7725, + "mean_token_accuracy": 0.5888782739639282, + "num_tokens": 8766009490.0, + "step": 17148 + }, + { + "epoch": 4.637371552190373, + "grad_norm": 0.8620222210884094, + "learning_rate": 2.2475219522942716e-06, + "loss": 1.7123, + "mean_token_accuracy": 0.5956952571868896, + "num_tokens": 8766533619.0, + "step": 17149 + }, + { + "epoch": 4.6376419686316925, + "grad_norm": 0.8140563368797302, + "learning_rate": 2.2471549016452956e-06, + "loss": 1.7273, + "mean_token_accuracy": 0.5977970361709595, + "num_tokens": 8767047173.0, + "step": 17150 + }, + { + "epoch": 4.637912385073013, + "grad_norm": 0.8444974422454834, + "learning_rate": 2.246788119559347e-06, + "loss": 1.7005, + "mean_token_accuracy": 0.5938185453414917, + "num_tokens": 8767525044.0, + "step": 17151 + }, + { + "epoch": 4.638182801514332, + "grad_norm": 0.9077261686325073, + "learning_rate": 2.2464216060476808e-06, + "loss": 1.7718, + "mean_token_accuracy": 0.5925096273422241, + "num_tokens": 8767999786.0, + "step": 17152 + }, + { + "epoch": 4.638453217955652, + "grad_norm": 1.0023810863494873, + "learning_rate": 2.246055361121542e-06, + "loss": 1.8605, + "mean_token_accuracy": 0.5834569931030273, + "num_tokens": 8768514453.0, + "step": 17153 + }, + { + "epoch": 4.638723634396971, + "grad_norm": 0.9815492033958435, + "learning_rate": 2.2456893847921698e-06, + "loss": 1.8062, + "mean_token_accuracy": 0.5680864453315735, + "num_tokens": 8769038668.0, + "step": 17154 + }, + { + "epoch": 4.638994050838291, + "grad_norm": 0.9200440049171448, + "learning_rate": 2.2453236770707905e-06, + "loss": 1.8478, + "mean_token_accuracy": 0.5779579877853394, + "num_tokens": 8769507954.0, + "step": 17155 + }, + { + "epoch": 4.63926446727961, + "grad_norm": 0.7971384525299072, + "learning_rate": 2.2449582379686268e-06, + "loss": 1.7887, + "mean_token_accuracy": 0.5624333620071411, + "num_tokens": 8770032224.0, + "step": 17156 + }, + { + "epoch": 4.6395348837209305, + "grad_norm": 1.0350301265716553, + "learning_rate": 2.244593067496891e-06, + "loss": 1.9051, + "mean_token_accuracy": 0.5558741092681885, + "num_tokens": 8770556353.0, + "step": 17157 + }, + { + "epoch": 4.63980530016225, + "grad_norm": 0.9469469785690308, + "learning_rate": 2.244228165666789e-06, + "loss": 1.792, + "mean_token_accuracy": 0.5922034978866577, + "num_tokens": 8771080411.0, + "step": 17158 + }, + { + "epoch": 4.64007571660357, + "grad_norm": 0.9404756426811218, + "learning_rate": 2.243863532489515e-06, + "loss": 1.8314, + "mean_token_accuracy": 0.5828396081924438, + "num_tokens": 8771587811.0, + "step": 17159 + }, + { + "epoch": 4.640346133044889, + "grad_norm": 0.8683590292930603, + "learning_rate": 2.2434991679762587e-06, + "loss": 1.8447, + "mean_token_accuracy": 0.5576380491256714, + "num_tokens": 8772026937.0, + "step": 17160 + }, + { + "epoch": 4.640616549486209, + "grad_norm": 0.34384745359420776, + "learning_rate": 2.243135072138199e-06, + "loss": 1.0281, + "mean_token_accuracy": 0.7148593664169312, + "num_tokens": 8772551199.0, + "step": 17161 + }, + { + "epoch": 4.640886965927528, + "grad_norm": 1.0004267692565918, + "learning_rate": 2.242771244986507e-06, + "loss": 1.9474, + "mean_token_accuracy": 0.5695157051086426, + "num_tokens": 8773016216.0, + "step": 17162 + }, + { + "epoch": 4.641157382368848, + "grad_norm": 1.0148392915725708, + "learning_rate": 2.2424076865323463e-06, + "loss": 1.7925, + "mean_token_accuracy": 0.5860006213188171, + "num_tokens": 8773540316.0, + "step": 17163 + }, + { + "epoch": 4.6414277988101675, + "grad_norm": 1.003613829612732, + "learning_rate": 2.242044396786873e-06, + "loss": 1.8486, + "mean_token_accuracy": 0.5713448524475098, + "num_tokens": 8774020611.0, + "step": 17164 + }, + { + "epoch": 4.641698215251488, + "grad_norm": 0.8186863660812378, + "learning_rate": 2.241681375761232e-06, + "loss": 1.7758, + "mean_token_accuracy": 0.5704571008682251, + "num_tokens": 8774521200.0, + "step": 17165 + }, + { + "epoch": 4.641968631692807, + "grad_norm": 0.8933201432228088, + "learning_rate": 2.241318623466563e-06, + "loss": 1.909, + "mean_token_accuracy": 0.5647091865539551, + "num_tokens": 8775045469.0, + "step": 17166 + }, + { + "epoch": 4.642239048134127, + "grad_norm": 0.8797285556793213, + "learning_rate": 2.2409561399139973e-06, + "loss": 1.8905, + "mean_token_accuracy": 0.5710434913635254, + "num_tokens": 8775512091.0, + "step": 17167 + }, + { + "epoch": 4.642509464575446, + "grad_norm": 0.8967201113700867, + "learning_rate": 2.240593925114656e-06, + "loss": 1.9654, + "mean_token_accuracy": 0.5256105065345764, + "num_tokens": 8776036283.0, + "step": 17168 + }, + { + "epoch": 4.642779881016766, + "grad_norm": 0.8539875149726868, + "learning_rate": 2.2402319790796527e-06, + "loss": 1.8049, + "mean_token_accuracy": 0.593339204788208, + "num_tokens": 8776498641.0, + "step": 17169 + }, + { + "epoch": 4.643050297458085, + "grad_norm": 0.9482136964797974, + "learning_rate": 2.239870301820092e-06, + "loss": 1.8201, + "mean_token_accuracy": 0.5713968276977539, + "num_tokens": 8776992695.0, + "step": 17170 + }, + { + "epoch": 4.643320713899405, + "grad_norm": 1.0117228031158447, + "learning_rate": 2.2395088933470733e-06, + "loss": 1.7697, + "mean_token_accuracy": 0.60199373960495, + "num_tokens": 8777516891.0, + "step": 17171 + }, + { + "epoch": 4.643591130340725, + "grad_norm": 0.7602086067199707, + "learning_rate": 2.2391477536716838e-06, + "loss": 1.8794, + "mean_token_accuracy": 0.5594093799591064, + "num_tokens": 8778041114.0, + "step": 17172 + }, + { + "epoch": 4.643861546782045, + "grad_norm": 0.8543208837509155, + "learning_rate": 2.238786882805006e-06, + "loss": 1.7503, + "mean_token_accuracy": 0.5823542475700378, + "num_tokens": 8778565340.0, + "step": 17173 + }, + { + "epoch": 4.644131963223364, + "grad_norm": 0.7136579751968384, + "learning_rate": 2.238426280758111e-06, + "loss": 1.7787, + "mean_token_accuracy": 0.5716978311538696, + "num_tokens": 8779089537.0, + "step": 17174 + }, + { + "epoch": 4.644402379664683, + "grad_norm": 0.8169087171554565, + "learning_rate": 2.2380659475420647e-06, + "loss": 1.8456, + "mean_token_accuracy": 0.5851144790649414, + "num_tokens": 8779613673.0, + "step": 17175 + }, + { + "epoch": 4.644672796106003, + "grad_norm": 0.8059051036834717, + "learning_rate": 2.2377058831679218e-06, + "loss": 1.6948, + "mean_token_accuracy": 0.6045504808425903, + "num_tokens": 8780055039.0, + "step": 17176 + }, + { + "epoch": 4.644943212547323, + "grad_norm": 0.8192271590232849, + "learning_rate": 2.2373460876467314e-06, + "loss": 1.8732, + "mean_token_accuracy": 0.5684219598770142, + "num_tokens": 8780579297.0, + "step": 17177 + }, + { + "epoch": 4.6452136289886425, + "grad_norm": 0.818894624710083, + "learning_rate": 2.236986560989531e-06, + "loss": 1.8829, + "mean_token_accuracy": 0.5706737041473389, + "num_tokens": 8781103577.0, + "step": 17178 + }, + { + "epoch": 4.645484045429962, + "grad_norm": 0.7983912229537964, + "learning_rate": 2.2366273032073528e-06, + "loss": 1.8928, + "mean_token_accuracy": 0.5466378331184387, + "num_tokens": 8781627746.0, + "step": 17179 + }, + { + "epoch": 4.645754461871282, + "grad_norm": 0.9297729134559631, + "learning_rate": 2.2362683143112216e-06, + "loss": 1.8914, + "mean_token_accuracy": 0.5651816129684448, + "num_tokens": 8782151973.0, + "step": 17180 + }, + { + "epoch": 4.646024878312601, + "grad_norm": 0.34925681352615356, + "learning_rate": 2.235909594312152e-06, + "loss": 1.1307, + "mean_token_accuracy": 0.6975258588790894, + "num_tokens": 8782652097.0, + "step": 17181 + }, + { + "epoch": 4.646295294753921, + "grad_norm": 0.9943056106567383, + "learning_rate": 2.2355511432211473e-06, + "loss": 1.8737, + "mean_token_accuracy": 0.5734085440635681, + "num_tokens": 8783176347.0, + "step": 17182 + }, + { + "epoch": 4.64656571119524, + "grad_norm": 0.9242313504219055, + "learning_rate": 2.2351929610492092e-06, + "loss": 1.9425, + "mean_token_accuracy": 0.5708335041999817, + "num_tokens": 8783646071.0, + "step": 17183 + }, + { + "epoch": 4.64683612763656, + "grad_norm": 0.8201542496681213, + "learning_rate": 2.2348350478073274e-06, + "loss": 1.8127, + "mean_token_accuracy": 0.5865036249160767, + "num_tokens": 8784170179.0, + "step": 17184 + }, + { + "epoch": 4.64710654407788, + "grad_norm": 0.7610831260681152, + "learning_rate": 2.2344774035064815e-06, + "loss": 1.7969, + "mean_token_accuracy": 0.5865973234176636, + "num_tokens": 8784694205.0, + "step": 17185 + }, + { + "epoch": 4.6473769605192, + "grad_norm": 0.7777110934257507, + "learning_rate": 2.2341200281576474e-06, + "loss": 1.8887, + "mean_token_accuracy": 0.5608829259872437, + "num_tokens": 8785204851.0, + "step": 17186 + }, + { + "epoch": 4.647647376960519, + "grad_norm": 0.9242169260978699, + "learning_rate": 2.2337629217717886e-06, + "loss": 1.8441, + "mean_token_accuracy": 0.5746862888336182, + "num_tokens": 8785729125.0, + "step": 17187 + }, + { + "epoch": 4.647917793401839, + "grad_norm": 0.7972890734672546, + "learning_rate": 2.2334060843598646e-06, + "loss": 1.8186, + "mean_token_accuracy": 0.5753366351127625, + "num_tokens": 8786253074.0, + "step": 17188 + }, + { + "epoch": 4.648188209843158, + "grad_norm": 0.7964666485786438, + "learning_rate": 2.233049515932823e-06, + "loss": 1.7234, + "mean_token_accuracy": 0.6029805541038513, + "num_tokens": 8786714934.0, + "step": 17189 + }, + { + "epoch": 4.648458626284478, + "grad_norm": 0.7853700518608093, + "learning_rate": 2.232693216501603e-06, + "loss": 1.7804, + "mean_token_accuracy": 0.5778719186782837, + "num_tokens": 8787239219.0, + "step": 17190 + }, + { + "epoch": 4.6487290427257975, + "grad_norm": 0.8656044602394104, + "learning_rate": 2.232337186077138e-06, + "loss": 1.94, + "mean_token_accuracy": 0.5604296922683716, + "num_tokens": 8787763349.0, + "step": 17191 + }, + { + "epoch": 4.6489994591671175, + "grad_norm": 1.0725494623184204, + "learning_rate": 2.2319814246703532e-06, + "loss": 1.8615, + "mean_token_accuracy": 0.5747316479682922, + "num_tokens": 8788287557.0, + "step": 17192 + }, + { + "epoch": 4.649269875608437, + "grad_norm": 0.8128586411476135, + "learning_rate": 2.231625932292161e-06, + "loss": 1.8376, + "mean_token_accuracy": 0.5687653422355652, + "num_tokens": 8788811755.0, + "step": 17193 + }, + { + "epoch": 4.649540292049757, + "grad_norm": 0.7574150562286377, + "learning_rate": 2.2312707089534747e-06, + "loss": 1.7987, + "mean_token_accuracy": 0.5654963850975037, + "num_tokens": 8789335990.0, + "step": 17194 + }, + { + "epoch": 4.649810708491076, + "grad_norm": 0.7957658171653748, + "learning_rate": 2.2309157546651884e-06, + "loss": 1.798, + "mean_token_accuracy": 0.569104790687561, + "num_tokens": 8789858039.0, + "step": 17195 + }, + { + "epoch": 4.650081124932396, + "grad_norm": 0.8799095749855042, + "learning_rate": 2.2305610694381947e-06, + "loss": 1.795, + "mean_token_accuracy": 0.5826004147529602, + "num_tokens": 8790339430.0, + "step": 17196 + }, + { + "epoch": 4.650351541373715, + "grad_norm": 0.7540492415428162, + "learning_rate": 2.2302066532833776e-06, + "loss": 1.8027, + "mean_token_accuracy": 0.5621591806411743, + "num_tokens": 8790863584.0, + "step": 17197 + }, + { + "epoch": 4.650621957815035, + "grad_norm": 0.9707773327827454, + "learning_rate": 2.2298525062116103e-06, + "loss": 1.7835, + "mean_token_accuracy": 0.5958550572395325, + "num_tokens": 8791338297.0, + "step": 17198 + }, + { + "epoch": 4.650892374256355, + "grad_norm": 1.1996608972549438, + "learning_rate": 2.2294986282337583e-06, + "loss": 1.8296, + "mean_token_accuracy": 0.5892792344093323, + "num_tokens": 8791848536.0, + "step": 17199 + }, + { + "epoch": 4.651162790697675, + "grad_norm": 1.093064308166504, + "learning_rate": 2.229145019360683e-06, + "loss": 1.8622, + "mean_token_accuracy": 0.5826518535614014, + "num_tokens": 8792372615.0, + "step": 17200 + }, + { + "epoch": 4.651433207138994, + "grad_norm": 0.34701743721961975, + "learning_rate": 2.2287916796032308e-06, + "loss": 1.0858, + "mean_token_accuracy": 0.706233561038971, + "num_tokens": 8792896889.0, + "step": 17201 + }, + { + "epoch": 4.651703623580314, + "grad_norm": 0.9397376775741577, + "learning_rate": 2.228438608972243e-06, + "loss": 1.874, + "mean_token_accuracy": 0.5828945636749268, + "num_tokens": 8793421171.0, + "step": 17202 + }, + { + "epoch": 4.651974040021633, + "grad_norm": 0.9614442586898804, + "learning_rate": 2.2280858074785556e-06, + "loss": 1.7254, + "mean_token_accuracy": 0.5995441675186157, + "num_tokens": 8793945303.0, + "step": 17203 + }, + { + "epoch": 4.652244456462953, + "grad_norm": 0.8354688286781311, + "learning_rate": 2.227733275132992e-06, + "loss": 1.8448, + "mean_token_accuracy": 0.5882532596588135, + "num_tokens": 8794469572.0, + "step": 17204 + }, + { + "epoch": 4.6525148729042725, + "grad_norm": 0.8227362632751465, + "learning_rate": 2.2273810119463686e-06, + "loss": 1.7471, + "mean_token_accuracy": 0.5781108140945435, + "num_tokens": 8794993766.0, + "step": 17205 + }, + { + "epoch": 4.6527852893455925, + "grad_norm": 0.7809157371520996, + "learning_rate": 2.2270290179294947e-06, + "loss": 1.7548, + "mean_token_accuracy": 0.5846964716911316, + "num_tokens": 8795518038.0, + "step": 17206 + }, + { + "epoch": 4.653055705786912, + "grad_norm": 1.0443739891052246, + "learning_rate": 2.2266772930931706e-06, + "loss": 1.8412, + "mean_token_accuracy": 0.5658046007156372, + "num_tokens": 8796042187.0, + "step": 17207 + }, + { + "epoch": 4.653326122228232, + "grad_norm": 0.8576847314834595, + "learning_rate": 2.226325837448187e-06, + "loss": 1.7707, + "mean_token_accuracy": 0.5873301029205322, + "num_tokens": 8796521297.0, + "step": 17208 + }, + { + "epoch": 4.653596538669551, + "grad_norm": 0.9187914729118347, + "learning_rate": 2.2259746510053287e-06, + "loss": 1.7715, + "mean_token_accuracy": 0.5812005996704102, + "num_tokens": 8797045451.0, + "step": 17209 + }, + { + "epoch": 4.653866955110871, + "grad_norm": 0.8686415553092957, + "learning_rate": 2.2256237337753705e-06, + "loss": 1.8645, + "mean_token_accuracy": 0.5718203783035278, + "num_tokens": 8797569641.0, + "step": 17210 + }, + { + "epoch": 4.65413737155219, + "grad_norm": 0.977271318435669, + "learning_rate": 2.2252730857690803e-06, + "loss": 1.7286, + "mean_token_accuracy": 0.5839965343475342, + "num_tokens": 8798093757.0, + "step": 17211 + }, + { + "epoch": 4.6544077879935095, + "grad_norm": 0.8827254176139832, + "learning_rate": 2.2249227069972175e-06, + "loss": 1.8378, + "mean_token_accuracy": 0.5808850526809692, + "num_tokens": 8798617933.0, + "step": 17212 + }, + { + "epoch": 4.65467820443483, + "grad_norm": 0.8238694667816162, + "learning_rate": 2.2245725974705305e-06, + "loss": 1.8253, + "mean_token_accuracy": 0.5731731653213501, + "num_tokens": 8799142100.0, + "step": 17213 + }, + { + "epoch": 4.65494862087615, + "grad_norm": 0.9650223851203918, + "learning_rate": 2.224222757199764e-06, + "loss": 1.7647, + "mean_token_accuracy": 0.5923476219177246, + "num_tokens": 8799666289.0, + "step": 17214 + }, + { + "epoch": 4.655219037317469, + "grad_norm": 0.8066261410713196, + "learning_rate": 2.22387318619565e-06, + "loss": 1.8403, + "mean_token_accuracy": 0.5888312458992004, + "num_tokens": 8800103886.0, + "step": 17215 + }, + { + "epoch": 4.655489453758788, + "grad_norm": 0.8795541524887085, + "learning_rate": 2.2235238844689164e-06, + "loss": 1.7151, + "mean_token_accuracy": 0.6097440719604492, + "num_tokens": 8800594194.0, + "step": 17216 + }, + { + "epoch": 4.655759870200108, + "grad_norm": 1.190871000289917, + "learning_rate": 2.22317485203028e-06, + "loss": 1.8097, + "mean_token_accuracy": 0.5888621807098389, + "num_tokens": 8801051076.0, + "step": 17217 + }, + { + "epoch": 4.656030286641428, + "grad_norm": 0.9124366044998169, + "learning_rate": 2.22282608889045e-06, + "loss": 1.8375, + "mean_token_accuracy": 0.5633789896965027, + "num_tokens": 8801542335.0, + "step": 17218 + }, + { + "epoch": 4.6563007030827475, + "grad_norm": 0.6942065954208374, + "learning_rate": 2.2224775950601275e-06, + "loss": 1.6807, + "mean_token_accuracy": 0.6048283576965332, + "num_tokens": 8802037301.0, + "step": 17219 + }, + { + "epoch": 4.656571119524067, + "grad_norm": 0.8293370604515076, + "learning_rate": 2.2221293705500055e-06, + "loss": 1.8421, + "mean_token_accuracy": 0.5713924169540405, + "num_tokens": 8802561423.0, + "step": 17220 + }, + { + "epoch": 4.656841535965387, + "grad_norm": 0.39384475350379944, + "learning_rate": 2.221781415370768e-06, + "loss": 1.067, + "mean_token_accuracy": 0.7017533779144287, + "num_tokens": 8803079855.0, + "step": 17221 + }, + { + "epoch": 4.657111952406706, + "grad_norm": 0.8979785442352295, + "learning_rate": 2.2214337295330915e-06, + "loss": 1.8219, + "mean_token_accuracy": 0.5808030962944031, + "num_tokens": 8803603948.0, + "step": 17222 + }, + { + "epoch": 4.657382368848026, + "grad_norm": 0.8450851440429688, + "learning_rate": 2.221086313047645e-06, + "loss": 1.8666, + "mean_token_accuracy": 0.5756634473800659, + "num_tokens": 8804128014.0, + "step": 17223 + }, + { + "epoch": 4.657652785289345, + "grad_norm": 0.9567673206329346, + "learning_rate": 2.2207391659250886e-06, + "loss": 1.8508, + "mean_token_accuracy": 0.5584439635276794, + "num_tokens": 8804652135.0, + "step": 17224 + }, + { + "epoch": 4.657923201730665, + "grad_norm": 0.7954872846603394, + "learning_rate": 2.220392288176071e-06, + "loss": 1.7754, + "mean_token_accuracy": 0.5967711806297302, + "num_tokens": 8805176383.0, + "step": 17225 + }, + { + "epoch": 4.6581936181719845, + "grad_norm": 0.8519243597984314, + "learning_rate": 2.2200456798112375e-06, + "loss": 1.7468, + "mean_token_accuracy": 0.5832176804542542, + "num_tokens": 8805700496.0, + "step": 17226 + }, + { + "epoch": 4.658464034613305, + "grad_norm": 0.9716761708259583, + "learning_rate": 2.219699340841223e-06, + "loss": 1.9514, + "mean_token_accuracy": 0.5456749796867371, + "num_tokens": 8806173824.0, + "step": 17227 + }, + { + "epoch": 4.658734451054624, + "grad_norm": 0.8370904326438904, + "learning_rate": 2.219353271276653e-06, + "loss": 1.8497, + "mean_token_accuracy": 0.5761332511901855, + "num_tokens": 8806678864.0, + "step": 17228 + }, + { + "epoch": 4.659004867495944, + "grad_norm": 0.7930724620819092, + "learning_rate": 2.2190074711281484e-06, + "loss": 1.7699, + "mean_token_accuracy": 0.5943739414215088, + "num_tokens": 8807195014.0, + "step": 17229 + }, + { + "epoch": 4.659275283937263, + "grad_norm": 0.9293212294578552, + "learning_rate": 2.2186619404063175e-06, + "loss": 1.9218, + "mean_token_accuracy": 0.5618818998336792, + "num_tokens": 8807677824.0, + "step": 17230 + }, + { + "epoch": 4.659545700378583, + "grad_norm": 0.8333704471588135, + "learning_rate": 2.2183166791217615e-06, + "loss": 1.6569, + "mean_token_accuracy": 0.6194562315940857, + "num_tokens": 8808201943.0, + "step": 17231 + }, + { + "epoch": 4.659816116819902, + "grad_norm": 0.7992066144943237, + "learning_rate": 2.217971687285075e-06, + "loss": 1.8173, + "mean_token_accuracy": 0.5881783962249756, + "num_tokens": 8808726132.0, + "step": 17232 + }, + { + "epoch": 4.6600865332612225, + "grad_norm": 0.7902987003326416, + "learning_rate": 2.217626964906844e-06, + "loss": 1.8869, + "mean_token_accuracy": 0.5595167875289917, + "num_tokens": 8809250405.0, + "step": 17233 + }, + { + "epoch": 4.660356949702542, + "grad_norm": 0.8765343427658081, + "learning_rate": 2.217282511997645e-06, + "loss": 1.7866, + "mean_token_accuracy": 0.5513492226600647, + "num_tokens": 8809774532.0, + "step": 17234 + }, + { + "epoch": 4.660627366143862, + "grad_norm": 0.9305617809295654, + "learning_rate": 2.216938328568047e-06, + "loss": 1.796, + "mean_token_accuracy": 0.575435996055603, + "num_tokens": 8810265517.0, + "step": 17235 + }, + { + "epoch": 4.660897782585181, + "grad_norm": 0.8045187592506409, + "learning_rate": 2.216594414628609e-06, + "loss": 1.8644, + "mean_token_accuracy": 0.5704358816146851, + "num_tokens": 8810789738.0, + "step": 17236 + }, + { + "epoch": 4.661168199026501, + "grad_norm": 0.8401902318000793, + "learning_rate": 2.216250770189886e-06, + "loss": 1.8291, + "mean_token_accuracy": 0.5830888748168945, + "num_tokens": 8811313923.0, + "step": 17237 + }, + { + "epoch": 4.66143861546782, + "grad_norm": 0.7973935008049011, + "learning_rate": 2.2159073952624215e-06, + "loss": 1.8603, + "mean_token_accuracy": 0.568800151348114, + "num_tokens": 8811838080.0, + "step": 17238 + }, + { + "epoch": 4.66170903190914, + "grad_norm": 0.8307962417602539, + "learning_rate": 2.2155642898567484e-06, + "loss": 1.8535, + "mean_token_accuracy": 0.5787006616592407, + "num_tokens": 8812362296.0, + "step": 17239 + }, + { + "epoch": 4.6619794483504595, + "grad_norm": 0.897754430770874, + "learning_rate": 2.215221453983397e-06, + "loss": 1.6754, + "mean_token_accuracy": 0.6152591705322266, + "num_tokens": 8812850487.0, + "step": 17240 + }, + { + "epoch": 4.66224986479178, + "grad_norm": 0.3466331958770752, + "learning_rate": 2.2148788876528866e-06, + "loss": 1.193, + "mean_token_accuracy": 0.6823381781578064, + "num_tokens": 8813374703.0, + "step": 17241 + }, + { + "epoch": 4.662520281233099, + "grad_norm": 0.8821172118186951, + "learning_rate": 2.214536590875726e-06, + "loss": 1.76, + "mean_token_accuracy": 0.5861104726791382, + "num_tokens": 8813868516.0, + "step": 17242 + }, + { + "epoch": 4.662790697674419, + "grad_norm": 0.793683648109436, + "learning_rate": 2.2141945636624206e-06, + "loss": 1.8315, + "mean_token_accuracy": 0.5843725800514221, + "num_tokens": 8814373947.0, + "step": 17243 + }, + { + "epoch": 4.663061114115738, + "grad_norm": 0.9326355457305908, + "learning_rate": 2.213852806023463e-06, + "loss": 1.6828, + "mean_token_accuracy": 0.6059324741363525, + "num_tokens": 8814898190.0, + "step": 17244 + }, + { + "epoch": 4.663331530557058, + "grad_norm": 0.7546876668930054, + "learning_rate": 2.213511317969339e-06, + "loss": 1.781, + "mean_token_accuracy": 0.592623770236969, + "num_tokens": 8815422369.0, + "step": 17245 + }, + { + "epoch": 4.663601946998377, + "grad_norm": 0.7553966045379639, + "learning_rate": 2.213170099510528e-06, + "loss": 1.8039, + "mean_token_accuracy": 0.5742682814598083, + "num_tokens": 8815941991.0, + "step": 17246 + }, + { + "epoch": 4.6638723634396975, + "grad_norm": 0.752724289894104, + "learning_rate": 2.212829150657498e-06, + "loss": 1.7088, + "mean_token_accuracy": 0.5908505916595459, + "num_tokens": 8816463795.0, + "step": 17247 + }, + { + "epoch": 4.664142779881017, + "grad_norm": 0.7813534736633301, + "learning_rate": 2.2124884714207105e-06, + "loss": 1.8157, + "mean_token_accuracy": 0.5535151958465576, + "num_tokens": 8816988067.0, + "step": 17248 + }, + { + "epoch": 4.664413196322337, + "grad_norm": 0.7950233221054077, + "learning_rate": 2.212148061810619e-06, + "loss": 1.8075, + "mean_token_accuracy": 0.5758999586105347, + "num_tokens": 8817512256.0, + "step": 17249 + }, + { + "epoch": 4.664683612763656, + "grad_norm": 0.8090137839317322, + "learning_rate": 2.21180792183767e-06, + "loss": 1.7883, + "mean_token_accuracy": 0.5869960188865662, + "num_tokens": 8818036426.0, + "step": 17250 + }, + { + "epoch": 4.664954029204976, + "grad_norm": 0.8203831315040588, + "learning_rate": 2.2114680515122967e-06, + "loss": 1.7902, + "mean_token_accuracy": 0.5627768039703369, + "num_tokens": 8818560675.0, + "step": 17251 + }, + { + "epoch": 4.665224445646295, + "grad_norm": 0.8121110200881958, + "learning_rate": 2.2111284508449306e-06, + "loss": 1.8051, + "mean_token_accuracy": 0.5595149993896484, + "num_tokens": 8819084861.0, + "step": 17252 + }, + { + "epoch": 4.665494862087614, + "grad_norm": 0.8628784418106079, + "learning_rate": 2.2107891198459895e-06, + "loss": 1.8739, + "mean_token_accuracy": 0.5716977715492249, + "num_tokens": 8819589693.0, + "step": 17253 + }, + { + "epoch": 4.6657652785289345, + "grad_norm": 0.8196929693222046, + "learning_rate": 2.2104500585258844e-06, + "loss": 1.8241, + "mean_token_accuracy": 0.569904088973999, + "num_tokens": 8820113915.0, + "step": 17254 + }, + { + "epoch": 4.666035694970255, + "grad_norm": 0.963147759437561, + "learning_rate": 2.210111266895021e-06, + "loss": 1.7934, + "mean_token_accuracy": 0.5711708664894104, + "num_tokens": 8820631909.0, + "step": 17255 + }, + { + "epoch": 4.666306111411574, + "grad_norm": 0.8821985125541687, + "learning_rate": 2.209772744963793e-06, + "loss": 1.9532, + "mean_token_accuracy": 0.5594509243965149, + "num_tokens": 8821156156.0, + "step": 17256 + }, + { + "epoch": 4.666576527852893, + "grad_norm": 0.7595254182815552, + "learning_rate": 2.209434492742587e-06, + "loss": 1.8384, + "mean_token_accuracy": 0.5688061118125916, + "num_tokens": 8821680334.0, + "step": 17257 + }, + { + "epoch": 4.666846944294213, + "grad_norm": 0.9132283926010132, + "learning_rate": 2.209096510241784e-06, + "loss": 1.7401, + "mean_token_accuracy": 0.5864849090576172, + "num_tokens": 8822204600.0, + "step": 17258 + }, + { + "epoch": 4.667117360735533, + "grad_norm": 0.86202472448349, + "learning_rate": 2.20875879747175e-06, + "loss": 1.757, + "mean_token_accuracy": 0.5898383855819702, + "num_tokens": 8822728858.0, + "step": 17259 + }, + { + "epoch": 4.667387777176852, + "grad_norm": 0.8741742968559265, + "learning_rate": 2.2084213544428518e-06, + "loss": 1.866, + "mean_token_accuracy": 0.5693701505661011, + "num_tokens": 8823229645.0, + "step": 17260 + }, + { + "epoch": 4.667658193618172, + "grad_norm": 0.3506964147090912, + "learning_rate": 2.2080841811654393e-06, + "loss": 1.0539, + "mean_token_accuracy": 0.722848653793335, + "num_tokens": 8823753712.0, + "step": 17261 + }, + { + "epoch": 4.667928610059492, + "grad_norm": 0.7826945185661316, + "learning_rate": 2.207747277649859e-06, + "loss": 1.8439, + "mean_token_accuracy": 0.5700850486755371, + "num_tokens": 8824277896.0, + "step": 17262 + }, + { + "epoch": 4.668199026500811, + "grad_norm": 0.8564316630363464, + "learning_rate": 2.2074106439064493e-06, + "loss": 1.7775, + "mean_token_accuracy": 0.5785689353942871, + "num_tokens": 8824802102.0, + "step": 17263 + }, + { + "epoch": 4.668469442942131, + "grad_norm": 0.7918084263801575, + "learning_rate": 2.207074279945539e-06, + "loss": 1.712, + "mean_token_accuracy": 0.580186665058136, + "num_tokens": 8825326340.0, + "step": 17264 + }, + { + "epoch": 4.66873985938345, + "grad_norm": 0.8608539700508118, + "learning_rate": 2.2067381857774465e-06, + "loss": 1.7522, + "mean_token_accuracy": 0.6026296615600586, + "num_tokens": 8825850589.0, + "step": 17265 + }, + { + "epoch": 4.66901027582477, + "grad_norm": 0.8695576190948486, + "learning_rate": 2.206402361412487e-06, + "loss": 1.8775, + "mean_token_accuracy": 0.5643399953842163, + "num_tokens": 8826269542.0, + "step": 17266 + }, + { + "epoch": 4.669280692266089, + "grad_norm": 0.8198569416999817, + "learning_rate": 2.206066806860964e-06, + "loss": 1.7956, + "mean_token_accuracy": 0.5808387994766235, + "num_tokens": 8826793821.0, + "step": 17267 + }, + { + "epoch": 4.6695511087074095, + "grad_norm": 0.7477181553840637, + "learning_rate": 2.2057315221331706e-06, + "loss": 1.7991, + "mean_token_accuracy": 0.571887731552124, + "num_tokens": 8827317943.0, + "step": 17268 + }, + { + "epoch": 4.669821525148729, + "grad_norm": 1.0160778760910034, + "learning_rate": 2.205396507239398e-06, + "loss": 1.8301, + "mean_token_accuracy": 0.5857479572296143, + "num_tokens": 8827820497.0, + "step": 17269 + }, + { + "epoch": 4.670091941590049, + "grad_norm": 0.8572158217430115, + "learning_rate": 2.2050617621899237e-06, + "loss": 1.6715, + "mean_token_accuracy": 0.6018930673599243, + "num_tokens": 8828344662.0, + "step": 17270 + }, + { + "epoch": 4.670362358031368, + "grad_norm": 0.874317467212677, + "learning_rate": 2.204727286995018e-06, + "loss": 1.8996, + "mean_token_accuracy": 0.5708490610122681, + "num_tokens": 8828840226.0, + "step": 17271 + }, + { + "epoch": 4.670632774472688, + "grad_norm": 0.8181436657905579, + "learning_rate": 2.2043930816649444e-06, + "loss": 1.8271, + "mean_token_accuracy": 0.6098072528839111, + "num_tokens": 8829252878.0, + "step": 17272 + }, + { + "epoch": 4.670903190914007, + "grad_norm": 0.8471354842185974, + "learning_rate": 2.2040591462099587e-06, + "loss": 1.8979, + "mean_token_accuracy": 0.5425125360488892, + "num_tokens": 8829777024.0, + "step": 17273 + }, + { + "epoch": 4.671173607355327, + "grad_norm": 0.7847505211830139, + "learning_rate": 2.2037254806403034e-06, + "loss": 1.7164, + "mean_token_accuracy": 0.6014920473098755, + "num_tokens": 8830301141.0, + "step": 17274 + }, + { + "epoch": 4.671444023796647, + "grad_norm": 0.8494294881820679, + "learning_rate": 2.2033920849662207e-06, + "loss": 1.8626, + "mean_token_accuracy": 0.5807472467422485, + "num_tokens": 8830825347.0, + "step": 17275 + }, + { + "epoch": 4.671714440237967, + "grad_norm": 0.89814293384552, + "learning_rate": 2.2030589591979364e-06, + "loss": 1.747, + "mean_token_accuracy": 0.6007068157196045, + "num_tokens": 8831349621.0, + "step": 17276 + }, + { + "epoch": 4.671984856679286, + "grad_norm": 0.7315708994865417, + "learning_rate": 2.2027261033456733e-06, + "loss": 1.83, + "mean_token_accuracy": 0.5895353555679321, + "num_tokens": 8831873894.0, + "step": 17277 + }, + { + "epoch": 4.672255273120606, + "grad_norm": 0.7359226942062378, + "learning_rate": 2.2023935174196446e-06, + "loss": 1.7511, + "mean_token_accuracy": 0.6074279546737671, + "num_tokens": 8832345182.0, + "step": 17278 + }, + { + "epoch": 4.672525689561925, + "grad_norm": 0.7904064059257507, + "learning_rate": 2.2020612014300545e-06, + "loss": 1.8959, + "mean_token_accuracy": 0.5470166206359863, + "num_tokens": 8832869401.0, + "step": 17279 + }, + { + "epoch": 4.672796106003245, + "grad_norm": 0.8142538666725159, + "learning_rate": 2.2017291553871013e-06, + "loss": 1.8379, + "mean_token_accuracy": 0.5852431058883667, + "num_tokens": 8833393638.0, + "step": 17280 + }, + { + "epoch": 4.673066522444564, + "grad_norm": 0.32374197244644165, + "learning_rate": 2.201397379300971e-06, + "loss": 1.0946, + "mean_token_accuracy": 0.7056059241294861, + "num_tokens": 8833917825.0, + "step": 17281 + }, + { + "epoch": 4.6733369388858845, + "grad_norm": 0.8832825422286987, + "learning_rate": 2.2010658731818433e-06, + "loss": 1.7711, + "mean_token_accuracy": 0.5878826379776001, + "num_tokens": 8834441965.0, + "step": 17282 + }, + { + "epoch": 4.673607355327204, + "grad_norm": 0.9021419286727905, + "learning_rate": 2.200734637039891e-06, + "loss": 1.838, + "mean_token_accuracy": 0.5789152979850769, + "num_tokens": 8834955654.0, + "step": 17283 + }, + { + "epoch": 4.673877771768524, + "grad_norm": 0.8491938710212708, + "learning_rate": 2.200403670885278e-06, + "loss": 1.7988, + "mean_token_accuracy": 0.5752241611480713, + "num_tokens": 8835479816.0, + "step": 17284 + }, + { + "epoch": 4.674148188209843, + "grad_norm": 0.8029590845108032, + "learning_rate": 2.200072974728157e-06, + "loss": 1.8565, + "mean_token_accuracy": 0.5644059181213379, + "num_tokens": 8835952686.0, + "step": 17285 + }, + { + "epoch": 4.674418604651163, + "grad_norm": 0.8698948621749878, + "learning_rate": 2.199742548578677e-06, + "loss": 1.8239, + "mean_token_accuracy": 0.5743876695632935, + "num_tokens": 8836476896.0, + "step": 17286 + }, + { + "epoch": 4.674689021092482, + "grad_norm": 0.7906275987625122, + "learning_rate": 2.199412392446975e-06, + "loss": 1.7905, + "mean_token_accuracy": 0.5789411067962646, + "num_tokens": 8837001136.0, + "step": 17287 + }, + { + "epoch": 4.674959437533802, + "grad_norm": 0.8450767397880554, + "learning_rate": 2.1990825063431814e-06, + "loss": 1.7703, + "mean_token_accuracy": 0.587742805480957, + "num_tokens": 8837525342.0, + "step": 17288 + }, + { + "epoch": 4.675229853975122, + "grad_norm": 0.8571842908859253, + "learning_rate": 2.198752890277419e-06, + "loss": 1.7143, + "mean_token_accuracy": 0.596469521522522, + "num_tokens": 8838049611.0, + "step": 17289 + }, + { + "epoch": 4.675500270416442, + "grad_norm": 0.835813581943512, + "learning_rate": 2.1984235442597997e-06, + "loss": 1.8152, + "mean_token_accuracy": 0.5882905721664429, + "num_tokens": 8838546005.0, + "step": 17290 + }, + { + "epoch": 4.675770686857761, + "grad_norm": 0.8650466799736023, + "learning_rate": 2.1980944683004307e-06, + "loss": 1.7764, + "mean_token_accuracy": 0.5776107311248779, + "num_tokens": 8839070249.0, + "step": 17291 + }, + { + "epoch": 4.676041103299081, + "grad_norm": 0.7424260377883911, + "learning_rate": 2.1977656624094076e-06, + "loss": 1.6337, + "mean_token_accuracy": 0.6155385971069336, + "num_tokens": 8839594517.0, + "step": 17292 + }, + { + "epoch": 4.6763115197404, + "grad_norm": 0.9398472309112549, + "learning_rate": 2.1974371265968195e-06, + "loss": 1.8775, + "mean_token_accuracy": 0.5706210136413574, + "num_tokens": 8840118687.0, + "step": 17293 + }, + { + "epoch": 4.676581936181719, + "grad_norm": 0.8465918898582458, + "learning_rate": 2.197108860872747e-06, + "loss": 1.8307, + "mean_token_accuracy": 0.5613726377487183, + "num_tokens": 8840642973.0, + "step": 17294 + }, + { + "epoch": 4.676852352623039, + "grad_norm": 1.046474575996399, + "learning_rate": 2.196780865247263e-06, + "loss": 1.7965, + "mean_token_accuracy": 0.5716365575790405, + "num_tokens": 8841076702.0, + "step": 17295 + }, + { + "epoch": 4.6771227690643595, + "grad_norm": 0.775453507900238, + "learning_rate": 2.196453139730431e-06, + "loss": 1.89, + "mean_token_accuracy": 0.5713704824447632, + "num_tokens": 8841546883.0, + "step": 17296 + }, + { + "epoch": 4.677393185505679, + "grad_norm": 0.8822639584541321, + "learning_rate": 2.196125684332305e-06, + "loss": 1.7865, + "mean_token_accuracy": 0.5963344573974609, + "num_tokens": 8842071058.0, + "step": 17297 + }, + { + "epoch": 4.677663601946998, + "grad_norm": 0.8835738897323608, + "learning_rate": 2.1957984990629345e-06, + "loss": 1.9963, + "mean_token_accuracy": 0.5501824617385864, + "num_tokens": 8842493104.0, + "step": 17298 + }, + { + "epoch": 4.677934018388318, + "grad_norm": 1.0069910287857056, + "learning_rate": 2.195471583932357e-06, + "loss": 1.8341, + "mean_token_accuracy": 0.5728811621665955, + "num_tokens": 8843017166.0, + "step": 17299 + }, + { + "epoch": 4.678204434829638, + "grad_norm": 0.732319712638855, + "learning_rate": 2.195144938950603e-06, + "loss": 1.8629, + "mean_token_accuracy": 0.5688921213150024, + "num_tokens": 8843541311.0, + "step": 17300 + }, + { + "epoch": 4.678474851270957, + "grad_norm": 0.33193737268447876, + "learning_rate": 2.1948185641276963e-06, + "loss": 1.1554, + "mean_token_accuracy": 0.6890288591384888, + "num_tokens": 8844065479.0, + "step": 17301 + }, + { + "epoch": 4.6787452677122765, + "grad_norm": 0.8410934209823608, + "learning_rate": 2.1944924594736504e-06, + "loss": 1.8622, + "mean_token_accuracy": 0.5612457394599915, + "num_tokens": 8844589639.0, + "step": 17302 + }, + { + "epoch": 4.679015684153597, + "grad_norm": 0.8755311369895935, + "learning_rate": 2.1941666249984724e-06, + "loss": 1.818, + "mean_token_accuracy": 0.5839207768440247, + "num_tokens": 8845113874.0, + "step": 17303 + }, + { + "epoch": 4.679286100594916, + "grad_norm": 0.8995140790939331, + "learning_rate": 2.1938410607121573e-06, + "loss": 1.8579, + "mean_token_accuracy": 0.5576527118682861, + "num_tokens": 8845598436.0, + "step": 17304 + }, + { + "epoch": 4.679556517036236, + "grad_norm": 0.995785117149353, + "learning_rate": 2.1935157666246958e-06, + "loss": 1.902, + "mean_token_accuracy": 0.5624648332595825, + "num_tokens": 8846122569.0, + "step": 17305 + }, + { + "epoch": 4.679826933477555, + "grad_norm": 0.9719892144203186, + "learning_rate": 2.1931907427460676e-06, + "loss": 1.9021, + "mean_token_accuracy": 0.5763037204742432, + "num_tokens": 8846646757.0, + "step": 17306 + }, + { + "epoch": 4.680097349918875, + "grad_norm": 0.8645568490028381, + "learning_rate": 2.1928659890862487e-06, + "loss": 1.9037, + "mean_token_accuracy": 0.5615347623825073, + "num_tokens": 8847106254.0, + "step": 17307 + }, + { + "epoch": 4.680367766360194, + "grad_norm": 0.7722135782241821, + "learning_rate": 2.192541505655199e-06, + "loss": 1.7423, + "mean_token_accuracy": 0.5937424898147583, + "num_tokens": 8847630454.0, + "step": 17308 + }, + { + "epoch": 4.6806381828015144, + "grad_norm": 0.7481585144996643, + "learning_rate": 2.192217292462879e-06, + "loss": 1.7349, + "mean_token_accuracy": 0.5629872679710388, + "num_tokens": 8848154681.0, + "step": 17309 + }, + { + "epoch": 4.680908599242834, + "grad_norm": 0.7450268864631653, + "learning_rate": 2.191893349519234e-06, + "loss": 1.71, + "mean_token_accuracy": 0.6110777854919434, + "num_tokens": 8848678855.0, + "step": 17310 + }, + { + "epoch": 4.681179015684154, + "grad_norm": 0.8088111281394958, + "learning_rate": 2.1915696768342042e-06, + "loss": 1.7902, + "mean_token_accuracy": 0.5782700777053833, + "num_tokens": 8849202942.0, + "step": 17311 + }, + { + "epoch": 4.681449432125473, + "grad_norm": 0.9764384627342224, + "learning_rate": 2.1912462744177205e-06, + "loss": 1.7499, + "mean_token_accuracy": 0.6095350980758667, + "num_tokens": 8849727226.0, + "step": 17312 + }, + { + "epoch": 4.681719848566793, + "grad_norm": 0.8062869310379028, + "learning_rate": 2.1909231422797055e-06, + "loss": 1.8224, + "mean_token_accuracy": 0.5682207345962524, + "num_tokens": 8850251414.0, + "step": 17313 + }, + { + "epoch": 4.681990265008112, + "grad_norm": 1.00806725025177, + "learning_rate": 2.190600280430074e-06, + "loss": 1.7959, + "mean_token_accuracy": 0.5732276439666748, + "num_tokens": 8850714653.0, + "step": 17314 + }, + { + "epoch": 4.682260681449432, + "grad_norm": 0.763457179069519, + "learning_rate": 2.1902776888787344e-06, + "loss": 1.8038, + "mean_token_accuracy": 0.5904747247695923, + "num_tokens": 8851238908.0, + "step": 17315 + }, + { + "epoch": 4.6825310978907515, + "grad_norm": 1.1063554286956787, + "learning_rate": 2.1899553676355815e-06, + "loss": 1.9155, + "mean_token_accuracy": 0.5515990853309631, + "num_tokens": 8851756174.0, + "step": 17316 + }, + { + "epoch": 4.682801514332072, + "grad_norm": 0.9763787388801575, + "learning_rate": 2.189633316710507e-06, + "loss": 1.9366, + "mean_token_accuracy": 0.563347578048706, + "num_tokens": 8852228100.0, + "step": 17317 + }, + { + "epoch": 4.683071930773391, + "grad_norm": 0.8555402159690857, + "learning_rate": 2.189311536113392e-06, + "loss": 1.9129, + "mean_token_accuracy": 0.5598392486572266, + "num_tokens": 8852752102.0, + "step": 17318 + }, + { + "epoch": 4.683342347214711, + "grad_norm": 0.8124901056289673, + "learning_rate": 2.1889900258541085e-06, + "loss": 1.7929, + "mean_token_accuracy": 0.5899626016616821, + "num_tokens": 8853276328.0, + "step": 17319 + }, + { + "epoch": 4.68361276365603, + "grad_norm": 0.826125979423523, + "learning_rate": 2.1886687859425237e-06, + "loss": 1.7552, + "mean_token_accuracy": 0.6067581176757812, + "num_tokens": 8853800500.0, + "step": 17320 + }, + { + "epoch": 4.68388318009735, + "grad_norm": 0.41058701276779175, + "learning_rate": 2.1883478163884926e-06, + "loss": 0.9967, + "mean_token_accuracy": 0.7200675010681152, + "num_tokens": 8854279497.0, + "step": 17321 + }, + { + "epoch": 4.684153596538669, + "grad_norm": 0.8127776384353638, + "learning_rate": 2.1880271172018637e-06, + "loss": 1.8779, + "mean_token_accuracy": 0.5640488862991333, + "num_tokens": 8854803690.0, + "step": 17322 + }, + { + "epoch": 4.6844240129799894, + "grad_norm": 0.9890890121459961, + "learning_rate": 2.1877066883924777e-06, + "loss": 1.8653, + "mean_token_accuracy": 0.5865034461021423, + "num_tokens": 8855327802.0, + "step": 17323 + }, + { + "epoch": 4.684694429421309, + "grad_norm": 0.8109898567199707, + "learning_rate": 2.1873865299701654e-06, + "loss": 1.7905, + "mean_token_accuracy": 0.5841891765594482, + "num_tokens": 8855852064.0, + "step": 17324 + }, + { + "epoch": 4.684964845862629, + "grad_norm": 0.798943817615509, + "learning_rate": 2.1870666419447507e-06, + "loss": 1.7474, + "mean_token_accuracy": 0.6125520467758179, + "num_tokens": 8856376303.0, + "step": 17325 + }, + { + "epoch": 4.685235262303948, + "grad_norm": 0.7491666078567505, + "learning_rate": 2.1867470243260485e-06, + "loss": 1.6416, + "mean_token_accuracy": 0.6063700914382935, + "num_tokens": 8856860412.0, + "step": 17326 + }, + { + "epoch": 4.685505678745268, + "grad_norm": 0.998425304889679, + "learning_rate": 2.186427677123866e-06, + "loss": 1.8034, + "mean_token_accuracy": 0.596489429473877, + "num_tokens": 8857261827.0, + "step": 17327 + }, + { + "epoch": 4.685776095186587, + "grad_norm": 1.042922854423523, + "learning_rate": 2.1861086003480014e-06, + "loss": 1.8526, + "mean_token_accuracy": 0.5663503408432007, + "num_tokens": 8857729985.0, + "step": 17328 + }, + { + "epoch": 4.686046511627907, + "grad_norm": 0.8852577805519104, + "learning_rate": 2.185789794008245e-06, + "loss": 1.897, + "mean_token_accuracy": 0.5661773681640625, + "num_tokens": 8858254036.0, + "step": 17329 + }, + { + "epoch": 4.6863169280692265, + "grad_norm": 0.9233279228210449, + "learning_rate": 2.185471258114378e-06, + "loss": 1.8086, + "mean_token_accuracy": 0.5857650637626648, + "num_tokens": 8858778264.0, + "step": 17330 + }, + { + "epoch": 4.686587344510547, + "grad_norm": 0.8065609931945801, + "learning_rate": 2.185152992676174e-06, + "loss": 1.8875, + "mean_token_accuracy": 0.5761796832084656, + "num_tokens": 8859302509.0, + "step": 17331 + }, + { + "epoch": 4.686857760951866, + "grad_norm": 0.8253031373023987, + "learning_rate": 2.184834997703401e-06, + "loss": 1.8074, + "mean_token_accuracy": 0.5741811990737915, + "num_tokens": 8859826687.0, + "step": 17332 + }, + { + "epoch": 4.687128177393186, + "grad_norm": 0.8735520243644714, + "learning_rate": 2.184517273205812e-06, + "loss": 1.8501, + "mean_token_accuracy": 0.5474472045898438, + "num_tokens": 8860350870.0, + "step": 17333 + }, + { + "epoch": 4.687398593834505, + "grad_norm": 0.8600065112113953, + "learning_rate": 2.1841998191931596e-06, + "loss": 1.8872, + "mean_token_accuracy": 0.5754525065422058, + "num_tokens": 8860875031.0, + "step": 17334 + }, + { + "epoch": 4.687669010275824, + "grad_norm": 0.9970088601112366, + "learning_rate": 2.18388263567518e-06, + "loss": 1.9145, + "mean_token_accuracy": 0.5635808110237122, + "num_tokens": 8861399250.0, + "step": 17335 + }, + { + "epoch": 4.687939426717144, + "grad_norm": 0.8274598121643066, + "learning_rate": 2.1835657226616095e-06, + "loss": 1.7588, + "mean_token_accuracy": 0.5904669761657715, + "num_tokens": 8861887112.0, + "step": 17336 + }, + { + "epoch": 4.6882098431584645, + "grad_norm": 0.7957760691642761, + "learning_rate": 2.183249080162169e-06, + "loss": 1.8056, + "mean_token_accuracy": 0.576795756816864, + "num_tokens": 8862411354.0, + "step": 17337 + }, + { + "epoch": 4.688480259599784, + "grad_norm": 0.9146867990493774, + "learning_rate": 2.1829327081865755e-06, + "loss": 1.73, + "mean_token_accuracy": 0.5831345319747925, + "num_tokens": 8862935558.0, + "step": 17338 + }, + { + "epoch": 4.688750676041103, + "grad_norm": 0.8724446892738342, + "learning_rate": 2.182616606744537e-06, + "loss": 1.7152, + "mean_token_accuracy": 0.5996711254119873, + "num_tokens": 8863438873.0, + "step": 17339 + }, + { + "epoch": 4.689021092482423, + "grad_norm": 0.950040876865387, + "learning_rate": 2.1823007758457495e-06, + "loss": 1.8175, + "mean_token_accuracy": 0.5849215984344482, + "num_tokens": 8863963021.0, + "step": 17340 + }, + { + "epoch": 4.689291508923743, + "grad_norm": 0.3427742123603821, + "learning_rate": 2.1819852154999064e-06, + "loss": 1.042, + "mean_token_accuracy": 0.7163940668106079, + "num_tokens": 8864485105.0, + "step": 17341 + }, + { + "epoch": 4.689561925365062, + "grad_norm": 0.9351152777671814, + "learning_rate": 2.181669925716689e-06, + "loss": 1.8787, + "mean_token_accuracy": 0.5857251882553101, + "num_tokens": 8864980487.0, + "step": 17342 + }, + { + "epoch": 4.689832341806381, + "grad_norm": 0.8869157433509827, + "learning_rate": 2.18135490650577e-06, + "loss": 1.8762, + "mean_token_accuracy": 0.5827573537826538, + "num_tokens": 8865446797.0, + "step": 17343 + }, + { + "epoch": 4.6901027582477015, + "grad_norm": 0.8783144354820251, + "learning_rate": 2.181040157876818e-06, + "loss": 1.8168, + "mean_token_accuracy": 0.5748222470283508, + "num_tokens": 8865971009.0, + "step": 17344 + }, + { + "epoch": 4.690373174689021, + "grad_norm": 0.9482119679450989, + "learning_rate": 2.180725679839488e-06, + "loss": 1.8472, + "mean_token_accuracy": 0.5614191293716431, + "num_tokens": 8866483102.0, + "step": 17345 + }, + { + "epoch": 4.690643591130341, + "grad_norm": 0.7654333114624023, + "learning_rate": 2.1804114724034304e-06, + "loss": 1.6143, + "mean_token_accuracy": 0.6391538381576538, + "num_tokens": 8867007348.0, + "step": 17346 + }, + { + "epoch": 4.69091400757166, + "grad_norm": 0.9302517175674438, + "learning_rate": 2.1800975355782856e-06, + "loss": 1.8479, + "mean_token_accuracy": 0.5752145051956177, + "num_tokens": 8867531615.0, + "step": 17347 + }, + { + "epoch": 4.69118442401298, + "grad_norm": 0.7576962113380432, + "learning_rate": 2.1797838693736844e-06, + "loss": 1.7506, + "mean_token_accuracy": 0.5929505825042725, + "num_tokens": 8868055870.0, + "step": 17348 + }, + { + "epoch": 4.691454840454299, + "grad_norm": 0.9566309452056885, + "learning_rate": 2.179470473799254e-06, + "loss": 1.7924, + "mean_token_accuracy": 0.5999138951301575, + "num_tokens": 8868580138.0, + "step": 17349 + }, + { + "epoch": 4.691725256895619, + "grad_norm": 1.0715445280075073, + "learning_rate": 2.1791573488646086e-06, + "loss": 1.8899, + "mean_token_accuracy": 0.5689696073532104, + "num_tokens": 8869062668.0, + "step": 17350 + }, + { + "epoch": 4.691995673336939, + "grad_norm": 0.9357873797416687, + "learning_rate": 2.1788444945793566e-06, + "loss": 1.7789, + "mean_token_accuracy": 0.5927236080169678, + "num_tokens": 8869524484.0, + "step": 17351 + }, + { + "epoch": 4.692266089778259, + "grad_norm": 0.8228414058685303, + "learning_rate": 2.1785319109530966e-06, + "loss": 1.8725, + "mean_token_accuracy": 0.5774824023246765, + "num_tokens": 8870048738.0, + "step": 17352 + }, + { + "epoch": 4.692536506219578, + "grad_norm": 0.7129482626914978, + "learning_rate": 2.17821959799542e-06, + "loss": 1.5645, + "mean_token_accuracy": 0.6060627102851868, + "num_tokens": 8870572927.0, + "step": 17353 + }, + { + "epoch": 4.692806922660898, + "grad_norm": 1.0817930698394775, + "learning_rate": 2.1779075557159084e-06, + "loss": 1.8548, + "mean_token_accuracy": 0.5799356698989868, + "num_tokens": 8871077721.0, + "step": 17354 + }, + { + "epoch": 4.693077339102217, + "grad_norm": 0.908469557762146, + "learning_rate": 2.1775957841241383e-06, + "loss": 1.9002, + "mean_token_accuracy": 0.5753699541091919, + "num_tokens": 8871601994.0, + "step": 17355 + }, + { + "epoch": 4.693347755543537, + "grad_norm": 1.565603256225586, + "learning_rate": 2.1772842832296744e-06, + "loss": 1.3311, + "mean_token_accuracy": 0.6208519339561462, + "num_tokens": 8872126215.0, + "step": 17356 + }, + { + "epoch": 4.693618171984856, + "grad_norm": 0.8856909871101379, + "learning_rate": 2.1769730530420747e-06, + "loss": 1.7584, + "mean_token_accuracy": 0.5779441595077515, + "num_tokens": 8872650494.0, + "step": 17357 + }, + { + "epoch": 4.6938885884261765, + "grad_norm": 1.0775630474090576, + "learning_rate": 2.1766620935708882e-06, + "loss": 1.5971, + "mean_token_accuracy": 0.6163977384567261, + "num_tokens": 8873174746.0, + "step": 17358 + }, + { + "epoch": 4.694159004867496, + "grad_norm": 0.845055341720581, + "learning_rate": 2.1763514048256563e-06, + "loss": 1.7302, + "mean_token_accuracy": 0.5894291996955872, + "num_tokens": 8873698831.0, + "step": 17359 + }, + { + "epoch": 4.694429421308816, + "grad_norm": 0.8063697814941406, + "learning_rate": 2.1760409868159115e-06, + "loss": 1.7503, + "mean_token_accuracy": 0.5833538770675659, + "num_tokens": 8874223035.0, + "step": 17360 + }, + { + "epoch": 4.694699837750135, + "grad_norm": 2.1575369834899902, + "learning_rate": 2.17573083955118e-06, + "loss": 0.9299, + "mean_token_accuracy": 0.7478970289230347, + "num_tokens": 8874713992.0, + "step": 17361 + }, + { + "epoch": 4.694970254191455, + "grad_norm": 0.9289026260375977, + "learning_rate": 2.1754209630409755e-06, + "loss": 1.9898, + "mean_token_accuracy": 0.5466794967651367, + "num_tokens": 8875238194.0, + "step": 17362 + }, + { + "epoch": 4.695240670632774, + "grad_norm": 0.8358498215675354, + "learning_rate": 2.175111357294808e-06, + "loss": 1.7811, + "mean_token_accuracy": 0.5920642614364624, + "num_tokens": 8875762273.0, + "step": 17363 + }, + { + "epoch": 4.695511087074094, + "grad_norm": 0.9221353530883789, + "learning_rate": 2.174802022322176e-06, + "loss": 1.8731, + "mean_token_accuracy": 0.5752688646316528, + "num_tokens": 8876286551.0, + "step": 17364 + }, + { + "epoch": 4.695781503515414, + "grad_norm": 0.7635986804962158, + "learning_rate": 2.1744929581325723e-06, + "loss": 1.5747, + "mean_token_accuracy": 0.6339899301528931, + "num_tokens": 8876810837.0, + "step": 17365 + }, + { + "epoch": 4.696051919956734, + "grad_norm": 0.973798930644989, + "learning_rate": 2.174184164735478e-06, + "loss": 1.8186, + "mean_token_accuracy": 0.5701045989990234, + "num_tokens": 8877334980.0, + "step": 17366 + }, + { + "epoch": 4.696322336398053, + "grad_norm": 0.8427472114562988, + "learning_rate": 2.173875642140368e-06, + "loss": 1.8421, + "mean_token_accuracy": 0.5665746331214905, + "num_tokens": 8877859117.0, + "step": 17367 + }, + { + "epoch": 4.696592752839373, + "grad_norm": 0.6798736453056335, + "learning_rate": 2.1735673903567096e-06, + "loss": 1.549, + "mean_token_accuracy": 0.6377485394477844, + "num_tokens": 8878383381.0, + "step": 17368 + }, + { + "epoch": 4.696863169280692, + "grad_norm": 0.9121007919311523, + "learning_rate": 2.1732594093939614e-06, + "loss": 1.7934, + "mean_token_accuracy": 0.6022017002105713, + "num_tokens": 8878863412.0, + "step": 17369 + }, + { + "epoch": 4.697133585722012, + "grad_norm": 0.9687821865081787, + "learning_rate": 2.1729516992615706e-06, + "loss": 1.788, + "mean_token_accuracy": 0.5726796984672546, + "num_tokens": 8879387527.0, + "step": 17370 + }, + { + "epoch": 4.697404002163331, + "grad_norm": 0.9221629500389099, + "learning_rate": 2.172644259968982e-06, + "loss": 1.8277, + "mean_token_accuracy": 0.5777170658111572, + "num_tokens": 8879911659.0, + "step": 17371 + }, + { + "epoch": 4.6976744186046515, + "grad_norm": 0.9279979467391968, + "learning_rate": 2.1723370915256258e-06, + "loss": 1.9111, + "mean_token_accuracy": 0.5846987962722778, + "num_tokens": 8880358873.0, + "step": 17372 + }, + { + "epoch": 4.697944835045971, + "grad_norm": 0.85350102186203, + "learning_rate": 2.1720301939409294e-06, + "loss": 1.8402, + "mean_token_accuracy": 0.5751187801361084, + "num_tokens": 8880882978.0, + "step": 17373 + }, + { + "epoch": 4.698215251487291, + "grad_norm": 0.9616483449935913, + "learning_rate": 2.171723567224306e-06, + "loss": 1.8432, + "mean_token_accuracy": 0.5533132553100586, + "num_tokens": 8881407206.0, + "step": 17374 + }, + { + "epoch": 4.69848566792861, + "grad_norm": 1.0261483192443848, + "learning_rate": 2.1714172113851683e-06, + "loss": 1.8882, + "mean_token_accuracy": 0.5513437986373901, + "num_tokens": 8881883126.0, + "step": 17375 + }, + { + "epoch": 4.698756084369929, + "grad_norm": 0.8286669254302979, + "learning_rate": 2.171111126432911e-06, + "loss": 1.8644, + "mean_token_accuracy": 0.57916659116745, + "num_tokens": 8882402155.0, + "step": 17376 + }, + { + "epoch": 4.699026500811249, + "grad_norm": 0.9248493909835815, + "learning_rate": 2.1708053123769303e-06, + "loss": 1.7987, + "mean_token_accuracy": 0.5796909928321838, + "num_tokens": 8882891230.0, + "step": 17377 + }, + { + "epoch": 4.699296917252569, + "grad_norm": 0.8345362544059753, + "learning_rate": 2.170499769226607e-06, + "loss": 1.8166, + "mean_token_accuracy": 0.5866854190826416, + "num_tokens": 8883415502.0, + "step": 17378 + }, + { + "epoch": 4.699567333693889, + "grad_norm": 0.8328090906143188, + "learning_rate": 2.1701944969913173e-06, + "loss": 1.7753, + "mean_token_accuracy": 0.5799533128738403, + "num_tokens": 8883939775.0, + "step": 17379 + }, + { + "epoch": 4.699837750135208, + "grad_norm": 0.9244831800460815, + "learning_rate": 2.169889495680426e-06, + "loss": 1.821, + "mean_token_accuracy": 0.5895564556121826, + "num_tokens": 8884403159.0, + "step": 17380 + }, + { + "epoch": 4.700108166576528, + "grad_norm": 0.3866807520389557, + "learning_rate": 2.169584765303294e-06, + "loss": 1.0541, + "mean_token_accuracy": 0.7133968472480774, + "num_tokens": 8884927328.0, + "step": 17381 + }, + { + "epoch": 4.700378583017848, + "grad_norm": 0.7705142498016357, + "learning_rate": 2.169280305869269e-06, + "loss": 1.7442, + "mean_token_accuracy": 0.5983572006225586, + "num_tokens": 8885451605.0, + "step": 17382 + }, + { + "epoch": 4.700648999459167, + "grad_norm": 0.8307905793190002, + "learning_rate": 2.1689761173876937e-06, + "loss": 1.6849, + "mean_token_accuracy": 0.5975210070610046, + "num_tokens": 8885920993.0, + "step": 17383 + }, + { + "epoch": 4.700919415900486, + "grad_norm": 0.9304953813552856, + "learning_rate": 2.168672199867902e-06, + "loss": 1.7909, + "mean_token_accuracy": 0.5631232261657715, + "num_tokens": 8886445119.0, + "step": 17384 + }, + { + "epoch": 4.701189832341806, + "grad_norm": 0.7911476492881775, + "learning_rate": 2.168368553319219e-06, + "loss": 1.8829, + "mean_token_accuracy": 0.5659283399581909, + "num_tokens": 8886969395.0, + "step": 17385 + }, + { + "epoch": 4.701460248783126, + "grad_norm": 0.7445071935653687, + "learning_rate": 2.1680651777509593e-06, + "loss": 1.839, + "mean_token_accuracy": 0.5695099830627441, + "num_tokens": 8887493669.0, + "step": 17386 + }, + { + "epoch": 4.701730665224446, + "grad_norm": 0.7578336000442505, + "learning_rate": 2.167762073172434e-06, + "loss": 1.812, + "mean_token_accuracy": 0.5842808485031128, + "num_tokens": 8888017927.0, + "step": 17387 + }, + { + "epoch": 4.702001081665765, + "grad_norm": 0.7721182107925415, + "learning_rate": 2.1674592395929426e-06, + "loss": 1.8394, + "mean_token_accuracy": 0.5816892385482788, + "num_tokens": 8888518563.0, + "step": 17388 + }, + { + "epoch": 4.702271498107085, + "grad_norm": 0.8174242377281189, + "learning_rate": 2.1671566770217756e-06, + "loss": 1.8351, + "mean_token_accuracy": 0.5589969158172607, + "num_tokens": 8889042704.0, + "step": 17389 + }, + { + "epoch": 4.702541914548404, + "grad_norm": 0.8281282186508179, + "learning_rate": 2.1668543854682178e-06, + "loss": 1.7515, + "mean_token_accuracy": 0.5836883783340454, + "num_tokens": 8889566985.0, + "step": 17390 + }, + { + "epoch": 4.702812330989724, + "grad_norm": 0.8478181958198547, + "learning_rate": 2.1665523649415438e-06, + "loss": 1.8585, + "mean_token_accuracy": 0.5765261650085449, + "num_tokens": 8890091207.0, + "step": 17391 + }, + { + "epoch": 4.7030827474310435, + "grad_norm": 0.8216184973716736, + "learning_rate": 2.1662506154510205e-06, + "loss": 1.8501, + "mean_token_accuracy": 0.5654952526092529, + "num_tokens": 8890615410.0, + "step": 17392 + }, + { + "epoch": 4.703353163872364, + "grad_norm": 0.7699139714241028, + "learning_rate": 2.1659491370059073e-06, + "loss": 1.7505, + "mean_token_accuracy": 0.5979898571968079, + "num_tokens": 8891139662.0, + "step": 17393 + }, + { + "epoch": 4.703623580313683, + "grad_norm": 0.7831595540046692, + "learning_rate": 2.1656479296154535e-06, + "loss": 1.7814, + "mean_token_accuracy": 0.5992457866668701, + "num_tokens": 8891663936.0, + "step": 17394 + }, + { + "epoch": 4.703893996755003, + "grad_norm": 0.9112659096717834, + "learning_rate": 2.1653469932889013e-06, + "loss": 1.7428, + "mean_token_accuracy": 0.5935570001602173, + "num_tokens": 8892188143.0, + "step": 17395 + }, + { + "epoch": 4.704164413196322, + "grad_norm": 0.7360289096832275, + "learning_rate": 2.165046328035485e-06, + "loss": 1.7996, + "mean_token_accuracy": 0.5961942076683044, + "num_tokens": 8892712357.0, + "step": 17396 + }, + { + "epoch": 4.704434829637642, + "grad_norm": 0.7700235247612, + "learning_rate": 2.1647459338644283e-06, + "loss": 1.7597, + "mean_token_accuracy": 0.6028708219528198, + "num_tokens": 8893226780.0, + "step": 17397 + }, + { + "epoch": 4.704705246078961, + "grad_norm": 0.8265569806098938, + "learning_rate": 2.1644458107849504e-06, + "loss": 1.835, + "mean_token_accuracy": 0.5886447429656982, + "num_tokens": 8893751003.0, + "step": 17398 + }, + { + "epoch": 4.704975662520281, + "grad_norm": 0.8167188763618469, + "learning_rate": 2.164145958806258e-06, + "loss": 1.6187, + "mean_token_accuracy": 0.6292315721511841, + "num_tokens": 8894260492.0, + "step": 17399 + }, + { + "epoch": 4.705246078961601, + "grad_norm": 0.8927388191223145, + "learning_rate": 2.1638463779375515e-06, + "loss": 1.7739, + "mean_token_accuracy": 0.5937803983688354, + "num_tokens": 8894746071.0, + "step": 17400 + }, + { + "epoch": 4.705516495402921, + "grad_norm": 0.3848992586135864, + "learning_rate": 2.1635470681880254e-06, + "loss": 1.0735, + "mean_token_accuracy": 0.7163197994232178, + "num_tokens": 8895201343.0, + "step": 17401 + }, + { + "epoch": 4.70578691184424, + "grad_norm": 0.7750357389450073, + "learning_rate": 2.1632480295668602e-06, + "loss": 1.8454, + "mean_token_accuracy": 0.5582832098007202, + "num_tokens": 8895725521.0, + "step": 17402 + }, + { + "epoch": 4.70605732828556, + "grad_norm": 0.9035996794700623, + "learning_rate": 2.1629492620832323e-06, + "loss": 1.7787, + "mean_token_accuracy": 0.5881938934326172, + "num_tokens": 8896249598.0, + "step": 17403 + }, + { + "epoch": 4.706327744726879, + "grad_norm": 0.7593943476676941, + "learning_rate": 2.16265076574631e-06, + "loss": 1.7259, + "mean_token_accuracy": 0.599744439125061, + "num_tokens": 8896773552.0, + "step": 17404 + }, + { + "epoch": 4.706598161168199, + "grad_norm": 0.8011782169342041, + "learning_rate": 2.1623525405652497e-06, + "loss": 1.6967, + "mean_token_accuracy": 0.6019968390464783, + "num_tokens": 8897297811.0, + "step": 17405 + }, + { + "epoch": 4.7068685776095185, + "grad_norm": 0.792551577091217, + "learning_rate": 2.162054586549204e-06, + "loss": 1.7537, + "mean_token_accuracy": 0.5872148275375366, + "num_tokens": 8897822082.0, + "step": 17406 + }, + { + "epoch": 4.707138994050839, + "grad_norm": 0.8866752982139587, + "learning_rate": 2.1617569037073155e-06, + "loss": 1.8439, + "mean_token_accuracy": 0.5840949416160583, + "num_tokens": 8898346234.0, + "step": 17407 + }, + { + "epoch": 4.707409410492158, + "grad_norm": 0.8388269543647766, + "learning_rate": 2.161459492048716e-06, + "loss": 1.8174, + "mean_token_accuracy": 0.5751747488975525, + "num_tokens": 8898839704.0, + "step": 17408 + }, + { + "epoch": 4.707679826933478, + "grad_norm": 0.7983438372612, + "learning_rate": 2.1611623515825317e-06, + "loss": 1.8096, + "mean_token_accuracy": 0.5724836587905884, + "num_tokens": 8899363881.0, + "step": 17409 + }, + { + "epoch": 4.707950243374797, + "grad_norm": 0.792043924331665, + "learning_rate": 2.1608654823178795e-06, + "loss": 1.8545, + "mean_token_accuracy": 0.5636760592460632, + "num_tokens": 8899888118.0, + "step": 17410 + }, + { + "epoch": 4.708220659816117, + "grad_norm": 0.8011952042579651, + "learning_rate": 2.1605688842638693e-06, + "loss": 1.8642, + "mean_token_accuracy": 0.572268545627594, + "num_tokens": 8900412322.0, + "step": 17411 + }, + { + "epoch": 4.708491076257436, + "grad_norm": 0.7357062697410583, + "learning_rate": 2.1602725574296e-06, + "loss": 1.6709, + "mean_token_accuracy": 0.6089150905609131, + "num_tokens": 8900903002.0, + "step": 17412 + }, + { + "epoch": 4.708761492698756, + "grad_norm": 0.7987896800041199, + "learning_rate": 2.1599765018241654e-06, + "loss": 1.836, + "mean_token_accuracy": 0.5753763914108276, + "num_tokens": 8901427239.0, + "step": 17413 + }, + { + "epoch": 4.709031909140076, + "grad_norm": 0.7177736759185791, + "learning_rate": 2.1596807174566474e-06, + "loss": 1.6609, + "mean_token_accuracy": 0.6088043451309204, + "num_tokens": 8901951468.0, + "step": 17414 + }, + { + "epoch": 4.709302325581396, + "grad_norm": 0.9641590714454651, + "learning_rate": 2.159385204336123e-06, + "loss": 1.8925, + "mean_token_accuracy": 0.5637061595916748, + "num_tokens": 8902475658.0, + "step": 17415 + }, + { + "epoch": 4.709572742022715, + "grad_norm": 0.8242956399917603, + "learning_rate": 2.1590899624716597e-06, + "loss": 1.8006, + "mean_token_accuracy": 0.586400032043457, + "num_tokens": 8902960577.0, + "step": 17416 + }, + { + "epoch": 4.709843158464034, + "grad_norm": 0.8087083697319031, + "learning_rate": 2.158794991872315e-06, + "loss": 1.7541, + "mean_token_accuracy": 0.5946111679077148, + "num_tokens": 8903484808.0, + "step": 17417 + }, + { + "epoch": 4.710113574905354, + "grad_norm": 0.8453794717788696, + "learning_rate": 2.158500292547141e-06, + "loss": 1.8714, + "mean_token_accuracy": 0.5612598657608032, + "num_tokens": 8904009076.0, + "step": 17418 + }, + { + "epoch": 4.710383991346674, + "grad_norm": 0.884765088558197, + "learning_rate": 2.1582058645051777e-06, + "loss": 1.7529, + "mean_token_accuracy": 0.5804734230041504, + "num_tokens": 8904533130.0, + "step": 17419 + }, + { + "epoch": 4.7106544077879935, + "grad_norm": 1.093522071838379, + "learning_rate": 2.1579117077554627e-06, + "loss": 1.7963, + "mean_token_accuracy": 0.5725815296173096, + "num_tokens": 8905057413.0, + "step": 17420 + }, + { + "epoch": 4.710924824229313, + "grad_norm": 0.32917383313179016, + "learning_rate": 2.1576178223070176e-06, + "loss": 1.1082, + "mean_token_accuracy": 0.6943402290344238, + "num_tokens": 8905581576.0, + "step": 17421 + }, + { + "epoch": 4.711195240670633, + "grad_norm": 0.9930173754692078, + "learning_rate": 2.157324208168862e-06, + "loss": 1.86, + "mean_token_accuracy": 0.5925432443618774, + "num_tokens": 8906045295.0, + "step": 17422 + }, + { + "epoch": 4.711465657111953, + "grad_norm": 0.8763525485992432, + "learning_rate": 2.1570308653500045e-06, + "loss": 1.7154, + "mean_token_accuracy": 0.6014245748519897, + "num_tokens": 8906569483.0, + "step": 17423 + }, + { + "epoch": 4.711736073553272, + "grad_norm": 0.8735249638557434, + "learning_rate": 2.156737793859446e-06, + "loss": 1.8887, + "mean_token_accuracy": 0.5602850914001465, + "num_tokens": 8907058545.0, + "step": 17424 + }, + { + "epoch": 4.712006489994591, + "grad_norm": 0.8789516687393188, + "learning_rate": 2.1564449937061784e-06, + "loss": 1.7812, + "mean_token_accuracy": 0.5766300559043884, + "num_tokens": 8907582707.0, + "step": 17425 + }, + { + "epoch": 4.712276906435911, + "grad_norm": 0.769989550113678, + "learning_rate": 2.1561524648991836e-06, + "loss": 1.7909, + "mean_token_accuracy": 0.5915015935897827, + "num_tokens": 8908106962.0, + "step": 17426 + }, + { + "epoch": 4.7125473228772305, + "grad_norm": 0.8300051093101501, + "learning_rate": 2.1558602074474415e-06, + "loss": 1.8243, + "mean_token_accuracy": 0.5867455005645752, + "num_tokens": 8908615557.0, + "step": 17427 + }, + { + "epoch": 4.712817739318551, + "grad_norm": 0.8686535954475403, + "learning_rate": 2.155568221359917e-06, + "loss": 1.7492, + "mean_token_accuracy": 0.5984372496604919, + "num_tokens": 8909106711.0, + "step": 17428 + }, + { + "epoch": 4.71308815575987, + "grad_norm": 0.9253262877464294, + "learning_rate": 2.155276506645568e-06, + "loss": 1.8919, + "mean_token_accuracy": 0.5658340454101562, + "num_tokens": 8909593396.0, + "step": 17429 + }, + { + "epoch": 4.71335857220119, + "grad_norm": 0.7445539832115173, + "learning_rate": 2.154985063313347e-06, + "loss": 1.8233, + "mean_token_accuracy": 0.5666896104812622, + "num_tokens": 8910117499.0, + "step": 17430 + }, + { + "epoch": 4.713628988642509, + "grad_norm": 0.7643265724182129, + "learning_rate": 2.154693891372196e-06, + "loss": 1.6272, + "mean_token_accuracy": 0.6111282110214233, + "num_tokens": 8910641757.0, + "step": 17431 + }, + { + "epoch": 4.713899405083829, + "grad_norm": 0.8550435304641724, + "learning_rate": 2.1544029908310483e-06, + "loss": 1.8384, + "mean_token_accuracy": 0.5710474848747253, + "num_tokens": 8911158481.0, + "step": 17432 + }, + { + "epoch": 4.714169821525148, + "grad_norm": 0.9362114071846008, + "learning_rate": 2.1541123616988302e-06, + "loss": 1.8961, + "mean_token_accuracy": 0.5665360689163208, + "num_tokens": 8911644889.0, + "step": 17433 + }, + { + "epoch": 4.7144402379664685, + "grad_norm": 0.9179481267929077, + "learning_rate": 2.1538220039844593e-06, + "loss": 1.8113, + "mean_token_accuracy": 0.5695071220397949, + "num_tokens": 8912169073.0, + "step": 17434 + }, + { + "epoch": 4.714710654407788, + "grad_norm": 0.8176301121711731, + "learning_rate": 2.1535319176968433e-06, + "loss": 1.8252, + "mean_token_accuracy": 0.572270393371582, + "num_tokens": 8912691007.0, + "step": 17435 + }, + { + "epoch": 4.714981070849108, + "grad_norm": 0.863871693611145, + "learning_rate": 2.153242102844884e-06, + "loss": 1.9435, + "mean_token_accuracy": 0.540459156036377, + "num_tokens": 8913215233.0, + "step": 17436 + }, + { + "epoch": 4.715251487290427, + "grad_norm": 0.7919296622276306, + "learning_rate": 2.152952559437475e-06, + "loss": 1.7838, + "mean_token_accuracy": 0.5740610361099243, + "num_tokens": 8913722404.0, + "step": 17437 + }, + { + "epoch": 4.715521903731747, + "grad_norm": 0.8073484897613525, + "learning_rate": 2.1526632874834975e-06, + "loss": 1.8175, + "mean_token_accuracy": 0.581505298614502, + "num_tokens": 8914246520.0, + "step": 17438 + }, + { + "epoch": 4.715792320173066, + "grad_norm": 0.7909780740737915, + "learning_rate": 2.1523742869918286e-06, + "loss": 1.8836, + "mean_token_accuracy": 0.5819735527038574, + "num_tokens": 8914707923.0, + "step": 17439 + }, + { + "epoch": 4.716062736614386, + "grad_norm": 0.748016357421875, + "learning_rate": 2.1520855579713364e-06, + "loss": 1.839, + "mean_token_accuracy": 0.5764697194099426, + "num_tokens": 8915232201.0, + "step": 17440 + }, + { + "epoch": 4.7163331530557056, + "grad_norm": 0.336825430393219, + "learning_rate": 2.1517971004308797e-06, + "loss": 1.0669, + "mean_token_accuracy": 0.7093240022659302, + "num_tokens": 8915756363.0, + "step": 17441 + }, + { + "epoch": 4.716603569497026, + "grad_norm": 0.742031455039978, + "learning_rate": 2.151508914379308e-06, + "loss": 1.7718, + "mean_token_accuracy": 0.5808284282684326, + "num_tokens": 8916280526.0, + "step": 17442 + }, + { + "epoch": 4.716873985938345, + "grad_norm": 0.923042893409729, + "learning_rate": 2.1512209998254653e-06, + "loss": 1.9188, + "mean_token_accuracy": 0.5523374080657959, + "num_tokens": 8916804703.0, + "step": 17443 + }, + { + "epoch": 4.717144402379665, + "grad_norm": 0.9346389770507812, + "learning_rate": 2.1509333567781843e-06, + "loss": 1.8351, + "mean_token_accuracy": 0.5804669857025146, + "num_tokens": 8917328718.0, + "step": 17444 + }, + { + "epoch": 4.717414818820984, + "grad_norm": 0.849348247051239, + "learning_rate": 2.1506459852462914e-06, + "loss": 1.8949, + "mean_token_accuracy": 0.5515751242637634, + "num_tokens": 8917852922.0, + "step": 17445 + }, + { + "epoch": 4.717685235262304, + "grad_norm": 0.7943314909934998, + "learning_rate": 2.150358885238604e-06, + "loss": 1.7717, + "mean_token_accuracy": 0.5915459394454956, + "num_tokens": 8918376926.0, + "step": 17446 + }, + { + "epoch": 4.717955651703623, + "grad_norm": 0.8323469161987305, + "learning_rate": 2.150072056763931e-06, + "loss": 1.7542, + "mean_token_accuracy": 0.5723164081573486, + "num_tokens": 8918873822.0, + "step": 17447 + }, + { + "epoch": 4.7182260681449435, + "grad_norm": 0.7511265873908997, + "learning_rate": 2.149785499831073e-06, + "loss": 1.759, + "mean_token_accuracy": 0.5954598188400269, + "num_tokens": 8919376999.0, + "step": 17448 + }, + { + "epoch": 4.718496484586263, + "grad_norm": 0.7569037079811096, + "learning_rate": 2.1494992144488227e-06, + "loss": 1.8029, + "mean_token_accuracy": 0.5829355716705322, + "num_tokens": 8919901274.0, + "step": 17449 + }, + { + "epoch": 4.718766901027583, + "grad_norm": 0.8564164042472839, + "learning_rate": 2.1492132006259647e-06, + "loss": 1.7786, + "mean_token_accuracy": 0.5841009616851807, + "num_tokens": 8920425404.0, + "step": 17450 + }, + { + "epoch": 4.719037317468902, + "grad_norm": 0.8651062846183777, + "learning_rate": 2.148927458371274e-06, + "loss": 1.7533, + "mean_token_accuracy": 0.5881913304328918, + "num_tokens": 8920949685.0, + "step": 17451 + }, + { + "epoch": 4.719307733910222, + "grad_norm": 0.9080533385276794, + "learning_rate": 2.1486419876935176e-06, + "loss": 1.7713, + "mean_token_accuracy": 0.5797202587127686, + "num_tokens": 8921473770.0, + "step": 17452 + }, + { + "epoch": 4.719578150351541, + "grad_norm": 0.8813837170600891, + "learning_rate": 2.1483567886014556e-06, + "loss": 1.8511, + "mean_token_accuracy": 0.5674062371253967, + "num_tokens": 8921998041.0, + "step": 17453 + }, + { + "epoch": 4.719848566792861, + "grad_norm": 0.7986650466918945, + "learning_rate": 2.148071861103839e-06, + "loss": 1.8112, + "mean_token_accuracy": 0.5784143209457397, + "num_tokens": 8922522272.0, + "step": 17454 + }, + { + "epoch": 4.7201189832341806, + "grad_norm": 0.8502673506736755, + "learning_rate": 2.1477872052094082e-06, + "loss": 1.8648, + "mean_token_accuracy": 0.5573659539222717, + "num_tokens": 8923046333.0, + "step": 17455 + }, + { + "epoch": 4.720389399675501, + "grad_norm": 0.7346048951148987, + "learning_rate": 2.1475028209268995e-06, + "loss": 1.7416, + "mean_token_accuracy": 0.5790346264839172, + "num_tokens": 8923570472.0, + "step": 17456 + }, + { + "epoch": 4.72065981611682, + "grad_norm": 0.8386015892028809, + "learning_rate": 2.1472187082650375e-06, + "loss": 1.6583, + "mean_token_accuracy": 0.586066722869873, + "num_tokens": 8924094747.0, + "step": 17457 + }, + { + "epoch": 4.720930232558139, + "grad_norm": 0.9283586144447327, + "learning_rate": 2.1469348672325394e-06, + "loss": 1.8488, + "mean_token_accuracy": 0.5726222395896912, + "num_tokens": 8924618986.0, + "step": 17458 + }, + { + "epoch": 4.721200648999459, + "grad_norm": 0.8656243085861206, + "learning_rate": 2.1466512978381158e-06, + "loss": 1.8063, + "mean_token_accuracy": 0.5840232372283936, + "num_tokens": 8925084710.0, + "step": 17459 + }, + { + "epoch": 4.721471065440779, + "grad_norm": 0.8290097713470459, + "learning_rate": 2.146368000090466e-06, + "loss": 1.8645, + "mean_token_accuracy": 0.5845520496368408, + "num_tokens": 8925606664.0, + "step": 17460 + }, + { + "epoch": 4.721741481882098, + "grad_norm": 0.3519701361656189, + "learning_rate": 2.1460849739982822e-06, + "loss": 1.0715, + "mean_token_accuracy": 0.705929696559906, + "num_tokens": 8926130905.0, + "step": 17461 + }, + { + "epoch": 4.722011898323418, + "grad_norm": 0.8994888663291931, + "learning_rate": 2.145802219570249e-06, + "loss": 1.7149, + "mean_token_accuracy": 0.599374532699585, + "num_tokens": 8926592161.0, + "step": 17462 + }, + { + "epoch": 4.722282314764738, + "grad_norm": 0.9717890024185181, + "learning_rate": 2.1455197368150427e-06, + "loss": 1.9668, + "mean_token_accuracy": 0.5655806064605713, + "num_tokens": 8927115167.0, + "step": 17463 + }, + { + "epoch": 4.722552731206058, + "grad_norm": 0.8736435770988464, + "learning_rate": 2.1452375257413303e-06, + "loss": 1.8988, + "mean_token_accuracy": 0.5606368780136108, + "num_tokens": 8927639237.0, + "step": 17464 + }, + { + "epoch": 4.722823147647377, + "grad_norm": 0.800582230091095, + "learning_rate": 2.144955586357771e-06, + "loss": 1.8392, + "mean_token_accuracy": 0.5633306503295898, + "num_tokens": 8928163497.0, + "step": 17465 + }, + { + "epoch": 4.723093564088696, + "grad_norm": 0.797515332698822, + "learning_rate": 2.144673918673014e-06, + "loss": 1.8649, + "mean_token_accuracy": 0.5770102143287659, + "num_tokens": 8928687703.0, + "step": 17466 + }, + { + "epoch": 4.723363980530016, + "grad_norm": 0.8163972496986389, + "learning_rate": 2.1443925226957045e-06, + "loss": 1.812, + "mean_token_accuracy": 0.5912503600120544, + "num_tokens": 8929211895.0, + "step": 17467 + }, + { + "epoch": 4.7236343969713355, + "grad_norm": 0.9311155676841736, + "learning_rate": 2.1441113984344745e-06, + "loss": 1.7669, + "mean_token_accuracy": 0.5999760627746582, + "num_tokens": 8929736181.0, + "step": 17468 + }, + { + "epoch": 4.723904813412656, + "grad_norm": 0.9142654538154602, + "learning_rate": 2.14383054589795e-06, + "loss": 1.7656, + "mean_token_accuracy": 0.5959542393684387, + "num_tokens": 8930260448.0, + "step": 17469 + }, + { + "epoch": 4.724175229853975, + "grad_norm": 0.9315569400787354, + "learning_rate": 2.1435499650947497e-06, + "loss": 1.7902, + "mean_token_accuracy": 0.5689499378204346, + "num_tokens": 8930784626.0, + "step": 17470 + }, + { + "epoch": 4.724445646295295, + "grad_norm": 0.8645381927490234, + "learning_rate": 2.14326965603348e-06, + "loss": 1.8309, + "mean_token_accuracy": 0.5729498267173767, + "num_tokens": 8931308717.0, + "step": 17471 + }, + { + "epoch": 4.724716062736614, + "grad_norm": 0.9183719158172607, + "learning_rate": 2.142989618722744e-06, + "loss": 1.8417, + "mean_token_accuracy": 0.5900596976280212, + "num_tokens": 8931759380.0, + "step": 17472 + }, + { + "epoch": 4.724986479177934, + "grad_norm": 0.8977745771408081, + "learning_rate": 2.1427098531711336e-06, + "loss": 1.8129, + "mean_token_accuracy": 0.579268217086792, + "num_tokens": 8932237760.0, + "step": 17473 + }, + { + "epoch": 4.725256895619253, + "grad_norm": 0.8315147757530212, + "learning_rate": 2.142430359387233e-06, + "loss": 1.8734, + "mean_token_accuracy": 0.5842130184173584, + "num_tokens": 8932714439.0, + "step": 17474 + }, + { + "epoch": 4.725527312060573, + "grad_norm": 0.9372081160545349, + "learning_rate": 2.1421511373796157e-06, + "loss": 1.8662, + "mean_token_accuracy": 0.5836676955223083, + "num_tokens": 8933238707.0, + "step": 17475 + }, + { + "epoch": 4.725797728501893, + "grad_norm": 0.8264710307121277, + "learning_rate": 2.141872187156851e-06, + "loss": 1.8596, + "mean_token_accuracy": 0.5682357549667358, + "num_tokens": 8933762886.0, + "step": 17476 + }, + { + "epoch": 4.726068144943213, + "grad_norm": 0.8777886033058167, + "learning_rate": 2.1415935087274994e-06, + "loss": 1.7896, + "mean_token_accuracy": 0.5973602533340454, + "num_tokens": 8934207310.0, + "step": 17477 + }, + { + "epoch": 4.726338561384532, + "grad_norm": 0.8509156703948975, + "learning_rate": 2.141315102100107e-06, + "loss": 1.8951, + "mean_token_accuracy": 0.5777269005775452, + "num_tokens": 8934713802.0, + "step": 17478 + }, + { + "epoch": 4.726608977825852, + "grad_norm": 0.8339352607727051, + "learning_rate": 2.1410369672832214e-06, + "loss": 1.913, + "mean_token_accuracy": 0.5550447702407837, + "num_tokens": 8935234180.0, + "step": 17479 + }, + { + "epoch": 4.726879394267171, + "grad_norm": 0.7588469982147217, + "learning_rate": 2.1407591042853737e-06, + "loss": 1.6672, + "mean_token_accuracy": 0.6119554042816162, + "num_tokens": 8935758465.0, + "step": 17480 + }, + { + "epoch": 4.727149810708491, + "grad_norm": 0.3438880145549774, + "learning_rate": 2.1404815131150896e-06, + "loss": 1.0351, + "mean_token_accuracy": 0.7223454117774963, + "num_tokens": 8936282676.0, + "step": 17481 + }, + { + "epoch": 4.7274202271498105, + "grad_norm": 0.8714901208877563, + "learning_rate": 2.140204193780887e-06, + "loss": 1.8955, + "mean_token_accuracy": 0.550869345664978, + "num_tokens": 8936806808.0, + "step": 17482 + }, + { + "epoch": 4.727690643591131, + "grad_norm": 0.9016286730766296, + "learning_rate": 2.1399271462912744e-06, + "loss": 1.8315, + "mean_token_accuracy": 0.5860577821731567, + "num_tokens": 8937330984.0, + "step": 17483 + }, + { + "epoch": 4.72796106003245, + "grad_norm": 0.8448923826217651, + "learning_rate": 2.139650370654753e-06, + "loss": 1.8738, + "mean_token_accuracy": 0.5739867091178894, + "num_tokens": 8937801441.0, + "step": 17484 + }, + { + "epoch": 4.72823147647377, + "grad_norm": 0.8560856580734253, + "learning_rate": 2.1393738668798153e-06, + "loss": 1.9698, + "mean_token_accuracy": 0.5317462682723999, + "num_tokens": 8938325504.0, + "step": 17485 + }, + { + "epoch": 4.728501892915089, + "grad_norm": 0.9897958636283875, + "learning_rate": 2.1390976349749435e-06, + "loss": 1.7511, + "mean_token_accuracy": 0.5952308177947998, + "num_tokens": 8938822594.0, + "step": 17486 + }, + { + "epoch": 4.728772309356409, + "grad_norm": 0.9200230836868286, + "learning_rate": 2.1388216749486153e-06, + "loss": 1.7851, + "mean_token_accuracy": 0.5921363830566406, + "num_tokens": 8939335519.0, + "step": 17487 + }, + { + "epoch": 4.729042725797728, + "grad_norm": 0.8240768313407898, + "learning_rate": 2.1385459868092977e-06, + "loss": 1.7612, + "mean_token_accuracy": 0.6009516716003418, + "num_tokens": 8939799436.0, + "step": 17488 + }, + { + "epoch": 4.729313142239048, + "grad_norm": 0.9431432485580444, + "learning_rate": 2.1382705705654475e-06, + "loss": 1.837, + "mean_token_accuracy": 0.5721020698547363, + "num_tokens": 8940323604.0, + "step": 17489 + }, + { + "epoch": 4.729583558680368, + "grad_norm": 0.9070023894309998, + "learning_rate": 2.1379954262255177e-06, + "loss": 1.8547, + "mean_token_accuracy": 0.5730232000350952, + "num_tokens": 8940847757.0, + "step": 17490 + }, + { + "epoch": 4.729853975121688, + "grad_norm": 0.9603202939033508, + "learning_rate": 2.1377205537979497e-06, + "loss": 1.8011, + "mean_token_accuracy": 0.5798778533935547, + "num_tokens": 8941372004.0, + "step": 17491 + }, + { + "epoch": 4.730124391563007, + "grad_norm": 0.8829556107521057, + "learning_rate": 2.1374459532911775e-06, + "loss": 1.7064, + "mean_token_accuracy": 0.6014014482498169, + "num_tokens": 8941855091.0, + "step": 17492 + }, + { + "epoch": 4.730394808004327, + "grad_norm": 0.7832958698272705, + "learning_rate": 2.137171624713626e-06, + "loss": 1.7469, + "mean_token_accuracy": 0.5754608511924744, + "num_tokens": 8942379365.0, + "step": 17493 + }, + { + "epoch": 4.730665224445646, + "grad_norm": 0.7206647396087646, + "learning_rate": 2.1368975680737146e-06, + "loss": 1.8133, + "mean_token_accuracy": 0.5798850059509277, + "num_tokens": 8942903514.0, + "step": 17494 + }, + { + "epoch": 4.730935640886966, + "grad_norm": 0.8178688883781433, + "learning_rate": 2.1366237833798485e-06, + "loss": 1.8399, + "mean_token_accuracy": 0.5645773410797119, + "num_tokens": 8943427720.0, + "step": 17495 + }, + { + "epoch": 4.7312060573282855, + "grad_norm": 0.8828521370887756, + "learning_rate": 2.1363502706404313e-06, + "loss": 1.6893, + "mean_token_accuracy": 0.5757608413696289, + "num_tokens": 8943951902.0, + "step": 17496 + }, + { + "epoch": 4.731476473769606, + "grad_norm": 0.7528545260429382, + "learning_rate": 2.1360770298638535e-06, + "loss": 1.9139, + "mean_token_accuracy": 0.5561292767524719, + "num_tokens": 8944476161.0, + "step": 17497 + }, + { + "epoch": 4.731746890210925, + "grad_norm": 0.948274552822113, + "learning_rate": 2.1358040610585e-06, + "loss": 1.8739, + "mean_token_accuracy": 0.5558276772499084, + "num_tokens": 8945000423.0, + "step": 17498 + }, + { + "epoch": 4.732017306652244, + "grad_norm": 0.9029322266578674, + "learning_rate": 2.1355313642327456e-06, + "loss": 1.9669, + "mean_token_accuracy": 0.5634100437164307, + "num_tokens": 8945496127.0, + "step": 17499 + }, + { + "epoch": 4.732287723093564, + "grad_norm": 1.245437741279602, + "learning_rate": 2.1352589393949583e-06, + "loss": 1.822, + "mean_token_accuracy": 0.5736995935440063, + "num_tokens": 8945979639.0, + "step": 17500 + }, + { + "epoch": 4.732558139534884, + "grad_norm": 0.36531734466552734, + "learning_rate": 2.1349867865534945e-06, + "loss": 1.0938, + "mean_token_accuracy": 0.6958726644515991, + "num_tokens": 8946498053.0, + "step": 17501 + }, + { + "epoch": 4.732828555976203, + "grad_norm": 0.8222274780273438, + "learning_rate": 2.1347149057167083e-06, + "loss": 1.6621, + "mean_token_accuracy": 0.6443321704864502, + "num_tokens": 8946957170.0, + "step": 17502 + }, + { + "epoch": 4.7330989724175225, + "grad_norm": 0.8275098204612732, + "learning_rate": 2.1344432968929385e-06, + "loss": 1.8544, + "mean_token_accuracy": 0.5820194482803345, + "num_tokens": 8947481452.0, + "step": 17503 + }, + { + "epoch": 4.733369388858843, + "grad_norm": 0.8340874314308167, + "learning_rate": 2.134171960090521e-06, + "loss": 1.7721, + "mean_token_accuracy": 0.5918365716934204, + "num_tokens": 8948005732.0, + "step": 17504 + }, + { + "epoch": 4.733639805300163, + "grad_norm": 0.923922061920166, + "learning_rate": 2.13390089531778e-06, + "loss": 1.8633, + "mean_token_accuracy": 0.5726061463356018, + "num_tokens": 8948529823.0, + "step": 17505 + }, + { + "epoch": 4.733910221741482, + "grad_norm": 0.8360071182250977, + "learning_rate": 2.1336301025830335e-06, + "loss": 1.8068, + "mean_token_accuracy": 0.5830159187316895, + "num_tokens": 8949053923.0, + "step": 17506 + }, + { + "epoch": 4.734180638182801, + "grad_norm": 0.9226680994033813, + "learning_rate": 2.133359581894589e-06, + "loss": 1.8375, + "mean_token_accuracy": 0.5915975570678711, + "num_tokens": 8949565579.0, + "step": 17507 + }, + { + "epoch": 4.734451054624121, + "grad_norm": 0.8128560185432434, + "learning_rate": 2.133089333260749e-06, + "loss": 1.7209, + "mean_token_accuracy": 0.5812878608703613, + "num_tokens": 8950089717.0, + "step": 17508 + }, + { + "epoch": 4.734721471065441, + "grad_norm": 0.7973207235336304, + "learning_rate": 2.132819356689803e-06, + "loss": 1.8215, + "mean_token_accuracy": 0.5836631059646606, + "num_tokens": 8950581853.0, + "step": 17509 + }, + { + "epoch": 4.7349918875067605, + "grad_norm": 0.9303510785102844, + "learning_rate": 2.1325496521900365e-06, + "loss": 1.7506, + "mean_token_accuracy": 0.5741819143295288, + "num_tokens": 8951106127.0, + "step": 17510 + }, + { + "epoch": 4.73526230394808, + "grad_norm": 0.8515337109565735, + "learning_rate": 2.132280219769724e-06, + "loss": 1.7638, + "mean_token_accuracy": 0.5866792798042297, + "num_tokens": 8951624254.0, + "step": 17511 + }, + { + "epoch": 4.7355327203894, + "grad_norm": 0.8156589865684509, + "learning_rate": 2.1320110594371318e-06, + "loss": 1.8468, + "mean_token_accuracy": 0.577303409576416, + "num_tokens": 8952148524.0, + "step": 17512 + }, + { + "epoch": 4.735803136830719, + "grad_norm": 0.8098064661026001, + "learning_rate": 2.1317421712005203e-06, + "loss": 1.8293, + "mean_token_accuracy": 0.571456789970398, + "num_tokens": 8952672515.0, + "step": 17513 + }, + { + "epoch": 4.736073553272039, + "grad_norm": 1.0135763883590698, + "learning_rate": 2.1314735550681393e-06, + "loss": 1.7309, + "mean_token_accuracy": 0.5882002115249634, + "num_tokens": 8953196700.0, + "step": 17514 + }, + { + "epoch": 4.736343969713358, + "grad_norm": 1.0409079790115356, + "learning_rate": 2.1312052110482283e-06, + "loss": 1.8462, + "mean_token_accuracy": 0.5822612643241882, + "num_tokens": 8953670943.0, + "step": 17515 + }, + { + "epoch": 4.736614386154678, + "grad_norm": 0.8137925863265991, + "learning_rate": 2.130937139149025e-06, + "loss": 1.743, + "mean_token_accuracy": 0.5772106647491455, + "num_tokens": 8954195068.0, + "step": 17516 + }, + { + "epoch": 4.7368848025959975, + "grad_norm": 0.7735716104507446, + "learning_rate": 2.1306693393787513e-06, + "loss": 1.7773, + "mean_token_accuracy": 0.591813325881958, + "num_tokens": 8954719199.0, + "step": 17517 + }, + { + "epoch": 4.737155219037318, + "grad_norm": 0.7142947912216187, + "learning_rate": 2.1304018117456248e-06, + "loss": 1.8223, + "mean_token_accuracy": 0.578951358795166, + "num_tokens": 8955241769.0, + "step": 17518 + }, + { + "epoch": 4.737425635478637, + "grad_norm": 0.8523935675621033, + "learning_rate": 2.1301345562578556e-06, + "loss": 1.8866, + "mean_token_accuracy": 0.5600560307502747, + "num_tokens": 8955766039.0, + "step": 17519 + }, + { + "epoch": 4.737696051919957, + "grad_norm": 0.8480073809623718, + "learning_rate": 2.129867572923642e-06, + "loss": 1.8462, + "mean_token_accuracy": 0.569486141204834, + "num_tokens": 8956290324.0, + "step": 17520 + }, + { + "epoch": 4.737966468361276, + "grad_norm": 0.3667411208152771, + "learning_rate": 2.1296008617511767e-06, + "loss": 1.0347, + "mean_token_accuracy": 0.7231835126876831, + "num_tokens": 8956770726.0, + "step": 17521 + }, + { + "epoch": 4.738236884802596, + "grad_norm": 0.8904959559440613, + "learning_rate": 2.1293344227486435e-06, + "loss": 1.8371, + "mean_token_accuracy": 0.5850107669830322, + "num_tokens": 8957294997.0, + "step": 17522 + }, + { + "epoch": 4.738507301243915, + "grad_norm": 0.8329774141311646, + "learning_rate": 2.1290682559242178e-06, + "loss": 1.7466, + "mean_token_accuracy": 0.5898798704147339, + "num_tokens": 8957819275.0, + "step": 17523 + }, + { + "epoch": 4.7387777176852355, + "grad_norm": 0.8689495325088501, + "learning_rate": 2.1288023612860645e-06, + "loss": 1.8152, + "mean_token_accuracy": 0.5509072542190552, + "num_tokens": 8958343471.0, + "step": 17524 + }, + { + "epoch": 4.739048134126555, + "grad_norm": 0.8513372540473938, + "learning_rate": 2.1285367388423443e-06, + "loss": 1.7951, + "mean_token_accuracy": 0.5860847234725952, + "num_tokens": 8958867747.0, + "step": 17525 + }, + { + "epoch": 4.739318550567875, + "grad_norm": 0.8762786388397217, + "learning_rate": 2.1282713886012053e-06, + "loss": 1.7629, + "mean_token_accuracy": 0.5892854928970337, + "num_tokens": 8959349916.0, + "step": 17526 + }, + { + "epoch": 4.739588967009194, + "grad_norm": 0.8014681339263916, + "learning_rate": 2.1280063105707905e-06, + "loss": 1.857, + "mean_token_accuracy": 0.5684864521026611, + "num_tokens": 8959874142.0, + "step": 17527 + }, + { + "epoch": 4.739859383450514, + "grad_norm": 0.8931770324707031, + "learning_rate": 2.127741504759233e-06, + "loss": 1.8444, + "mean_token_accuracy": 0.5606281757354736, + "num_tokens": 8960398372.0, + "step": 17528 + }, + { + "epoch": 4.740129799891833, + "grad_norm": 0.9708718657493591, + "learning_rate": 2.1274769711746577e-06, + "loss": 1.873, + "mean_token_accuracy": 0.5524954199790955, + "num_tokens": 8960922578.0, + "step": 17529 + }, + { + "epoch": 4.740400216333153, + "grad_norm": 0.9530925154685974, + "learning_rate": 2.1272127098251816e-06, + "loss": 1.6895, + "mean_token_accuracy": 0.5901159644126892, + "num_tokens": 8961355539.0, + "step": 17530 + }, + { + "epoch": 4.7406706327744725, + "grad_norm": 0.8027585744857788, + "learning_rate": 2.1269487207189125e-06, + "loss": 1.7963, + "mean_token_accuracy": 0.6014992594718933, + "num_tokens": 8961826700.0, + "step": 17531 + }, + { + "epoch": 4.740941049215793, + "grad_norm": 0.8475282788276672, + "learning_rate": 2.12668500386395e-06, + "loss": 1.8607, + "mean_token_accuracy": 0.55516517162323, + "num_tokens": 8962350977.0, + "step": 17532 + }, + { + "epoch": 4.741211465657112, + "grad_norm": 0.925320565700531, + "learning_rate": 2.1264215592683867e-06, + "loss": 1.8089, + "mean_token_accuracy": 0.5952427983283997, + "num_tokens": 8962816434.0, + "step": 17533 + }, + { + "epoch": 4.741481882098432, + "grad_norm": 0.867283046245575, + "learning_rate": 2.1261583869403058e-06, + "loss": 1.8493, + "mean_token_accuracy": 0.5770671367645264, + "num_tokens": 8963327364.0, + "step": 17534 + }, + { + "epoch": 4.741752298539751, + "grad_norm": 0.8375946879386902, + "learning_rate": 2.1258954868877805e-06, + "loss": 1.7416, + "mean_token_accuracy": 0.6046066284179688, + "num_tokens": 8963792398.0, + "step": 17535 + }, + { + "epoch": 4.742022714981071, + "grad_norm": 0.8254072666168213, + "learning_rate": 2.1256328591188805e-06, + "loss": 1.8958, + "mean_token_accuracy": 0.5796763896942139, + "num_tokens": 8964316579.0, + "step": 17536 + }, + { + "epoch": 4.74229313142239, + "grad_norm": 0.8109263777732849, + "learning_rate": 2.1253705036416616e-06, + "loss": 1.6942, + "mean_token_accuracy": 0.5948268175125122, + "num_tokens": 8964840771.0, + "step": 17537 + }, + { + "epoch": 4.7425635478637105, + "grad_norm": 0.8884682059288025, + "learning_rate": 2.1251084204641735e-06, + "loss": 1.7502, + "mean_token_accuracy": 0.585211992263794, + "num_tokens": 8965364949.0, + "step": 17538 + }, + { + "epoch": 4.74283396430503, + "grad_norm": 0.8149519562721252, + "learning_rate": 2.1248466095944595e-06, + "loss": 1.8122, + "mean_token_accuracy": 0.5902818441390991, + "num_tokens": 8965889209.0, + "step": 17539 + }, + { + "epoch": 4.743104380746349, + "grad_norm": 0.8395898938179016, + "learning_rate": 2.124585071040552e-06, + "loss": 1.8291, + "mean_token_accuracy": 0.5646927356719971, + "num_tokens": 8966413418.0, + "step": 17540 + }, + { + "epoch": 4.743374797187669, + "grad_norm": 0.35695937275886536, + "learning_rate": 2.1243238048104743e-06, + "loss": 1.0527, + "mean_token_accuracy": 0.7086837887763977, + "num_tokens": 8966937620.0, + "step": 17541 + }, + { + "epoch": 4.743645213628989, + "grad_norm": 0.8185943961143494, + "learning_rate": 2.1240628109122436e-06, + "loss": 1.7904, + "mean_token_accuracy": 0.5994716882705688, + "num_tokens": 8967395863.0, + "step": 17542 + }, + { + "epoch": 4.743915630070308, + "grad_norm": 0.8072099089622498, + "learning_rate": 2.1238020893538698e-06, + "loss": 1.8224, + "mean_token_accuracy": 0.58033287525177, + "num_tokens": 8967920138.0, + "step": 17543 + }, + { + "epoch": 4.7441860465116275, + "grad_norm": 0.8209421634674072, + "learning_rate": 2.1235416401433497e-06, + "loss": 1.7401, + "mean_token_accuracy": 0.5932735204696655, + "num_tokens": 8968444235.0, + "step": 17544 + }, + { + "epoch": 4.7444564629529475, + "grad_norm": 0.8663823008537292, + "learning_rate": 2.1232814632886767e-06, + "loss": 1.9269, + "mean_token_accuracy": 0.5534837245941162, + "num_tokens": 8968968470.0, + "step": 17545 + }, + { + "epoch": 4.744726879394268, + "grad_norm": 0.8002406358718872, + "learning_rate": 2.1230215587978325e-06, + "loss": 1.9301, + "mean_token_accuracy": 0.5658958554267883, + "num_tokens": 8969492656.0, + "step": 17546 + }, + { + "epoch": 4.744997295835587, + "grad_norm": 0.7707324624061584, + "learning_rate": 2.1227619266787936e-06, + "loss": 1.8303, + "mean_token_accuracy": 0.5687346458435059, + "num_tokens": 8970016928.0, + "step": 17547 + }, + { + "epoch": 4.745267712276906, + "grad_norm": 0.8066577911376953, + "learning_rate": 2.1225025669395232e-06, + "loss": 1.8578, + "mean_token_accuracy": 0.5626252889633179, + "num_tokens": 8970541129.0, + "step": 17548 + }, + { + "epoch": 4.745538128718226, + "grad_norm": 0.8573999404907227, + "learning_rate": 2.1222434795879836e-06, + "loss": 1.7252, + "mean_token_accuracy": 0.6034843921661377, + "num_tokens": 8970999745.0, + "step": 17549 + }, + { + "epoch": 4.745808545159546, + "grad_norm": 0.7946937084197998, + "learning_rate": 2.1219846646321195e-06, + "loss": 1.8374, + "mean_token_accuracy": 0.582539439201355, + "num_tokens": 8971473629.0, + "step": 17550 + }, + { + "epoch": 4.746078961600865, + "grad_norm": 0.9223810434341431, + "learning_rate": 2.1217261220798753e-06, + "loss": 1.7858, + "mean_token_accuracy": 0.5874305963516235, + "num_tokens": 8971997910.0, + "step": 17551 + }, + { + "epoch": 4.746349378042185, + "grad_norm": 0.9220787882804871, + "learning_rate": 2.1214678519391837e-06, + "loss": 1.8016, + "mean_token_accuracy": 0.5928279757499695, + "num_tokens": 8972462689.0, + "step": 17552 + }, + { + "epoch": 4.746619794483505, + "grad_norm": 0.8304014205932617, + "learning_rate": 2.1212098542179675e-06, + "loss": 1.7111, + "mean_token_accuracy": 0.5794888734817505, + "num_tokens": 8972986824.0, + "step": 17553 + }, + { + "epoch": 4.746890210924824, + "grad_norm": 0.760563850402832, + "learning_rate": 2.120952128924144e-06, + "loss": 1.685, + "mean_token_accuracy": 0.6063926815986633, + "num_tokens": 8973510948.0, + "step": 17554 + }, + { + "epoch": 4.747160627366144, + "grad_norm": 0.7857894897460938, + "learning_rate": 2.12069467606562e-06, + "loss": 1.635, + "mean_token_accuracy": 0.6396304368972778, + "num_tokens": 8974035168.0, + "step": 17555 + }, + { + "epoch": 4.747431043807463, + "grad_norm": 0.7505438327789307, + "learning_rate": 2.1204374956502975e-06, + "loss": 1.8821, + "mean_token_accuracy": 0.5799591541290283, + "num_tokens": 8974559397.0, + "step": 17556 + }, + { + "epoch": 4.747701460248783, + "grad_norm": 0.7851313352584839, + "learning_rate": 2.120180587686065e-06, + "loss": 1.9398, + "mean_token_accuracy": 0.5641602277755737, + "num_tokens": 8975040996.0, + "step": 17557 + }, + { + "epoch": 4.7479718766901025, + "grad_norm": 0.9303995370864868, + "learning_rate": 2.119923952180805e-06, + "loss": 1.8582, + "mean_token_accuracy": 0.5702850818634033, + "num_tokens": 8975565209.0, + "step": 17558 + }, + { + "epoch": 4.7482422931314225, + "grad_norm": 1.0101231336593628, + "learning_rate": 2.119667589142394e-06, + "loss": 1.8502, + "mean_token_accuracy": 0.5551313757896423, + "num_tokens": 8976089461.0, + "step": 17559 + }, + { + "epoch": 4.748512709572742, + "grad_norm": 0.9389458298683167, + "learning_rate": 2.1194114985786963e-06, + "loss": 1.6181, + "mean_token_accuracy": 0.6201545000076294, + "num_tokens": 8976613534.0, + "step": 17560 + }, + { + "epoch": 4.748783126014062, + "grad_norm": 0.3546393811702728, + "learning_rate": 2.1191556804975697e-06, + "loss": 1.104, + "mean_token_accuracy": 0.6987746953964233, + "num_tokens": 8977137541.0, + "step": 17561 + }, + { + "epoch": 4.749053542455381, + "grad_norm": 0.8539307117462158, + "learning_rate": 2.1189001349068636e-06, + "loss": 1.8277, + "mean_token_accuracy": 0.5901919603347778, + "num_tokens": 8977645643.0, + "step": 17562 + }, + { + "epoch": 4.749323958896701, + "grad_norm": 0.8392009139060974, + "learning_rate": 2.118644861814419e-06, + "loss": 1.6804, + "mean_token_accuracy": 0.595723032951355, + "num_tokens": 8978169926.0, + "step": 17563 + }, + { + "epoch": 4.74959437533802, + "grad_norm": 0.8390695452690125, + "learning_rate": 2.1183898612280677e-06, + "loss": 1.8533, + "mean_token_accuracy": 0.5697146654129028, + "num_tokens": 8978694133.0, + "step": 17564 + }, + { + "epoch": 4.74986479177934, + "grad_norm": 0.8630149364471436, + "learning_rate": 2.1181351331556363e-06, + "loss": 1.8363, + "mean_token_accuracy": 0.5705166459083557, + "num_tokens": 8979218415.0, + "step": 17565 + }, + { + "epoch": 4.75013520822066, + "grad_norm": 0.8859679698944092, + "learning_rate": 2.1178806776049384e-06, + "loss": 1.8209, + "mean_token_accuracy": 0.568336009979248, + "num_tokens": 8979738147.0, + "step": 17566 + }, + { + "epoch": 4.75040562466198, + "grad_norm": 0.8102579116821289, + "learning_rate": 2.1176264945837802e-06, + "loss": 1.7649, + "mean_token_accuracy": 0.5781173706054688, + "num_tokens": 8980221380.0, + "step": 17567 + }, + { + "epoch": 4.750676041103299, + "grad_norm": 0.9371504187583923, + "learning_rate": 2.1173725840999646e-06, + "loss": 1.7709, + "mean_token_accuracy": 0.593603789806366, + "num_tokens": 8980745559.0, + "step": 17568 + }, + { + "epoch": 4.750946457544619, + "grad_norm": 0.736781656742096, + "learning_rate": 2.1171189461612797e-06, + "loss": 1.8265, + "mean_token_accuracy": 0.5740755796432495, + "num_tokens": 8981269633.0, + "step": 17569 + }, + { + "epoch": 4.751216873985938, + "grad_norm": 0.9433727860450745, + "learning_rate": 2.1168655807755075e-06, + "loss": 1.7785, + "mean_token_accuracy": 0.5925053358078003, + "num_tokens": 8981785401.0, + "step": 17570 + }, + { + "epoch": 4.751487290427258, + "grad_norm": 0.8354403376579285, + "learning_rate": 2.1166124879504233e-06, + "loss": 1.7742, + "mean_token_accuracy": 0.5883150100708008, + "num_tokens": 8982309675.0, + "step": 17571 + }, + { + "epoch": 4.7517577068685775, + "grad_norm": 0.8524896502494812, + "learning_rate": 2.1163596676937938e-06, + "loss": 1.7824, + "mean_token_accuracy": 0.5885663032531738, + "num_tokens": 8982797044.0, + "step": 17572 + }, + { + "epoch": 4.7520281233098975, + "grad_norm": 1.0128142833709717, + "learning_rate": 2.1161071200133725e-06, + "loss": 1.8614, + "mean_token_accuracy": 0.5920182466506958, + "num_tokens": 8983261782.0, + "step": 17573 + }, + { + "epoch": 4.752298539751217, + "grad_norm": 0.7656306624412537, + "learning_rate": 2.1158548449169124e-06, + "loss": 1.8445, + "mean_token_accuracy": 0.595930814743042, + "num_tokens": 8983725521.0, + "step": 17574 + }, + { + "epoch": 4.752568956192537, + "grad_norm": 0.9182208180427551, + "learning_rate": 2.1156028424121506e-06, + "loss": 1.7876, + "mean_token_accuracy": 0.5803003907203674, + "num_tokens": 8984249738.0, + "step": 17575 + }, + { + "epoch": 4.752839372633856, + "grad_norm": 0.8961986899375916, + "learning_rate": 2.115351112506821e-06, + "loss": 1.7151, + "mean_token_accuracy": 0.5853517055511475, + "num_tokens": 8984773761.0, + "step": 17576 + }, + { + "epoch": 4.753109789075176, + "grad_norm": 0.882546603679657, + "learning_rate": 2.1150996552086484e-06, + "loss": 1.8714, + "mean_token_accuracy": 0.562716007232666, + "num_tokens": 8985240553.0, + "step": 17577 + }, + { + "epoch": 4.753380205516495, + "grad_norm": 0.8346177339553833, + "learning_rate": 2.1148484705253475e-06, + "loss": 1.8272, + "mean_token_accuracy": 0.6013913750648499, + "num_tokens": 8985764837.0, + "step": 17578 + }, + { + "epoch": 4.753650621957815, + "grad_norm": 0.8190325498580933, + "learning_rate": 2.1145975584646237e-06, + "loss": 1.8248, + "mean_token_accuracy": 0.579424262046814, + "num_tokens": 8986289082.0, + "step": 17579 + }, + { + "epoch": 4.753921038399135, + "grad_norm": 1.031133770942688, + "learning_rate": 2.1143469190341776e-06, + "loss": 1.8554, + "mean_token_accuracy": 0.5608293414115906, + "num_tokens": 8986803169.0, + "step": 17580 + }, + { + "epoch": 4.754191454840454, + "grad_norm": 0.37416312098503113, + "learning_rate": 2.1140965522416996e-06, + "loss": 1.063, + "mean_token_accuracy": 0.7037925720214844, + "num_tokens": 8987313152.0, + "step": 17581 + }, + { + "epoch": 4.754461871281774, + "grad_norm": 0.8538588881492615, + "learning_rate": 2.1138464580948704e-06, + "loss": 1.7359, + "mean_token_accuracy": 0.569078803062439, + "num_tokens": 8987837425.0, + "step": 17582 + }, + { + "epoch": 4.754732287723094, + "grad_norm": 0.8850281834602356, + "learning_rate": 2.1135966366013642e-06, + "loss": 1.7766, + "mean_token_accuracy": 0.5858966112136841, + "num_tokens": 8988361499.0, + "step": 17583 + }, + { + "epoch": 4.755002704164413, + "grad_norm": 0.7492890357971191, + "learning_rate": 2.113347087768847e-06, + "loss": 1.7771, + "mean_token_accuracy": 0.582947850227356, + "num_tokens": 8988885770.0, + "step": 17584 + }, + { + "epoch": 4.755273120605732, + "grad_norm": 0.7909255623817444, + "learning_rate": 2.1130978116049754e-06, + "loss": 1.8643, + "mean_token_accuracy": 0.5533617734909058, + "num_tokens": 8989410041.0, + "step": 17585 + }, + { + "epoch": 4.7555435370470525, + "grad_norm": 0.8441532254219055, + "learning_rate": 2.1128488081173974e-06, + "loss": 1.6726, + "mean_token_accuracy": 0.6009845733642578, + "num_tokens": 8989917369.0, + "step": 17586 + }, + { + "epoch": 4.7558139534883725, + "grad_norm": 0.7283923029899597, + "learning_rate": 2.112600077313753e-06, + "loss": 1.706, + "mean_token_accuracy": 0.5924497842788696, + "num_tokens": 8990441653.0, + "step": 17587 + }, + { + "epoch": 4.756084369929692, + "grad_norm": 0.9955040216445923, + "learning_rate": 2.1123516192016746e-06, + "loss": 1.9372, + "mean_token_accuracy": 0.5847948789596558, + "num_tokens": 8990900397.0, + "step": 17588 + }, + { + "epoch": 4.756354786371011, + "grad_norm": 0.7956796288490295, + "learning_rate": 2.1121034337887847e-06, + "loss": 1.7954, + "mean_token_accuracy": 0.5756518244743347, + "num_tokens": 8991424569.0, + "step": 17589 + }, + { + "epoch": 4.756625202812331, + "grad_norm": 0.7342748045921326, + "learning_rate": 2.1118555210826996e-06, + "loss": 1.8947, + "mean_token_accuracy": 0.5647132992744446, + "num_tokens": 8991948842.0, + "step": 17590 + }, + { + "epoch": 4.756895619253651, + "grad_norm": 0.9293928742408752, + "learning_rate": 2.1116078810910246e-06, + "loss": 1.7814, + "mean_token_accuracy": 0.5792733430862427, + "num_tokens": 8992473090.0, + "step": 17591 + }, + { + "epoch": 4.75716603569497, + "grad_norm": 0.925590455532074, + "learning_rate": 2.1113605138213607e-06, + "loss": 1.8692, + "mean_token_accuracy": 0.5717024803161621, + "num_tokens": 8992997315.0, + "step": 17592 + }, + { + "epoch": 4.7574364521362895, + "grad_norm": 0.9138393402099609, + "learning_rate": 2.111113419281295e-06, + "loss": 1.6668, + "mean_token_accuracy": 0.6073850989341736, + "num_tokens": 8993521511.0, + "step": 17593 + }, + { + "epoch": 4.75770686857761, + "grad_norm": 0.8119555115699768, + "learning_rate": 2.11086659747841e-06, + "loss": 1.8177, + "mean_token_accuracy": 0.5505041480064392, + "num_tokens": 8994045654.0, + "step": 17594 + }, + { + "epoch": 4.757977285018929, + "grad_norm": 0.857639729976654, + "learning_rate": 2.110620048420279e-06, + "loss": 1.8051, + "mean_token_accuracy": 0.5817891359329224, + "num_tokens": 8994569857.0, + "step": 17595 + }, + { + "epoch": 4.758247701460249, + "grad_norm": 0.8986273407936096, + "learning_rate": 2.1103737721144666e-06, + "loss": 1.8258, + "mean_token_accuracy": 0.5776406526565552, + "num_tokens": 8995094040.0, + "step": 17596 + }, + { + "epoch": 4.758518117901568, + "grad_norm": 0.8918991088867188, + "learning_rate": 2.1101277685685305e-06, + "loss": 1.7901, + "mean_token_accuracy": 0.5742154121398926, + "num_tokens": 8995570628.0, + "step": 17597 + }, + { + "epoch": 4.758788534342888, + "grad_norm": 0.7776859402656555, + "learning_rate": 2.1098820377900178e-06, + "loss": 1.8407, + "mean_token_accuracy": 0.5944207906723022, + "num_tokens": 8996064849.0, + "step": 17598 + }, + { + "epoch": 4.759058950784207, + "grad_norm": 0.7274491190910339, + "learning_rate": 2.1096365797864675e-06, + "loss": 1.7605, + "mean_token_accuracy": 0.59944087266922, + "num_tokens": 8996570546.0, + "step": 17599 + }, + { + "epoch": 4.7593293672255275, + "grad_norm": 0.81004798412323, + "learning_rate": 2.1093913945654127e-06, + "loss": 1.841, + "mean_token_accuracy": 0.5857104063034058, + "num_tokens": 8997094653.0, + "step": 17600 + }, + { + "epoch": 4.759599783666847, + "grad_norm": 0.36116158962249756, + "learning_rate": 2.109146482134375e-06, + "loss": 0.9942, + "mean_token_accuracy": 0.716468095779419, + "num_tokens": 8997618861.0, + "step": 17601 + }, + { + "epoch": 4.759870200108167, + "grad_norm": 0.7919545769691467, + "learning_rate": 2.108901842500869e-06, + "loss": 1.7668, + "mean_token_accuracy": 0.6114932894706726, + "num_tokens": 8998143045.0, + "step": 17602 + }, + { + "epoch": 4.760140616549486, + "grad_norm": 0.8006089329719543, + "learning_rate": 2.1086574756724017e-06, + "loss": 1.7855, + "mean_token_accuracy": 0.5830665826797485, + "num_tokens": 8998635687.0, + "step": 17603 + }, + { + "epoch": 4.760411032990806, + "grad_norm": 0.8195778727531433, + "learning_rate": 2.108413381656471e-06, + "loss": 1.8414, + "mean_token_accuracy": 0.567997932434082, + "num_tokens": 8999159794.0, + "step": 17604 + }, + { + "epoch": 4.760681449432125, + "grad_norm": 0.8201197981834412, + "learning_rate": 2.108169560460566e-06, + "loss": 1.8166, + "mean_token_accuracy": 0.5817337036132812, + "num_tokens": 8999684003.0, + "step": 17605 + }, + { + "epoch": 4.760951865873445, + "grad_norm": 0.8142721652984619, + "learning_rate": 2.107926012092169e-06, + "loss": 1.8597, + "mean_token_accuracy": 0.551918625831604, + "num_tokens": 9000208275.0, + "step": 17606 + }, + { + "epoch": 4.7612222823147645, + "grad_norm": 0.8219768404960632, + "learning_rate": 2.107682736558751e-06, + "loss": 1.8611, + "mean_token_accuracy": 0.5731958746910095, + "num_tokens": 9000732543.0, + "step": 17607 + }, + { + "epoch": 4.761492698756085, + "grad_norm": 0.8311392664909363, + "learning_rate": 2.1074397338677776e-06, + "loss": 1.8466, + "mean_token_accuracy": 0.5828647613525391, + "num_tokens": 9001201360.0, + "step": 17608 + }, + { + "epoch": 4.761763115197404, + "grad_norm": 1.2325586080551147, + "learning_rate": 2.1071970040267036e-06, + "loss": 1.9779, + "mean_token_accuracy": 0.554541289806366, + "num_tokens": 9001725584.0, + "step": 17609 + }, + { + "epoch": 4.762033531638724, + "grad_norm": 1.0359878540039062, + "learning_rate": 2.1069545470429788e-06, + "loss": 1.6605, + "mean_token_accuracy": 0.6065120697021484, + "num_tokens": 9002249723.0, + "step": 17610 + }, + { + "epoch": 4.762303948080043, + "grad_norm": 0.7995163202285767, + "learning_rate": 2.1067123629240404e-06, + "loss": 1.5871, + "mean_token_accuracy": 0.6360000371932983, + "num_tokens": 9002756211.0, + "step": 17611 + }, + { + "epoch": 4.762574364521363, + "grad_norm": 0.8919191360473633, + "learning_rate": 2.106470451677321e-06, + "loss": 1.8255, + "mean_token_accuracy": 0.5695540904998779, + "num_tokens": 9003280476.0, + "step": 17612 + }, + { + "epoch": 4.762844780962682, + "grad_norm": 0.8635651469230652, + "learning_rate": 2.106228813310241e-06, + "loss": 1.6612, + "mean_token_accuracy": 0.6299421191215515, + "num_tokens": 9003792848.0, + "step": 17613 + }, + { + "epoch": 4.7631151974040025, + "grad_norm": 0.7761894464492798, + "learning_rate": 2.1059874478302165e-06, + "loss": 1.7846, + "mean_token_accuracy": 0.5856274366378784, + "num_tokens": 9004264783.0, + "step": 17614 + }, + { + "epoch": 4.763385613845322, + "grad_norm": 0.7230052351951599, + "learning_rate": 2.105746355244653e-06, + "loss": 1.8781, + "mean_token_accuracy": 0.5623819231987, + "num_tokens": 9004789027.0, + "step": 17615 + }, + { + "epoch": 4.763656030286642, + "grad_norm": 0.8650824427604675, + "learning_rate": 2.105505535560948e-06, + "loss": 1.9014, + "mean_token_accuracy": 0.5717568397521973, + "num_tokens": 9005313301.0, + "step": 17616 + }, + { + "epoch": 4.763926446727961, + "grad_norm": 0.8742016553878784, + "learning_rate": 2.105264988786489e-06, + "loss": 1.7894, + "mean_token_accuracy": 0.5819735527038574, + "num_tokens": 9005836392.0, + "step": 17617 + }, + { + "epoch": 4.764196863169281, + "grad_norm": 1.0098272562026978, + "learning_rate": 2.1050247149286584e-06, + "loss": 1.7377, + "mean_token_accuracy": 0.6118086576461792, + "num_tokens": 9006279471.0, + "step": 17618 + }, + { + "epoch": 4.7644672796106, + "grad_norm": 0.8933142423629761, + "learning_rate": 2.1047847139948276e-06, + "loss": 1.8178, + "mean_token_accuracy": 0.5717675685882568, + "num_tokens": 9006754337.0, + "step": 17619 + }, + { + "epoch": 4.76473769605192, + "grad_norm": 0.7735505700111389, + "learning_rate": 2.1045449859923617e-06, + "loss": 1.813, + "mean_token_accuracy": 0.5913177728652954, + "num_tokens": 9007230147.0, + "step": 17620 + }, + { + "epoch": 4.7650081124932395, + "grad_norm": 0.39436957240104675, + "learning_rate": 2.104305530928615e-06, + "loss": 1.0965, + "mean_token_accuracy": 0.7010248899459839, + "num_tokens": 9007696826.0, + "step": 17621 + }, + { + "epoch": 4.765278528934559, + "grad_norm": 0.9410625696182251, + "learning_rate": 2.1040663488109356e-06, + "loss": 1.8643, + "mean_token_accuracy": 0.5725537538528442, + "num_tokens": 9008221017.0, + "step": 17622 + }, + { + "epoch": 4.765548945375879, + "grad_norm": 0.9598236680030823, + "learning_rate": 2.103827439646662e-06, + "loss": 1.9225, + "mean_token_accuracy": 0.5640512704849243, + "num_tokens": 9008745170.0, + "step": 17623 + }, + { + "epoch": 4.765819361817199, + "grad_norm": 0.893080472946167, + "learning_rate": 2.103588803443124e-06, + "loss": 1.8539, + "mean_token_accuracy": 0.5982329845428467, + "num_tokens": 9009205219.0, + "step": 17624 + }, + { + "epoch": 4.766089778258518, + "grad_norm": 0.8424788117408752, + "learning_rate": 2.103350440207643e-06, + "loss": 1.7257, + "mean_token_accuracy": 0.5761055946350098, + "num_tokens": 9009729250.0, + "step": 17625 + }, + { + "epoch": 4.766360194699837, + "grad_norm": 0.9429053664207458, + "learning_rate": 2.1031123499475347e-06, + "loss": 1.7984, + "mean_token_accuracy": 0.5930317640304565, + "num_tokens": 9010216412.0, + "step": 17626 + }, + { + "epoch": 4.766630611141157, + "grad_norm": 0.9643758535385132, + "learning_rate": 2.102874532670104e-06, + "loss": 1.9547, + "mean_token_accuracy": 0.5549510717391968, + "num_tokens": 9010689946.0, + "step": 17627 + }, + { + "epoch": 4.7669010275824775, + "grad_norm": 0.9025964140892029, + "learning_rate": 2.102636988382648e-06, + "loss": 1.8233, + "mean_token_accuracy": 0.5667957067489624, + "num_tokens": 9011214027.0, + "step": 17628 + }, + { + "epoch": 4.767171444023797, + "grad_norm": 0.9914220571517944, + "learning_rate": 2.1023997170924533e-06, + "loss": 1.8011, + "mean_token_accuracy": 0.5920611619949341, + "num_tokens": 9011738284.0, + "step": 17629 + }, + { + "epoch": 4.767441860465116, + "grad_norm": 0.7662301659584045, + "learning_rate": 2.1021627188068015e-06, + "loss": 1.8429, + "mean_token_accuracy": 0.570797324180603, + "num_tokens": 9012262524.0, + "step": 17630 + }, + { + "epoch": 4.767712276906436, + "grad_norm": 0.7381129264831543, + "learning_rate": 2.1019259935329657e-06, + "loss": 1.7557, + "mean_token_accuracy": 0.6022979021072388, + "num_tokens": 9012786625.0, + "step": 17631 + }, + { + "epoch": 4.767982693347756, + "grad_norm": 0.7937808036804199, + "learning_rate": 2.1016895412782065e-06, + "loss": 1.7586, + "mean_token_accuracy": 0.5882383584976196, + "num_tokens": 9013310897.0, + "step": 17632 + }, + { + "epoch": 4.768253109789075, + "grad_norm": 0.7687832117080688, + "learning_rate": 2.101453362049781e-06, + "loss": 1.8237, + "mean_token_accuracy": 0.579008936882019, + "num_tokens": 9013791085.0, + "step": 17633 + }, + { + "epoch": 4.768523526230394, + "grad_norm": 0.7092247009277344, + "learning_rate": 2.1012174558549343e-06, + "loss": 1.826, + "mean_token_accuracy": 0.5643545389175415, + "num_tokens": 9014315267.0, + "step": 17634 + }, + { + "epoch": 4.7687939426717145, + "grad_norm": 0.8919996023178101, + "learning_rate": 2.1009818227009077e-06, + "loss": 1.8517, + "mean_token_accuracy": 0.5662965178489685, + "num_tokens": 9014839319.0, + "step": 17635 + }, + { + "epoch": 4.769064359113034, + "grad_norm": 0.986951470375061, + "learning_rate": 2.100746462594927e-06, + "loss": 1.8416, + "mean_token_accuracy": 0.5771142244338989, + "num_tokens": 9015312784.0, + "step": 17636 + }, + { + "epoch": 4.769334775554354, + "grad_norm": 0.8370260000228882, + "learning_rate": 2.100511375544217e-06, + "loss": 1.8191, + "mean_token_accuracy": 0.5777409672737122, + "num_tokens": 9015837007.0, + "step": 17637 + }, + { + "epoch": 4.769605191995673, + "grad_norm": 1.0279693603515625, + "learning_rate": 2.100276561555989e-06, + "loss": 1.9139, + "mean_token_accuracy": 0.5519922971725464, + "num_tokens": 9016337932.0, + "step": 17638 + }, + { + "epoch": 4.769875608436993, + "grad_norm": 0.9019807577133179, + "learning_rate": 2.1000420206374497e-06, + "loss": 1.9155, + "mean_token_accuracy": 0.5610127449035645, + "num_tokens": 9016835672.0, + "step": 17639 + }, + { + "epoch": 4.770146024878312, + "grad_norm": 0.7650293111801147, + "learning_rate": 2.099807752795793e-06, + "loss": 1.7942, + "mean_token_accuracy": 0.5880085229873657, + "num_tokens": 9017348953.0, + "step": 17640 + }, + { + "epoch": 4.770416441319632, + "grad_norm": 0.36995571851730347, + "learning_rate": 2.0995737580382085e-06, + "loss": 1.0661, + "mean_token_accuracy": 0.7032552361488342, + "num_tokens": 9017819590.0, + "step": 17641 + }, + { + "epoch": 4.770686857760952, + "grad_norm": 0.7582262754440308, + "learning_rate": 2.099340036371876e-06, + "loss": 1.7663, + "mean_token_accuracy": 0.5909844636917114, + "num_tokens": 9018343865.0, + "step": 17642 + }, + { + "epoch": 4.770957274202272, + "grad_norm": 0.8972158432006836, + "learning_rate": 2.099106587803966e-06, + "loss": 1.812, + "mean_token_accuracy": 0.5772433280944824, + "num_tokens": 9018868068.0, + "step": 17643 + }, + { + "epoch": 4.771227690643591, + "grad_norm": 0.7428659796714783, + "learning_rate": 2.0988734123416427e-06, + "loss": 1.8091, + "mean_token_accuracy": 0.5777053833007812, + "num_tokens": 9019387431.0, + "step": 17644 + }, + { + "epoch": 4.771498107084911, + "grad_norm": 0.7608616948127747, + "learning_rate": 2.0986405099920577e-06, + "loss": 1.7985, + "mean_token_accuracy": 0.5741621255874634, + "num_tokens": 9019911571.0, + "step": 17645 + }, + { + "epoch": 4.77176852352623, + "grad_norm": 0.8365705013275146, + "learning_rate": 2.09840788076236e-06, + "loss": 1.9674, + "mean_token_accuracy": 0.556233286857605, + "num_tokens": 9020435842.0, + "step": 17646 + }, + { + "epoch": 4.77203893996755, + "grad_norm": 0.8328032493591309, + "learning_rate": 2.0981755246596864e-06, + "loss": 1.8857, + "mean_token_accuracy": 0.5597178339958191, + "num_tokens": 9020960109.0, + "step": 17647 + }, + { + "epoch": 4.772309356408869, + "grad_norm": 0.8625178933143616, + "learning_rate": 2.0979434416911664e-06, + "loss": 1.8493, + "mean_token_accuracy": 0.5699188709259033, + "num_tokens": 9021484247.0, + "step": 17648 + }, + { + "epoch": 4.7725797728501895, + "grad_norm": 0.9312028288841248, + "learning_rate": 2.097711631863921e-06, + "loss": 1.8999, + "mean_token_accuracy": 0.5636723041534424, + "num_tokens": 9022008413.0, + "step": 17649 + }, + { + "epoch": 4.772850189291509, + "grad_norm": 0.9202159643173218, + "learning_rate": 2.097480095185062e-06, + "loss": 1.8562, + "mean_token_accuracy": 0.5689114332199097, + "num_tokens": 9022532619.0, + "step": 17650 + }, + { + "epoch": 4.773120605732829, + "grad_norm": 0.7854678630828857, + "learning_rate": 2.0972488316616947e-06, + "loss": 1.7588, + "mean_token_accuracy": 0.595798134803772, + "num_tokens": 9023052285.0, + "step": 17651 + }, + { + "epoch": 4.773391022174148, + "grad_norm": 0.8781086206436157, + "learning_rate": 2.097017841300915e-06, + "loss": 1.7818, + "mean_token_accuracy": 0.570756196975708, + "num_tokens": 9023576342.0, + "step": 17652 + }, + { + "epoch": 4.773661438615468, + "grad_norm": 0.8773707151412964, + "learning_rate": 2.096787124109809e-06, + "loss": 1.863, + "mean_token_accuracy": 0.5594101548194885, + "num_tokens": 9024100535.0, + "step": 17653 + }, + { + "epoch": 4.773931855056787, + "grad_norm": 0.828592836856842, + "learning_rate": 2.096556680095457e-06, + "loss": 1.7752, + "mean_token_accuracy": 0.5890610218048096, + "num_tokens": 9024624735.0, + "step": 17654 + }, + { + "epoch": 4.774202271498107, + "grad_norm": 0.8063544631004333, + "learning_rate": 2.09632650926493e-06, + "loss": 1.8084, + "mean_token_accuracy": 0.5802841186523438, + "num_tokens": 9025148943.0, + "step": 17655 + }, + { + "epoch": 4.774472687939427, + "grad_norm": 0.840968132019043, + "learning_rate": 2.0960966116252884e-06, + "loss": 1.8222, + "mean_token_accuracy": 0.5775513648986816, + "num_tokens": 9025673178.0, + "step": 17656 + }, + { + "epoch": 4.774743104380747, + "grad_norm": 0.8587331175804138, + "learning_rate": 2.0958669871835884e-06, + "loss": 1.7601, + "mean_token_accuracy": 0.573150634765625, + "num_tokens": 9026197297.0, + "step": 17657 + }, + { + "epoch": 4.775013520822066, + "grad_norm": 0.9281879663467407, + "learning_rate": 2.095637635946875e-06, + "loss": 1.8318, + "mean_token_accuracy": 0.5494891405105591, + "num_tokens": 9026721503.0, + "step": 17658 + }, + { + "epoch": 4.775283937263386, + "grad_norm": 0.9185341596603394, + "learning_rate": 2.0954085579221837e-06, + "loss": 1.7985, + "mean_token_accuracy": 0.5855369567871094, + "num_tokens": 9027222900.0, + "step": 17659 + }, + { + "epoch": 4.775554353704705, + "grad_norm": 0.8317984938621521, + "learning_rate": 2.095179753116546e-06, + "loss": 1.7301, + "mean_token_accuracy": 0.5959235429763794, + "num_tokens": 9027722244.0, + "step": 17660 + }, + { + "epoch": 4.775824770146025, + "grad_norm": 0.3403145968914032, + "learning_rate": 2.0949512215369793e-06, + "loss": 1.0556, + "mean_token_accuracy": 0.7129487991333008, + "num_tokens": 9028246495.0, + "step": 17661 + }, + { + "epoch": 4.776095186587344, + "grad_norm": 1.0167664289474487, + "learning_rate": 2.0947229631904973e-06, + "loss": 1.7638, + "mean_token_accuracy": 0.5725639462471008, + "num_tokens": 9028770768.0, + "step": 17662 + }, + { + "epoch": 4.7763656030286645, + "grad_norm": 1.0648183822631836, + "learning_rate": 2.0944949780841047e-06, + "loss": 1.805, + "mean_token_accuracy": 0.5962581634521484, + "num_tokens": 9029250168.0, + "step": 17663 + }, + { + "epoch": 4.776636019469984, + "grad_norm": 0.7899486422538757, + "learning_rate": 2.094267266224795e-06, + "loss": 1.8814, + "mean_token_accuracy": 0.5707171559333801, + "num_tokens": 9029774404.0, + "step": 17664 + }, + { + "epoch": 4.776906435911304, + "grad_norm": 0.8781746625900269, + "learning_rate": 2.094039827619555e-06, + "loss": 1.9292, + "mean_token_accuracy": 0.584162712097168, + "num_tokens": 9030298440.0, + "step": 17665 + }, + { + "epoch": 4.777176852352623, + "grad_norm": 0.8767796754837036, + "learning_rate": 2.0938126622753646e-06, + "loss": 1.7497, + "mean_token_accuracy": 0.5955090522766113, + "num_tokens": 9030822508.0, + "step": 17666 + }, + { + "epoch": 4.777447268793942, + "grad_norm": 0.8926335573196411, + "learning_rate": 2.0935857701991934e-06, + "loss": 1.7475, + "mean_token_accuracy": 0.5996289253234863, + "num_tokens": 9031282918.0, + "step": 17667 + }, + { + "epoch": 4.777717685235262, + "grad_norm": 0.8448097109794617, + "learning_rate": 2.0933591513980016e-06, + "loss": 1.8028, + "mean_token_accuracy": 0.5838308334350586, + "num_tokens": 9031807180.0, + "step": 17668 + }, + { + "epoch": 4.777988101676582, + "grad_norm": 0.9244452118873596, + "learning_rate": 2.0931328058787452e-06, + "loss": 1.7757, + "mean_token_accuracy": 0.5877159833908081, + "num_tokens": 9032280110.0, + "step": 17669 + }, + { + "epoch": 4.778258518117902, + "grad_norm": 0.9414304494857788, + "learning_rate": 2.092906733648367e-06, + "loss": 1.7476, + "mean_token_accuracy": 0.5937809348106384, + "num_tokens": 9032804289.0, + "step": 17670 + }, + { + "epoch": 4.778528934559221, + "grad_norm": 0.8703526258468628, + "learning_rate": 2.0926809347138044e-06, + "loss": 1.8763, + "mean_token_accuracy": 0.5673260688781738, + "num_tokens": 9033328522.0, + "step": 17671 + }, + { + "epoch": 4.778799351000541, + "grad_norm": 0.8004273772239685, + "learning_rate": 2.092455409081985e-06, + "loss": 1.8458, + "mean_token_accuracy": 0.5773841142654419, + "num_tokens": 9033852796.0, + "step": 17672 + }, + { + "epoch": 4.779069767441861, + "grad_norm": 0.8371442556381226, + "learning_rate": 2.092230156759829e-06, + "loss": 1.5555, + "mean_token_accuracy": 0.6317938566207886, + "num_tokens": 9034377079.0, + "step": 17673 + }, + { + "epoch": 4.77934018388318, + "grad_norm": 0.8528309464454651, + "learning_rate": 2.0920051777542486e-06, + "loss": 1.9096, + "mean_token_accuracy": 0.5342049598693848, + "num_tokens": 9034901243.0, + "step": 17674 + }, + { + "epoch": 4.779610600324499, + "grad_norm": 0.7727441191673279, + "learning_rate": 2.0917804720721455e-06, + "loss": 1.7983, + "mean_token_accuracy": 0.5777936577796936, + "num_tokens": 9035376019.0, + "step": 17675 + }, + { + "epoch": 4.7798810167658194, + "grad_norm": 0.8514204025268555, + "learning_rate": 2.091556039720414e-06, + "loss": 1.8121, + "mean_token_accuracy": 0.5940737724304199, + "num_tokens": 9035842788.0, + "step": 17676 + }, + { + "epoch": 4.780151433207139, + "grad_norm": 0.841283917427063, + "learning_rate": 2.091331880705943e-06, + "loss": 1.8667, + "mean_token_accuracy": 0.573811411857605, + "num_tokens": 9036367063.0, + "step": 17677 + }, + { + "epoch": 4.780421849648459, + "grad_norm": 0.7884813547134399, + "learning_rate": 2.091107995035608e-06, + "loss": 1.903, + "mean_token_accuracy": 0.5648466348648071, + "num_tokens": 9036891233.0, + "step": 17678 + }, + { + "epoch": 4.780692266089778, + "grad_norm": 0.7845155596733093, + "learning_rate": 2.0908843827162782e-06, + "loss": 1.8388, + "mean_token_accuracy": 0.5601154565811157, + "num_tokens": 9037415429.0, + "step": 17679 + }, + { + "epoch": 4.780962682531098, + "grad_norm": 0.8239524364471436, + "learning_rate": 2.0906610437548165e-06, + "loss": 1.8363, + "mean_token_accuracy": 0.580640435218811, + "num_tokens": 9037939647.0, + "step": 17680 + }, + { + "epoch": 4.781233098972417, + "grad_norm": 0.3662675619125366, + "learning_rate": 2.0904379781580744e-06, + "loss": 1.2204, + "mean_token_accuracy": 0.6811760067939758, + "num_tokens": 9038407900.0, + "step": 17681 + }, + { + "epoch": 4.781503515413737, + "grad_norm": 0.92915940284729, + "learning_rate": 2.090215185932896e-06, + "loss": 1.8121, + "mean_token_accuracy": 0.5843435525894165, + "num_tokens": 9038932094.0, + "step": 17682 + }, + { + "epoch": 4.7817739318550565, + "grad_norm": 0.956464409828186, + "learning_rate": 2.0899926670861183e-06, + "loss": 1.9396, + "mean_token_accuracy": 0.5587725639343262, + "num_tokens": 9039385663.0, + "step": 17683 + }, + { + "epoch": 4.782044348296377, + "grad_norm": 0.9319024682044983, + "learning_rate": 2.0897704216245683e-06, + "loss": 1.8643, + "mean_token_accuracy": 0.5694245100021362, + "num_tokens": 9039855909.0, + "step": 17684 + }, + { + "epoch": 4.782314764737696, + "grad_norm": 0.7574194669723511, + "learning_rate": 2.089548449555064e-06, + "loss": 1.8068, + "mean_token_accuracy": 0.5832030773162842, + "num_tokens": 9040322389.0, + "step": 17685 + }, + { + "epoch": 4.782585181179016, + "grad_norm": 0.8559732437133789, + "learning_rate": 2.0893267508844183e-06, + "loss": 1.8681, + "mean_token_accuracy": 0.5850329399108887, + "num_tokens": 9040825945.0, + "step": 17686 + }, + { + "epoch": 4.782855597620335, + "grad_norm": 0.8191994428634644, + "learning_rate": 2.089105325619432e-06, + "loss": 1.8603, + "mean_token_accuracy": 0.5715539455413818, + "num_tokens": 9041313705.0, + "step": 17687 + }, + { + "epoch": 4.783126014061655, + "grad_norm": 0.8905112743377686, + "learning_rate": 2.0888841737669003e-06, + "loss": 1.6784, + "mean_token_accuracy": 0.6050201654434204, + "num_tokens": 9041778079.0, + "step": 17688 + }, + { + "epoch": 4.783396430502974, + "grad_norm": 0.8095402121543884, + "learning_rate": 2.088663295333607e-06, + "loss": 1.8153, + "mean_token_accuracy": 0.5838184356689453, + "num_tokens": 9042302060.0, + "step": 17689 + }, + { + "epoch": 4.7836668469442944, + "grad_norm": 0.7632208466529846, + "learning_rate": 2.08844269032633e-06, + "loss": 1.7792, + "mean_token_accuracy": 0.5752557516098022, + "num_tokens": 9042826198.0, + "step": 17690 + }, + { + "epoch": 4.783937263385614, + "grad_norm": 0.9000329375267029, + "learning_rate": 2.0882223587518385e-06, + "loss": 1.8238, + "mean_token_accuracy": 0.5803855657577515, + "num_tokens": 9043308228.0, + "step": 17691 + }, + { + "epoch": 4.784207679826934, + "grad_norm": 0.924875020980835, + "learning_rate": 2.088002300616894e-06, + "loss": 1.7897, + "mean_token_accuracy": 0.5838505625724792, + "num_tokens": 9043832327.0, + "step": 17692 + }, + { + "epoch": 4.784478096268253, + "grad_norm": 0.8602734804153442, + "learning_rate": 2.0877825159282477e-06, + "loss": 1.7433, + "mean_token_accuracy": 0.5957784652709961, + "num_tokens": 9044356505.0, + "step": 17693 + }, + { + "epoch": 4.784748512709573, + "grad_norm": 0.9429853558540344, + "learning_rate": 2.087563004692641e-06, + "loss": 1.8585, + "mean_token_accuracy": 0.5821254849433899, + "num_tokens": 9044880677.0, + "step": 17694 + }, + { + "epoch": 4.785018929150892, + "grad_norm": 0.7995328903198242, + "learning_rate": 2.0873437669168124e-06, + "loss": 1.6153, + "mean_token_accuracy": 0.6226609945297241, + "num_tokens": 9045404859.0, + "step": 17695 + }, + { + "epoch": 4.785289345592212, + "grad_norm": 0.919832706451416, + "learning_rate": 2.0871248026074872e-06, + "loss": 1.9834, + "mean_token_accuracy": 0.5501411557197571, + "num_tokens": 9045911309.0, + "step": 17696 + }, + { + "epoch": 4.7855597620335315, + "grad_norm": 0.8469182252883911, + "learning_rate": 2.086906111771384e-06, + "loss": 1.8142, + "mean_token_accuracy": 0.5732471942901611, + "num_tokens": 9046392307.0, + "step": 17697 + }, + { + "epoch": 4.785830178474852, + "grad_norm": 0.8382552266120911, + "learning_rate": 2.0866876944152138e-06, + "loss": 1.4978, + "mean_token_accuracy": 0.6291131973266602, + "num_tokens": 9046916562.0, + "step": 17698 + }, + { + "epoch": 4.786100594916171, + "grad_norm": 0.9208321571350098, + "learning_rate": 2.0864695505456765e-06, + "loss": 1.8922, + "mean_token_accuracy": 0.5663904547691345, + "num_tokens": 9047440744.0, + "step": 17699 + }, + { + "epoch": 4.786371011357491, + "grad_norm": 0.8262743949890137, + "learning_rate": 2.086251680169466e-06, + "loss": 1.8832, + "mean_token_accuracy": 0.5581272840499878, + "num_tokens": 9047961808.0, + "step": 17700 + }, + { + "epoch": 4.78664142779881, + "grad_norm": 0.3304325342178345, + "learning_rate": 2.0860340832932687e-06, + "loss": 1.0924, + "mean_token_accuracy": 0.7066631317138672, + "num_tokens": 9048486058.0, + "step": 17701 + }, + { + "epoch": 4.78691184424013, + "grad_norm": 0.8859944343566895, + "learning_rate": 2.0858167599237596e-06, + "loss": 1.8117, + "mean_token_accuracy": 0.5670853853225708, + "num_tokens": 9049010322.0, + "step": 17702 + }, + { + "epoch": 4.787182260681449, + "grad_norm": 0.830602765083313, + "learning_rate": 2.085599710067607e-06, + "loss": 1.9586, + "mean_token_accuracy": 0.5483168363571167, + "num_tokens": 9049534421.0, + "step": 17703 + }, + { + "epoch": 4.7874526771227695, + "grad_norm": 0.800888180732727, + "learning_rate": 2.0853829337314708e-06, + "loss": 1.866, + "mean_token_accuracy": 0.5801042318344116, + "num_tokens": 9050058697.0, + "step": 17704 + }, + { + "epoch": 4.787723093564089, + "grad_norm": 1.4231007099151611, + "learning_rate": 2.0851664309220025e-06, + "loss": 1.8279, + "mean_token_accuracy": 0.5942994356155396, + "num_tokens": 9050582929.0, + "step": 17705 + }, + { + "epoch": 4.787993510005409, + "grad_norm": 0.7887952327728271, + "learning_rate": 2.0849502016458453e-06, + "loss": 1.7221, + "mean_token_accuracy": 0.6085844039916992, + "num_tokens": 9051107185.0, + "step": 17706 + }, + { + "epoch": 4.788263926446728, + "grad_norm": 0.8647623062133789, + "learning_rate": 2.0847342459096335e-06, + "loss": 1.8374, + "mean_token_accuracy": 0.5826317071914673, + "num_tokens": 9051631416.0, + "step": 17707 + }, + { + "epoch": 4.788534342888047, + "grad_norm": 0.9030976891517639, + "learning_rate": 2.0845185637199926e-06, + "loss": 1.7993, + "mean_token_accuracy": 0.5790834426879883, + "num_tokens": 9052155610.0, + "step": 17708 + }, + { + "epoch": 4.788804759329367, + "grad_norm": 1.0035544633865356, + "learning_rate": 2.0843031550835405e-06, + "loss": 1.8521, + "mean_token_accuracy": 0.571728527545929, + "num_tokens": 9052679803.0, + "step": 17709 + }, + { + "epoch": 4.789075175770687, + "grad_norm": 0.8675665855407715, + "learning_rate": 2.084088020006888e-06, + "loss": 1.7259, + "mean_token_accuracy": 0.574785053730011, + "num_tokens": 9053203986.0, + "step": 17710 + }, + { + "epoch": 4.7893455922120065, + "grad_norm": 0.9677753448486328, + "learning_rate": 2.0838731584966344e-06, + "loss": 1.8182, + "mean_token_accuracy": 0.582811713218689, + "num_tokens": 9053688295.0, + "step": 17711 + }, + { + "epoch": 4.789616008653326, + "grad_norm": 0.8408267498016357, + "learning_rate": 2.083658570559373e-06, + "loss": 1.6532, + "mean_token_accuracy": 0.6129461526870728, + "num_tokens": 9054212528.0, + "step": 17712 + }, + { + "epoch": 4.789886425094646, + "grad_norm": 0.8051175475120544, + "learning_rate": 2.0834442562016886e-06, + "loss": 1.7732, + "mean_token_accuracy": 0.6144877076148987, + "num_tokens": 9054671902.0, + "step": 17713 + }, + { + "epoch": 4.790156841535966, + "grad_norm": 0.9243943691253662, + "learning_rate": 2.083230215430155e-06, + "loss": 1.8511, + "mean_token_accuracy": 0.5701396465301514, + "num_tokens": 9055172678.0, + "step": 17714 + }, + { + "epoch": 4.790427257977285, + "grad_norm": 0.9013399481773376, + "learning_rate": 2.0830164482513415e-06, + "loss": 1.8565, + "mean_token_accuracy": 0.5676246881484985, + "num_tokens": 9055658420.0, + "step": 17715 + }, + { + "epoch": 4.790697674418604, + "grad_norm": 0.880694568157196, + "learning_rate": 2.0828029546718065e-06, + "loss": 1.8669, + "mean_token_accuracy": 0.5617535710334778, + "num_tokens": 9056182644.0, + "step": 17716 + }, + { + "epoch": 4.790968090859924, + "grad_norm": 0.8480312824249268, + "learning_rate": 2.0825897346981004e-06, + "loss": 1.855, + "mean_token_accuracy": 0.5877187252044678, + "num_tokens": 9056650095.0, + "step": 17717 + }, + { + "epoch": 4.791238507301244, + "grad_norm": 0.8076450228691101, + "learning_rate": 2.082376788336766e-06, + "loss": 1.8637, + "mean_token_accuracy": 0.5745044350624084, + "num_tokens": 9057174310.0, + "step": 17718 + }, + { + "epoch": 4.791508923742564, + "grad_norm": 0.8390880823135376, + "learning_rate": 2.082164115594336e-06, + "loss": 1.6891, + "mean_token_accuracy": 0.5989654660224915, + "num_tokens": 9057651539.0, + "step": 17719 + }, + { + "epoch": 4.791779340183883, + "grad_norm": 0.8375870585441589, + "learning_rate": 2.081951716477337e-06, + "loss": 1.8071, + "mean_token_accuracy": 0.5765830278396606, + "num_tokens": 9058167771.0, + "step": 17720 + }, + { + "epoch": 4.792049756625203, + "grad_norm": 0.3380003273487091, + "learning_rate": 2.0817395909922866e-06, + "loss": 1.1053, + "mean_token_accuracy": 0.6967304944992065, + "num_tokens": 9058691942.0, + "step": 17721 + }, + { + "epoch": 4.792320173066522, + "grad_norm": 0.9357646107673645, + "learning_rate": 2.081527739145691e-06, + "loss": 1.8758, + "mean_token_accuracy": 0.5644274950027466, + "num_tokens": 9059216167.0, + "step": 17722 + }, + { + "epoch": 4.792590589507842, + "grad_norm": 0.8998599648475647, + "learning_rate": 2.0813161609440526e-06, + "loss": 1.8648, + "mean_token_accuracy": 0.5799566507339478, + "num_tokens": 9059688528.0, + "step": 17723 + }, + { + "epoch": 4.792861005949161, + "grad_norm": 0.8992066979408264, + "learning_rate": 2.081104856393863e-06, + "loss": 1.8968, + "mean_token_accuracy": 0.5589438676834106, + "num_tokens": 9060212781.0, + "step": 17724 + }, + { + "epoch": 4.7931314223904815, + "grad_norm": 0.8299600481987, + "learning_rate": 2.0808938255016042e-06, + "loss": 1.8789, + "mean_token_accuracy": 0.5573968291282654, + "num_tokens": 9060736922.0, + "step": 17725 + }, + { + "epoch": 4.793401838831801, + "grad_norm": 0.7762044668197632, + "learning_rate": 2.0806830682737524e-06, + "loss": 1.8312, + "mean_token_accuracy": 0.5792568922042847, + "num_tokens": 9061238502.0, + "step": 17726 + }, + { + "epoch": 4.793672255273121, + "grad_norm": 0.8797130584716797, + "learning_rate": 2.0804725847167744e-06, + "loss": 1.7741, + "mean_token_accuracy": 0.597527801990509, + "num_tokens": 9061762656.0, + "step": 17727 + }, + { + "epoch": 4.79394267171444, + "grad_norm": 0.8619800209999084, + "learning_rate": 2.0802623748371285e-06, + "loss": 1.9619, + "mean_token_accuracy": 0.553464412689209, + "num_tokens": 9062231102.0, + "step": 17728 + }, + { + "epoch": 4.79421308815576, + "grad_norm": 0.8534674048423767, + "learning_rate": 2.080052438641264e-06, + "loss": 1.8624, + "mean_token_accuracy": 0.576749861240387, + "num_tokens": 9062755271.0, + "step": 17729 + }, + { + "epoch": 4.794483504597079, + "grad_norm": 0.8128117322921753, + "learning_rate": 2.079842776135622e-06, + "loss": 1.8096, + "mean_token_accuracy": 0.5846766233444214, + "num_tokens": 9063279466.0, + "step": 17730 + }, + { + "epoch": 4.794753921038399, + "grad_norm": 0.8713854551315308, + "learning_rate": 2.0796333873266367e-06, + "loss": 1.8429, + "mean_token_accuracy": 0.5866367816925049, + "num_tokens": 9063773647.0, + "step": 17731 + }, + { + "epoch": 4.795024337479719, + "grad_norm": 0.9974962472915649, + "learning_rate": 2.079424272220731e-06, + "loss": 1.9429, + "mean_token_accuracy": 0.5795358419418335, + "num_tokens": 9064240631.0, + "step": 17732 + }, + { + "epoch": 4.795294753921039, + "grad_norm": 0.8759713172912598, + "learning_rate": 2.079215430824323e-06, + "loss": 1.8096, + "mean_token_accuracy": 0.5685281753540039, + "num_tokens": 9064764804.0, + "step": 17733 + }, + { + "epoch": 4.795565170362358, + "grad_norm": 0.9385712146759033, + "learning_rate": 2.0790068631438193e-06, + "loss": 1.7821, + "mean_token_accuracy": 0.5784488916397095, + "num_tokens": 9065285658.0, + "step": 17734 + }, + { + "epoch": 4.795835586803678, + "grad_norm": 0.799241840839386, + "learning_rate": 2.078798569185622e-06, + "loss": 1.7825, + "mean_token_accuracy": 0.5940559506416321, + "num_tokens": 9065791490.0, + "step": 17735 + }, + { + "epoch": 4.796106003244997, + "grad_norm": 0.7299472689628601, + "learning_rate": 2.078590548956118e-06, + "loss": 1.7784, + "mean_token_accuracy": 0.5929350852966309, + "num_tokens": 9066315576.0, + "step": 17736 + }, + { + "epoch": 4.796376419686317, + "grad_norm": 0.821627676486969, + "learning_rate": 2.0783828024616926e-06, + "loss": 1.864, + "mean_token_accuracy": 0.5686280727386475, + "num_tokens": 9066778619.0, + "step": 17737 + }, + { + "epoch": 4.796646836127636, + "grad_norm": 1.046473741531372, + "learning_rate": 2.0781753297087214e-06, + "loss": 1.7936, + "mean_token_accuracy": 0.5516514778137207, + "num_tokens": 9067302861.0, + "step": 17738 + }, + { + "epoch": 4.7969172525689565, + "grad_norm": 0.8879904747009277, + "learning_rate": 2.077968130703567e-06, + "loss": 1.9154, + "mean_token_accuracy": 0.5575791597366333, + "num_tokens": 9067827039.0, + "step": 17739 + }, + { + "epoch": 4.797187669010276, + "grad_norm": 0.7977930903434753, + "learning_rate": 2.077761205452588e-06, + "loss": 1.8884, + "mean_token_accuracy": 0.5564780235290527, + "num_tokens": 9068351254.0, + "step": 17740 + }, + { + "epoch": 4.797458085451596, + "grad_norm": 0.401142954826355, + "learning_rate": 2.077554553962134e-06, + "loss": 1.1145, + "mean_token_accuracy": 0.706501305103302, + "num_tokens": 9068857904.0, + "step": 17741 + }, + { + "epoch": 4.797728501892915, + "grad_norm": 0.8404553532600403, + "learning_rate": 2.0773481762385455e-06, + "loss": 1.7206, + "mean_token_accuracy": 0.5851004123687744, + "num_tokens": 9069382124.0, + "step": 17742 + }, + { + "epoch": 4.797998918334235, + "grad_norm": 0.8397428393363953, + "learning_rate": 2.0771420722881552e-06, + "loss": 1.6598, + "mean_token_accuracy": 0.6068556308746338, + "num_tokens": 9069906167.0, + "step": 17743 + }, + { + "epoch": 4.798269334775554, + "grad_norm": 0.9198997020721436, + "learning_rate": 2.076936242117287e-06, + "loss": 1.8796, + "mean_token_accuracy": 0.5817753076553345, + "num_tokens": 9070430453.0, + "step": 17744 + }, + { + "epoch": 4.798539751216874, + "grad_norm": 0.8773131370544434, + "learning_rate": 2.0767306857322543e-06, + "loss": 1.8944, + "mean_token_accuracy": 0.5532881021499634, + "num_tokens": 9070954720.0, + "step": 17745 + }, + { + "epoch": 4.798810167658194, + "grad_norm": 0.747499942779541, + "learning_rate": 2.076525403139366e-06, + "loss": 1.7685, + "mean_token_accuracy": 0.605526328086853, + "num_tokens": 9071421150.0, + "step": 17746 + }, + { + "epoch": 4.799080584099514, + "grad_norm": 0.9491420388221741, + "learning_rate": 2.0763203943449216e-06, + "loss": 1.6878, + "mean_token_accuracy": 0.5958314538002014, + "num_tokens": 9071945397.0, + "step": 17747 + }, + { + "epoch": 4.799351000540833, + "grad_norm": 0.8788787722587585, + "learning_rate": 2.0761156593552097e-06, + "loss": 1.7474, + "mean_token_accuracy": 0.5964610576629639, + "num_tokens": 9072408473.0, + "step": 17748 + }, + { + "epoch": 4.799621416982152, + "grad_norm": 0.8475692868232727, + "learning_rate": 2.075911198176513e-06, + "loss": 1.824, + "mean_token_accuracy": 0.5831735134124756, + "num_tokens": 9072907678.0, + "step": 17749 + }, + { + "epoch": 4.799891833423472, + "grad_norm": 0.8455794453620911, + "learning_rate": 2.075707010815105e-06, + "loss": 1.7915, + "mean_token_accuracy": 0.5934164524078369, + "num_tokens": 9073429609.0, + "step": 17750 + }, + { + "epoch": 4.800162249864792, + "grad_norm": 0.8085415363311768, + "learning_rate": 2.07550309727725e-06, + "loss": 1.884, + "mean_token_accuracy": 0.5792098045349121, + "num_tokens": 9073889167.0, + "step": 17751 + }, + { + "epoch": 4.800432666306111, + "grad_norm": 0.8263648152351379, + "learning_rate": 2.0752994575692056e-06, + "loss": 1.8563, + "mean_token_accuracy": 0.5667086839675903, + "num_tokens": 9074413389.0, + "step": 17752 + }, + { + "epoch": 4.800703082747431, + "grad_norm": 0.7657597064971924, + "learning_rate": 2.07509609169722e-06, + "loss": 1.7875, + "mean_token_accuracy": 0.5817781686782837, + "num_tokens": 9074937589.0, + "step": 17753 + }, + { + "epoch": 4.800973499188751, + "grad_norm": 0.831600546836853, + "learning_rate": 2.0748929996675326e-06, + "loss": 1.8191, + "mean_token_accuracy": 0.5693271160125732, + "num_tokens": 9075461751.0, + "step": 17754 + }, + { + "epoch": 4.801243915630071, + "grad_norm": 0.8082675337791443, + "learning_rate": 2.074690181486375e-06, + "loss": 1.8105, + "mean_token_accuracy": 0.5781799554824829, + "num_tokens": 9075935047.0, + "step": 17755 + }, + { + "epoch": 4.80151433207139, + "grad_norm": 0.7270995378494263, + "learning_rate": 2.07448763715997e-06, + "loss": 1.6766, + "mean_token_accuracy": 0.5941362380981445, + "num_tokens": 9076459291.0, + "step": 17756 + }, + { + "epoch": 4.801784748512709, + "grad_norm": 0.8741438388824463, + "learning_rate": 2.074285366694532e-06, + "loss": 1.9432, + "mean_token_accuracy": 0.5577876567840576, + "num_tokens": 9076983346.0, + "step": 17757 + }, + { + "epoch": 4.802055164954029, + "grad_norm": 0.8797745108604431, + "learning_rate": 2.074083370096269e-06, + "loss": 1.5569, + "mean_token_accuracy": 0.6116015911102295, + "num_tokens": 9077507567.0, + "step": 17758 + }, + { + "epoch": 4.8023255813953485, + "grad_norm": 0.8308241963386536, + "learning_rate": 2.073881647371376e-06, + "loss": 1.9381, + "mean_token_accuracy": 0.5667493343353271, + "num_tokens": 9077973094.0, + "step": 17759 + }, + { + "epoch": 4.802595997836669, + "grad_norm": 0.7547377347946167, + "learning_rate": 2.073680198526046e-06, + "loss": 1.8484, + "mean_token_accuracy": 0.5548588037490845, + "num_tokens": 9078497257.0, + "step": 17760 + }, + { + "epoch": 4.802866414277988, + "grad_norm": 0.3863755166530609, + "learning_rate": 2.0734790235664558e-06, + "loss": 1.0741, + "mean_token_accuracy": 0.6995767951011658, + "num_tokens": 524249.0, + "step": 17761 + }, + { + "epoch": 4.803136830719308, + "grad_norm": 0.8878975510597229, + "learning_rate": 2.0732781224987823e-06, + "loss": 1.8137, + "mean_token_accuracy": 0.5825285315513611, + "num_tokens": 991327.0, + "step": 17762 + }, + { + "epoch": 4.803407247160627, + "grad_norm": 1.076160192489624, + "learning_rate": 2.0730774953291865e-06, + "loss": 1.889, + "mean_token_accuracy": 0.5519222617149353, + "num_tokens": 1515497.0, + "step": 17763 + }, + { + "epoch": 4.803677663601947, + "grad_norm": 0.8409662842750549, + "learning_rate": 2.0728771420638264e-06, + "loss": 1.8201, + "mean_token_accuracy": 0.5879085063934326, + "num_tokens": 2023481.0, + "step": 17764 + }, + { + "epoch": 4.803948080043266, + "grad_norm": 1.1385270357131958, + "learning_rate": 2.0726770627088485e-06, + "loss": 1.8935, + "mean_token_accuracy": 0.5576642155647278, + "num_tokens": 2547607.0, + "step": 17765 + }, + { + "epoch": 4.804218496484586, + "grad_norm": 0.8987575173377991, + "learning_rate": 2.072477257270392e-06, + "loss": 1.7216, + "mean_token_accuracy": 0.5830636024475098, + "num_tokens": 3071735.0, + "step": 17766 + }, + { + "epoch": 4.804488912925906, + "grad_norm": 0.8292415142059326, + "learning_rate": 2.0722777257545874e-06, + "loss": 1.8546, + "mean_token_accuracy": 0.5682330131530762, + "num_tokens": 3537882.0, + "step": 17767 + }, + { + "epoch": 4.804759329367226, + "grad_norm": 0.8261240720748901, + "learning_rate": 2.072078468167556e-06, + "loss": 1.8699, + "mean_token_accuracy": 0.5753856301307678, + "num_tokens": 4062150.0, + "step": 17768 + }, + { + "epoch": 4.805029745808545, + "grad_norm": 0.7325551509857178, + "learning_rate": 2.0718794845154138e-06, + "loss": 1.6657, + "mean_token_accuracy": 0.613057017326355, + "num_tokens": 4586331.0, + "step": 17769 + }, + { + "epoch": 4.805300162249865, + "grad_norm": 0.7680826783180237, + "learning_rate": 2.071680774804264e-06, + "loss": 1.8542, + "mean_token_accuracy": 0.580325722694397, + "num_tokens": 5110390.0, + "step": 17770 + }, + { + "epoch": 4.805570578691184, + "grad_norm": 0.8100419044494629, + "learning_rate": 2.0714823390402044e-06, + "loss": 1.809, + "mean_token_accuracy": 0.5871658325195312, + "num_tokens": 5605511.0, + "step": 17771 + }, + { + "epoch": 4.805840995132504, + "grad_norm": 0.7814841866493225, + "learning_rate": 2.071284177229324e-06, + "loss": 1.8532, + "mean_token_accuracy": 0.5639386177062988, + "num_tokens": 6129782.0, + "step": 17772 + }, + { + "epoch": 4.8061114115738235, + "grad_norm": 0.8812711238861084, + "learning_rate": 2.071086289377702e-06, + "loss": 1.8412, + "mean_token_accuracy": 0.5741418600082397, + "num_tokens": 6632816.0, + "step": 17773 + }, + { + "epoch": 4.806381828015144, + "grad_norm": 0.856239378452301, + "learning_rate": 2.0708886754914113e-06, + "loss": 1.8183, + "mean_token_accuracy": 0.5828552842140198, + "num_tokens": 7156877.0, + "step": 17774 + }, + { + "epoch": 4.806652244456463, + "grad_norm": 0.8836832046508789, + "learning_rate": 2.070691335576515e-06, + "loss": 1.8872, + "mean_token_accuracy": 0.5757905840873718, + "num_tokens": 7628381.0, + "step": 17775 + }, + { + "epoch": 4.806922660897783, + "grad_norm": 0.7700260877609253, + "learning_rate": 2.0704942696390688e-06, + "loss": 1.8536, + "mean_token_accuracy": 0.5842323303222656, + "num_tokens": 8093410.0, + "step": 17776 + }, + { + "epoch": 4.807193077339102, + "grad_norm": 0.6982439160346985, + "learning_rate": 2.0702974776851175e-06, + "loss": 1.8435, + "mean_token_accuracy": 0.5653083324432373, + "num_tokens": 8617596.0, + "step": 17777 + }, + { + "epoch": 4.807463493780422, + "grad_norm": 0.7895453572273254, + "learning_rate": 2.0701009597206998e-06, + "loss": 1.9211, + "mean_token_accuracy": 0.54987633228302, + "num_tokens": 9141650.0, + "step": 17778 + }, + { + "epoch": 4.807733910221741, + "grad_norm": 0.970064103603363, + "learning_rate": 2.0699047157518465e-06, + "loss": 1.8819, + "mean_token_accuracy": 0.5616831183433533, + "num_tokens": 9665923.0, + "step": 17779 + }, + { + "epoch": 4.808004326663061, + "grad_norm": 0.8400454521179199, + "learning_rate": 2.0697087457845767e-06, + "loss": 1.8279, + "mean_token_accuracy": 0.5782142877578735, + "num_tokens": 10130128.0, + "step": 17780 + }, + { + "epoch": 4.808274743104381, + "grad_norm": 0.3380430340766907, + "learning_rate": 2.069513049824907e-06, + "loss": 1.0595, + "mean_token_accuracy": 0.7125152349472046, + "num_tokens": 10654361.0, + "step": 17781 + }, + { + "epoch": 4.808545159545701, + "grad_norm": 0.7655563950538635, + "learning_rate": 2.069317627878838e-06, + "loss": 1.8201, + "mean_token_accuracy": 0.5835050344467163, + "num_tokens": 11171681.0, + "step": 17782 + }, + { + "epoch": 4.80881557598702, + "grad_norm": 0.8352506756782532, + "learning_rate": 2.069122479952369e-06, + "loss": 1.7335, + "mean_token_accuracy": 0.575689435005188, + "num_tokens": 11695921.0, + "step": 17783 + }, + { + "epoch": 4.80908599242834, + "grad_norm": 0.8673760294914246, + "learning_rate": 2.068927606051486e-06, + "loss": 1.9038, + "mean_token_accuracy": 0.5577446222305298, + "num_tokens": 12220191.0, + "step": 17784 + }, + { + "epoch": 4.809356408869659, + "grad_norm": 0.7303031086921692, + "learning_rate": 2.0687330061821683e-06, + "loss": 1.7891, + "mean_token_accuracy": 0.5826653838157654, + "num_tokens": 12718558.0, + "step": 17785 + }, + { + "epoch": 4.809626825310979, + "grad_norm": 0.860093355178833, + "learning_rate": 2.068538680350387e-06, + "loss": 1.8773, + "mean_token_accuracy": 0.5726155638694763, + "num_tokens": 13210612.0, + "step": 17786 + }, + { + "epoch": 4.8098972417522985, + "grad_norm": 0.8362754583358765, + "learning_rate": 2.068344628562105e-06, + "loss": 1.8185, + "mean_token_accuracy": 0.5775395631790161, + "num_tokens": 13734713.0, + "step": 17787 + }, + { + "epoch": 4.810167658193619, + "grad_norm": 1.213394284248352, + "learning_rate": 2.068150850823275e-06, + "loss": 1.7204, + "mean_token_accuracy": 0.5943599939346313, + "num_tokens": 14214648.0, + "step": 17788 + }, + { + "epoch": 4.810438074634938, + "grad_norm": 1.1018502712249756, + "learning_rate": 2.067957347139844e-06, + "loss": 2.0272, + "mean_token_accuracy": 0.529508113861084, + "num_tokens": 14738807.0, + "step": 17789 + }, + { + "epoch": 4.810708491076257, + "grad_norm": 0.8492280840873718, + "learning_rate": 2.0677641175177503e-06, + "loss": 1.8611, + "mean_token_accuracy": 0.5605409145355225, + "num_tokens": 15263047.0, + "step": 17790 + }, + { + "epoch": 4.810978907517577, + "grad_norm": 0.8761090636253357, + "learning_rate": 2.0675711619629212e-06, + "loss": 1.8856, + "mean_token_accuracy": 0.5685955286026001, + "num_tokens": 15752658.0, + "step": 17791 + }, + { + "epoch": 4.811249323958897, + "grad_norm": 1.100771188735962, + "learning_rate": 2.067378480481277e-06, + "loss": 1.7068, + "mean_token_accuracy": 0.5701462626457214, + "num_tokens": 16276890.0, + "step": 17792 + }, + { + "epoch": 4.811519740400216, + "grad_norm": 1.0118942260742188, + "learning_rate": 2.067186073078731e-06, + "loss": 1.7013, + "mean_token_accuracy": 0.6032828092575073, + "num_tokens": 16797638.0, + "step": 17793 + }, + { + "epoch": 4.8117901568415355, + "grad_norm": 0.8189175128936768, + "learning_rate": 2.0669939397611836e-06, + "loss": 1.8278, + "mean_token_accuracy": 0.5766167640686035, + "num_tokens": 17321802.0, + "step": 17794 + }, + { + "epoch": 4.812060573282856, + "grad_norm": 0.7960326075553894, + "learning_rate": 2.0668020805345347e-06, + "loss": 1.8123, + "mean_token_accuracy": 0.5785509347915649, + "num_tokens": 17846061.0, + "step": 17795 + }, + { + "epoch": 4.812330989724176, + "grad_norm": 0.9535610675811768, + "learning_rate": 2.0666104954046686e-06, + "loss": 1.8669, + "mean_token_accuracy": 0.5625498294830322, + "num_tokens": 18370302.0, + "step": 17796 + }, + { + "epoch": 4.812601406165495, + "grad_norm": 0.9687805771827698, + "learning_rate": 2.0664191843774632e-06, + "loss": 1.8367, + "mean_token_accuracy": 0.5849019289016724, + "num_tokens": 18857280.0, + "step": 17797 + }, + { + "epoch": 4.812871822606814, + "grad_norm": 1.0031930208206177, + "learning_rate": 2.066228147458789e-06, + "loss": 1.8548, + "mean_token_accuracy": 0.5718374252319336, + "num_tokens": 19381439.0, + "step": 17798 + }, + { + "epoch": 4.813142239048134, + "grad_norm": 0.8238662481307983, + "learning_rate": 2.066037384654508e-06, + "loss": 1.8101, + "mean_token_accuracy": 0.5837932825088501, + "num_tokens": 19905458.0, + "step": 17799 + }, + { + "epoch": 4.813412655489453, + "grad_norm": 0.8035544157028198, + "learning_rate": 2.0658468959704738e-06, + "loss": 1.7409, + "mean_token_accuracy": 0.6049512624740601, + "num_tokens": 20429597.0, + "step": 17800 + }, + { + "epoch": 4.8136830719307735, + "grad_norm": 0.35840341448783875, + "learning_rate": 2.06565668141253e-06, + "loss": 1.0925, + "mean_token_accuracy": 0.7087151408195496, + "num_tokens": 20922472.0, + "step": 17801 + }, + { + "epoch": 4.813953488372093, + "grad_norm": 0.8704895377159119, + "learning_rate": 2.0654667409865126e-06, + "loss": 1.6213, + "mean_token_accuracy": 0.6287445425987244, + "num_tokens": 21446606.0, + "step": 17802 + }, + { + "epoch": 4.814223904813413, + "grad_norm": 1.0350072383880615, + "learning_rate": 2.0652770746982516e-06, + "loss": 1.768, + "mean_token_accuracy": 0.5807531476020813, + "num_tokens": 21970851.0, + "step": 17803 + }, + { + "epoch": 4.814494321254732, + "grad_norm": 0.9278727769851685, + "learning_rate": 2.0650876825535643e-06, + "loss": 1.7715, + "mean_token_accuracy": 0.5977904796600342, + "num_tokens": 22431294.0, + "step": 17804 + }, + { + "epoch": 4.814764737696052, + "grad_norm": 0.957303524017334, + "learning_rate": 2.0648985645582635e-06, + "loss": 1.6972, + "mean_token_accuracy": 0.5990283489227295, + "num_tokens": 22944136.0, + "step": 17805 + }, + { + "epoch": 4.815035154137371, + "grad_norm": 0.8610675930976868, + "learning_rate": 2.0647097207181497e-06, + "loss": 1.9061, + "mean_token_accuracy": 0.5689120292663574, + "num_tokens": 23454233.0, + "step": 17806 + }, + { + "epoch": 4.815305570578691, + "grad_norm": 0.9782099723815918, + "learning_rate": 2.0645211510390203e-06, + "loss": 1.7612, + "mean_token_accuracy": 0.6077370643615723, + "num_tokens": 23978514.0, + "step": 17807 + }, + { + "epoch": 4.8155759870200106, + "grad_norm": 0.8034722208976746, + "learning_rate": 2.0643328555266594e-06, + "loss": 1.6442, + "mean_token_accuracy": 0.6352255344390869, + "num_tokens": 24502729.0, + "step": 17808 + }, + { + "epoch": 4.815846403461331, + "grad_norm": 0.8259908556938171, + "learning_rate": 2.0641448341868437e-06, + "loss": 1.7793, + "mean_token_accuracy": 0.5823545455932617, + "num_tokens": 25026832.0, + "step": 17809 + }, + { + "epoch": 4.81611681990265, + "grad_norm": 0.822769284248352, + "learning_rate": 2.063957087025343e-06, + "loss": 1.7923, + "mean_token_accuracy": 0.5834934115409851, + "num_tokens": 25551083.0, + "step": 17810 + }, + { + "epoch": 4.81638723634397, + "grad_norm": 0.883468747138977, + "learning_rate": 2.063769614047919e-06, + "loss": 1.7157, + "mean_token_accuracy": 0.5944050550460815, + "num_tokens": 26075136.0, + "step": 17811 + }, + { + "epoch": 4.816657652785289, + "grad_norm": 0.8178231120109558, + "learning_rate": 2.0635824152603227e-06, + "loss": 1.8659, + "mean_token_accuracy": 0.5665073990821838, + "num_tokens": 26599371.0, + "step": 17812 + }, + { + "epoch": 4.816928069226609, + "grad_norm": 0.8179240822792053, + "learning_rate": 2.063395490668298e-06, + "loss": 1.6898, + "mean_token_accuracy": 0.6018099784851074, + "num_tokens": 27075679.0, + "step": 17813 + }, + { + "epoch": 4.817198485667928, + "grad_norm": 0.7810083031654358, + "learning_rate": 2.0632088402775806e-06, + "loss": 1.7938, + "mean_token_accuracy": 0.5705097913742065, + "num_tokens": 27599866.0, + "step": 17814 + }, + { + "epoch": 4.8174689021092485, + "grad_norm": 0.9100900888442993, + "learning_rate": 2.063022464093897e-06, + "loss": 1.7794, + "mean_token_accuracy": 0.5814934372901917, + "num_tokens": 28112955.0, + "step": 17815 + }, + { + "epoch": 4.817739318550568, + "grad_norm": 0.8311601877212524, + "learning_rate": 2.0628363621229667e-06, + "loss": 1.785, + "mean_token_accuracy": 0.5723699331283569, + "num_tokens": 28637133.0, + "step": 17816 + }, + { + "epoch": 4.818009734991888, + "grad_norm": 0.8053682446479797, + "learning_rate": 2.062650534370499e-06, + "loss": 1.7992, + "mean_token_accuracy": 0.5903142690658569, + "num_tokens": 29105445.0, + "step": 17817 + }, + { + "epoch": 4.818280151433207, + "grad_norm": 0.7393456697463989, + "learning_rate": 2.0624649808421953e-06, + "loss": 1.7279, + "mean_token_accuracy": 0.603402853012085, + "num_tokens": 29629434.0, + "step": 17818 + }, + { + "epoch": 4.818550567874527, + "grad_norm": 0.8641261458396912, + "learning_rate": 2.0622797015437508e-06, + "loss": 1.8653, + "mean_token_accuracy": 0.5756580829620361, + "num_tokens": 30130126.0, + "step": 17819 + }, + { + "epoch": 4.818820984315846, + "grad_norm": 0.9895572662353516, + "learning_rate": 2.0620946964808487e-06, + "loss": 1.8712, + "mean_token_accuracy": 0.5550647974014282, + "num_tokens": 30654327.0, + "step": 17820 + }, + { + "epoch": 4.819091400757166, + "grad_norm": 0.3443109095096588, + "learning_rate": 2.061909965659166e-06, + "loss": 1.0833, + "mean_token_accuracy": 0.7109706997871399, + "num_tokens": 31178608.0, + "step": 17821 + }, + { + "epoch": 4.8193618171984856, + "grad_norm": 0.822567343711853, + "learning_rate": 2.0617255090843706e-06, + "loss": 1.8469, + "mean_token_accuracy": 0.569751501083374, + "num_tokens": 31702828.0, + "step": 17822 + }, + { + "epoch": 4.819632233639806, + "grad_norm": 0.8877054452896118, + "learning_rate": 2.061541326762122e-06, + "loss": 1.779, + "mean_token_accuracy": 0.5833035707473755, + "num_tokens": 32210167.0, + "step": 17823 + }, + { + "epoch": 4.819902650081125, + "grad_norm": 0.849903404712677, + "learning_rate": 2.0613574186980722e-06, + "loss": 1.7646, + "mean_token_accuracy": 0.6009767651557922, + "num_tokens": 32734344.0, + "step": 17824 + }, + { + "epoch": 4.820173066522445, + "grad_norm": 0.7927626371383667, + "learning_rate": 2.0611737848978643e-06, + "loss": 1.8516, + "mean_token_accuracy": 0.569692075252533, + "num_tokens": 33258500.0, + "step": 17825 + }, + { + "epoch": 4.820443482963764, + "grad_norm": 0.8403279781341553, + "learning_rate": 2.0609904253671305e-06, + "loss": 1.7604, + "mean_token_accuracy": 0.5869908928871155, + "num_tokens": 33782611.0, + "step": 17826 + }, + { + "epoch": 4.820713899405084, + "grad_norm": 0.8035909533500671, + "learning_rate": 2.0608073401115006e-06, + "loss": 1.7935, + "mean_token_accuracy": 0.5585870742797852, + "num_tokens": 34306782.0, + "step": 17827 + }, + { + "epoch": 4.820984315846403, + "grad_norm": 0.8677810430526733, + "learning_rate": 2.0606245291365885e-06, + "loss": 1.7649, + "mean_token_accuracy": 0.5984178781509399, + "num_tokens": 34790162.0, + "step": 17828 + }, + { + "epoch": 4.8212547322877235, + "grad_norm": 0.8684213161468506, + "learning_rate": 2.060441992448004e-06, + "loss": 1.8043, + "mean_token_accuracy": 0.5879166722297668, + "num_tokens": 35314265.0, + "step": 17829 + }, + { + "epoch": 4.821525148729043, + "grad_norm": 0.8928741812705994, + "learning_rate": 2.06025973005135e-06, + "loss": 1.8221, + "mean_token_accuracy": 0.5660496950149536, + "num_tokens": 35838480.0, + "step": 17830 + }, + { + "epoch": 4.821795565170362, + "grad_norm": 0.9334900379180908, + "learning_rate": 2.0600777419522175e-06, + "loss": 1.8266, + "mean_token_accuracy": 0.5749322175979614, + "num_tokens": 36362632.0, + "step": 17831 + }, + { + "epoch": 4.822065981611682, + "grad_norm": 0.7941701412200928, + "learning_rate": 2.059896028156189e-06, + "loss": 1.7959, + "mean_token_accuracy": 0.5757766962051392, + "num_tokens": 36886839.0, + "step": 17832 + }, + { + "epoch": 4.822336398053002, + "grad_norm": 0.7715753316879272, + "learning_rate": 2.0597145886688434e-06, + "loss": 1.913, + "mean_token_accuracy": 0.5604457259178162, + "num_tokens": 37411123.0, + "step": 17833 + }, + { + "epoch": 4.822606814494321, + "grad_norm": 0.8086880445480347, + "learning_rate": 2.0595334234957444e-06, + "loss": 1.8065, + "mean_token_accuracy": 0.5857808589935303, + "num_tokens": 37935309.0, + "step": 17834 + }, + { + "epoch": 4.8228772309356405, + "grad_norm": 0.8397193551063538, + "learning_rate": 2.059352532642454e-06, + "loss": 1.7558, + "mean_token_accuracy": 0.5986237525939941, + "num_tokens": 38459421.0, + "step": 17835 + }, + { + "epoch": 4.823147647376961, + "grad_norm": 0.7896079421043396, + "learning_rate": 2.059171916114519e-06, + "loss": 1.8116, + "mean_token_accuracy": 0.5826574563980103, + "num_tokens": 38983691.0, + "step": 17836 + }, + { + "epoch": 4.823418063818281, + "grad_norm": 0.7437701225280762, + "learning_rate": 2.0589915739174836e-06, + "loss": 1.7924, + "mean_token_accuracy": 0.5794007182121277, + "num_tokens": 39502039.0, + "step": 17837 + }, + { + "epoch": 4.8236884802596, + "grad_norm": 1.097977638244629, + "learning_rate": 2.05881150605688e-06, + "loss": 1.868, + "mean_token_accuracy": 0.5418506860733032, + "num_tokens": 39982289.0, + "step": 17838 + }, + { + "epoch": 4.823958896700919, + "grad_norm": 0.9071025252342224, + "learning_rate": 2.0586317125382347e-06, + "loss": 1.8347, + "mean_token_accuracy": 0.5841198563575745, + "num_tokens": 40448564.0, + "step": 17839 + }, + { + "epoch": 4.824229313142239, + "grad_norm": 0.7917447090148926, + "learning_rate": 2.0584521933670625e-06, + "loss": 1.8009, + "mean_token_accuracy": 0.5749942660331726, + "num_tokens": 40972817.0, + "step": 17840 + }, + { + "epoch": 4.824499729583558, + "grad_norm": 0.3917776346206665, + "learning_rate": 2.0582729485488722e-06, + "loss": 1.1385, + "mean_token_accuracy": 0.6729041337966919, + "num_tokens": 41496997.0, + "step": 17841 + }, + { + "epoch": 4.824770146024878, + "grad_norm": 0.922300398349762, + "learning_rate": 2.0580939780891644e-06, + "loss": 1.7435, + "mean_token_accuracy": 0.5983868837356567, + "num_tokens": 42021217.0, + "step": 17842 + }, + { + "epoch": 4.825040562466198, + "grad_norm": 0.8881902098655701, + "learning_rate": 2.0579152819934295e-06, + "loss": 1.8319, + "mean_token_accuracy": 0.5836883783340454, + "num_tokens": 42488889.0, + "step": 17843 + }, + { + "epoch": 4.825310978907518, + "grad_norm": 0.865085244178772, + "learning_rate": 2.057736860267151e-06, + "loss": 1.9104, + "mean_token_accuracy": 0.5651012659072876, + "num_tokens": 42958757.0, + "step": 17844 + }, + { + "epoch": 4.825581395348837, + "grad_norm": 0.901170015335083, + "learning_rate": 2.0575587129158024e-06, + "loss": 1.6745, + "mean_token_accuracy": 0.6351086497306824, + "num_tokens": 43394030.0, + "step": 17845 + }, + { + "epoch": 4.825851811790157, + "grad_norm": 0.8324592709541321, + "learning_rate": 2.0573808399448505e-06, + "loss": 1.8128, + "mean_token_accuracy": 0.5825671553611755, + "num_tokens": 43880462.0, + "step": 17846 + }, + { + "epoch": 4.826122228231476, + "grad_norm": 0.993244469165802, + "learning_rate": 2.057203241359754e-06, + "loss": 1.8853, + "mean_token_accuracy": 0.5734164118766785, + "num_tokens": 44404736.0, + "step": 17847 + }, + { + "epoch": 4.826392644672796, + "grad_norm": 0.9439274668693542, + "learning_rate": 2.0570259171659603e-06, + "loss": 1.9704, + "mean_token_accuracy": 0.5512962937355042, + "num_tokens": 44929016.0, + "step": 17848 + }, + { + "epoch": 4.8266630611141155, + "grad_norm": 0.9573772549629211, + "learning_rate": 2.056848867368911e-06, + "loss": 1.8778, + "mean_token_accuracy": 0.568654477596283, + "num_tokens": 45453199.0, + "step": 17849 + }, + { + "epoch": 4.826933477555436, + "grad_norm": 0.8227483034133911, + "learning_rate": 2.0566720919740386e-06, + "loss": 1.8499, + "mean_token_accuracy": 0.564471960067749, + "num_tokens": 45973626.0, + "step": 17850 + }, + { + "epoch": 4.827203893996755, + "grad_norm": 0.8119732737541199, + "learning_rate": 2.056495590986767e-06, + "loss": 1.7729, + "mean_token_accuracy": 0.596940279006958, + "num_tokens": 46497879.0, + "step": 17851 + }, + { + "epoch": 4.827474310438075, + "grad_norm": 0.9883779287338257, + "learning_rate": 2.056319364412512e-06, + "loss": 1.8987, + "mean_token_accuracy": 0.5349928140640259, + "num_tokens": 47022149.0, + "step": 17852 + }, + { + "epoch": 4.827744726879394, + "grad_norm": 0.8330939412117004, + "learning_rate": 2.0561434122566807e-06, + "loss": 1.7889, + "mean_token_accuracy": 0.5963020324707031, + "num_tokens": 47546394.0, + "step": 17853 + }, + { + "epoch": 4.828015143320714, + "grad_norm": 0.7772514224052429, + "learning_rate": 2.055967734524672e-06, + "loss": 1.8739, + "mean_token_accuracy": 0.5707869529724121, + "num_tokens": 48062093.0, + "step": 17854 + }, + { + "epoch": 4.828285559762033, + "grad_norm": 0.9337877035140991, + "learning_rate": 2.055792331221875e-06, + "loss": 1.8164, + "mean_token_accuracy": 0.5660062432289124, + "num_tokens": 48586185.0, + "step": 17855 + }, + { + "epoch": 4.828555976203353, + "grad_norm": 0.7605624198913574, + "learning_rate": 2.0556172023536733e-06, + "loss": 1.7641, + "mean_token_accuracy": 0.5783151388168335, + "num_tokens": 49110460.0, + "step": 17856 + }, + { + "epoch": 4.828826392644673, + "grad_norm": 0.8370140790939331, + "learning_rate": 2.055442347925439e-06, + "loss": 1.7955, + "mean_token_accuracy": 0.5921759605407715, + "num_tokens": 49595768.0, + "step": 17857 + }, + { + "epoch": 4.829096809085993, + "grad_norm": 0.8120623826980591, + "learning_rate": 2.0552677679425377e-06, + "loss": 1.8474, + "mean_token_accuracy": 0.569746732711792, + "num_tokens": 50119908.0, + "step": 17858 + }, + { + "epoch": 4.829367225527312, + "grad_norm": 0.8890678286552429, + "learning_rate": 2.055093462410326e-06, + "loss": 1.8612, + "mean_token_accuracy": 0.5741932392120361, + "num_tokens": 50634307.0, + "step": 17859 + }, + { + "epoch": 4.829637641968632, + "grad_norm": 0.702181875705719, + "learning_rate": 2.054919431334152e-06, + "loss": 1.7305, + "mean_token_accuracy": 0.5958869457244873, + "num_tokens": 51158544.0, + "step": 17860 + }, + { + "epoch": 4.829908058409951, + "grad_norm": 0.3454679250717163, + "learning_rate": 2.0547456747193557e-06, + "loss": 1.1123, + "mean_token_accuracy": 0.7077183723449707, + "num_tokens": 51682675.0, + "step": 17861 + }, + { + "epoch": 4.830178474851271, + "grad_norm": 0.9904570579528809, + "learning_rate": 2.054572192571269e-06, + "loss": 1.8683, + "mean_token_accuracy": 0.5783966779708862, + "num_tokens": 52193573.0, + "step": 17862 + }, + { + "epoch": 4.8304488912925905, + "grad_norm": 1.0143922567367554, + "learning_rate": 2.0543989848952138e-06, + "loss": 1.8487, + "mean_token_accuracy": 0.5869555473327637, + "num_tokens": 52665935.0, + "step": 17863 + }, + { + "epoch": 4.830719307733911, + "grad_norm": 0.7908731698989868, + "learning_rate": 2.0542260516965045e-06, + "loss": 1.81, + "mean_token_accuracy": 0.5852434635162354, + "num_tokens": 53190213.0, + "step": 17864 + }, + { + "epoch": 4.83098972417523, + "grad_norm": 0.8673394322395325, + "learning_rate": 2.054053392980448e-06, + "loss": 1.7937, + "mean_token_accuracy": 0.5872490406036377, + "num_tokens": 53694142.0, + "step": 17865 + }, + { + "epoch": 4.83126014061655, + "grad_norm": 0.7993005514144897, + "learning_rate": 2.0538810087523424e-06, + "loss": 1.892, + "mean_token_accuracy": 0.5723079442977905, + "num_tokens": 54218412.0, + "step": 17866 + }, + { + "epoch": 4.831530557057869, + "grad_norm": 0.9027461409568787, + "learning_rate": 2.0537088990174765e-06, + "loss": 1.7488, + "mean_token_accuracy": 0.5820269584655762, + "num_tokens": 54742663.0, + "step": 17867 + }, + { + "epoch": 4.831800973499189, + "grad_norm": 0.9493717551231384, + "learning_rate": 2.05353706378113e-06, + "loss": 1.901, + "mean_token_accuracy": 0.5475107431411743, + "num_tokens": 55266942.0, + "step": 17868 + }, + { + "epoch": 4.832071389940508, + "grad_norm": 0.8714594841003418, + "learning_rate": 2.053365503048576e-06, + "loss": 1.8342, + "mean_token_accuracy": 0.5725369453430176, + "num_tokens": 55755088.0, + "step": 17869 + }, + { + "epoch": 4.832341806381828, + "grad_norm": 0.8054800033569336, + "learning_rate": 2.053194216825079e-06, + "loss": 1.8035, + "mean_token_accuracy": 0.5972549915313721, + "num_tokens": 56232538.0, + "step": 17870 + }, + { + "epoch": 4.832612222823148, + "grad_norm": 0.9405171871185303, + "learning_rate": 2.0530232051158947e-06, + "loss": 1.7494, + "mean_token_accuracy": 0.584118127822876, + "num_tokens": 56756765.0, + "step": 17871 + }, + { + "epoch": 4.832882639264467, + "grad_norm": 1.0224225521087646, + "learning_rate": 2.0528524679262703e-06, + "loss": 1.8955, + "mean_token_accuracy": 0.5675230026245117, + "num_tokens": 57199180.0, + "step": 17872 + }, + { + "epoch": 4.833153055705787, + "grad_norm": 1.2656030654907227, + "learning_rate": 2.0526820052614428e-06, + "loss": 1.9053, + "mean_token_accuracy": 0.5598186254501343, + "num_tokens": 57723298.0, + "step": 17873 + }, + { + "epoch": 4.833423472147107, + "grad_norm": 0.9479761123657227, + "learning_rate": 2.0525118171266435e-06, + "loss": 1.8473, + "mean_token_accuracy": 0.5721664428710938, + "num_tokens": 58247568.0, + "step": 17874 + }, + { + "epoch": 4.833693888588426, + "grad_norm": 0.8768832683563232, + "learning_rate": 2.0523419035270954e-06, + "loss": 1.8093, + "mean_token_accuracy": 0.552298903465271, + "num_tokens": 58771784.0, + "step": 17875 + }, + { + "epoch": 4.833964305029745, + "grad_norm": 0.6752076148986816, + "learning_rate": 2.0521722644680113e-06, + "loss": 1.6529, + "mean_token_accuracy": 0.6109707355499268, + "num_tokens": 59295969.0, + "step": 17876 + }, + { + "epoch": 4.8342347214710655, + "grad_norm": 0.761396050453186, + "learning_rate": 2.0520028999545956e-06, + "loss": 1.8175, + "mean_token_accuracy": 0.5805057883262634, + "num_tokens": 59820161.0, + "step": 17877 + }, + { + "epoch": 4.834505137912386, + "grad_norm": 0.7931613922119141, + "learning_rate": 2.0518338099920447e-06, + "loss": 1.709, + "mean_token_accuracy": 0.5893477201461792, + "num_tokens": 60344345.0, + "step": 17878 + }, + { + "epoch": 4.834775554353705, + "grad_norm": 0.9284878373146057, + "learning_rate": 2.0516649945855484e-06, + "loss": 1.8771, + "mean_token_accuracy": 0.577766478061676, + "num_tokens": 60864307.0, + "step": 17879 + }, + { + "epoch": 4.835045970795024, + "grad_norm": 0.8371443152427673, + "learning_rate": 2.051496453740284e-06, + "loss": 1.8522, + "mean_token_accuracy": 0.5793983936309814, + "num_tokens": 61388574.0, + "step": 17880 + }, + { + "epoch": 4.835316387236344, + "grad_norm": 0.35538333654403687, + "learning_rate": 2.0513281874614255e-06, + "loss": 1.0863, + "mean_token_accuracy": 0.7139934301376343, + "num_tokens": 61912735.0, + "step": 17881 + }, + { + "epoch": 4.835586803677663, + "grad_norm": 1.0168274641036987, + "learning_rate": 2.0511601957541347e-06, + "loss": 1.9936, + "mean_token_accuracy": 0.5631417632102966, + "num_tokens": 62436941.0, + "step": 17882 + }, + { + "epoch": 4.835857220118983, + "grad_norm": 0.9069022536277771, + "learning_rate": 2.050992478623565e-06, + "loss": 1.838, + "mean_token_accuracy": 0.5917013883590698, + "num_tokens": 62919993.0, + "step": 17883 + }, + { + "epoch": 4.8361276365603025, + "grad_norm": 0.8802757263183594, + "learning_rate": 2.0508250360748634e-06, + "loss": 1.7633, + "mean_token_accuracy": 0.6154153943061829, + "num_tokens": 63396999.0, + "step": 17884 + }, + { + "epoch": 4.836398053001623, + "grad_norm": 0.8150847554206848, + "learning_rate": 2.0506578681131684e-06, + "loss": 1.7546, + "mean_token_accuracy": 0.6005565524101257, + "num_tokens": 63921120.0, + "step": 17885 + }, + { + "epoch": 4.836668469442942, + "grad_norm": 0.7498021125793457, + "learning_rate": 2.050490974743607e-06, + "loss": 1.8289, + "mean_token_accuracy": 0.5821468830108643, + "num_tokens": 64445394.0, + "step": 17886 + }, + { + "epoch": 4.836938885884262, + "grad_norm": 0.7928943037986755, + "learning_rate": 2.050324355971303e-06, + "loss": 1.8018, + "mean_token_accuracy": 0.5770774483680725, + "num_tokens": 64969624.0, + "step": 17887 + }, + { + "epoch": 4.837209302325581, + "grad_norm": 0.7142485976219177, + "learning_rate": 2.0501580118013653e-06, + "loss": 1.7116, + "mean_token_accuracy": 0.6037685871124268, + "num_tokens": 65493832.0, + "step": 17888 + }, + { + "epoch": 4.837479718766901, + "grad_norm": 0.8468770384788513, + "learning_rate": 2.0499919422388998e-06, + "loss": 1.8588, + "mean_token_accuracy": 0.5758118629455566, + "num_tokens": 66018044.0, + "step": 17889 + }, + { + "epoch": 4.83775013520822, + "grad_norm": 0.7470723986625671, + "learning_rate": 2.049826147289003e-06, + "loss": 1.8018, + "mean_token_accuracy": 0.5925965309143066, + "num_tokens": 66484098.0, + "step": 17890 + }, + { + "epoch": 4.8380205516495405, + "grad_norm": 0.6807780265808105, + "learning_rate": 2.0496606269567603e-06, + "loss": 1.6918, + "mean_token_accuracy": 0.5942500233650208, + "num_tokens": 67008260.0, + "step": 17891 + }, + { + "epoch": 4.83829096809086, + "grad_norm": 0.7855170369148254, + "learning_rate": 2.0494953812472494e-06, + "loss": 1.7561, + "mean_token_accuracy": 0.5784394145011902, + "num_tokens": 67517081.0, + "step": 17892 + }, + { + "epoch": 4.83856138453218, + "grad_norm": 0.7726613879203796, + "learning_rate": 2.049330410165544e-06, + "loss": 1.806, + "mean_token_accuracy": 0.5789781808853149, + "num_tokens": 68041169.0, + "step": 17893 + }, + { + "epoch": 4.838831800973499, + "grad_norm": 0.9035853743553162, + "learning_rate": 2.0491657137167036e-06, + "loss": 1.8105, + "mean_token_accuracy": 0.5785561800003052, + "num_tokens": 68467776.0, + "step": 17894 + }, + { + "epoch": 4.839102217414819, + "grad_norm": 0.7903453707695007, + "learning_rate": 2.04900129190578e-06, + "loss": 1.8567, + "mean_token_accuracy": 0.5733528733253479, + "num_tokens": 68991910.0, + "step": 17895 + }, + { + "epoch": 4.839372633856138, + "grad_norm": 0.8655010461807251, + "learning_rate": 2.048837144737822e-06, + "loss": 1.7772, + "mean_token_accuracy": 0.5731933116912842, + "num_tokens": 69516175.0, + "step": 17896 + }, + { + "epoch": 4.839643050297458, + "grad_norm": 0.8011749386787415, + "learning_rate": 2.048673272217862e-06, + "loss": 1.7517, + "mean_token_accuracy": 0.5764912366867065, + "num_tokens": 70040346.0, + "step": 17897 + }, + { + "epoch": 4.8399134667387775, + "grad_norm": 0.8913147449493408, + "learning_rate": 2.0485096743509318e-06, + "loss": 1.7552, + "mean_token_accuracy": 0.5776476860046387, + "num_tokens": 70564603.0, + "step": 17898 + }, + { + "epoch": 4.840183883180098, + "grad_norm": 0.9510103464126587, + "learning_rate": 2.0483463511420483e-06, + "loss": 1.7503, + "mean_token_accuracy": 0.5835246443748474, + "num_tokens": 71088792.0, + "step": 17899 + }, + { + "epoch": 4.840454299621417, + "grad_norm": 1.0232281684875488, + "learning_rate": 2.0481833025962247e-06, + "loss": 1.7572, + "mean_token_accuracy": 0.5899641513824463, + "num_tokens": 71612967.0, + "step": 17900 + }, + { + "epoch": 4.840724716062737, + "grad_norm": 0.3669900596141815, + "learning_rate": 2.048020528718462e-06, + "loss": 1.056, + "mean_token_accuracy": 0.7089075446128845, + "num_tokens": 72097403.0, + "step": 17901 + }, + { + "epoch": 4.840995132504056, + "grad_norm": 0.8206305503845215, + "learning_rate": 2.047858029513757e-06, + "loss": 1.6879, + "mean_token_accuracy": 0.5970131754875183, + "num_tokens": 72621623.0, + "step": 17902 + }, + { + "epoch": 4.841265548945376, + "grad_norm": 1.0090144872665405, + "learning_rate": 2.0476958049870927e-06, + "loss": 1.8623, + "mean_token_accuracy": 0.5608409643173218, + "num_tokens": 73145794.0, + "step": 17903 + }, + { + "epoch": 4.841535965386695, + "grad_norm": 1.0141096115112305, + "learning_rate": 2.0475338551434484e-06, + "loss": 1.8217, + "mean_token_accuracy": 0.590660572052002, + "num_tokens": 73625458.0, + "step": 17904 + }, + { + "epoch": 4.8418063818280155, + "grad_norm": 0.9317888617515564, + "learning_rate": 2.0473721799877942e-06, + "loss": 1.8298, + "mean_token_accuracy": 0.562640368938446, + "num_tokens": 74149702.0, + "step": 17905 + }, + { + "epoch": 4.842076798269335, + "grad_norm": 0.7870644330978394, + "learning_rate": 2.0472107795250883e-06, + "loss": 1.7301, + "mean_token_accuracy": 0.5923366546630859, + "num_tokens": 74652819.0, + "step": 17906 + }, + { + "epoch": 4.842347214710655, + "grad_norm": 0.7324851751327515, + "learning_rate": 2.0470496537602837e-06, + "loss": 1.8053, + "mean_token_accuracy": 0.5635979175567627, + "num_tokens": 75177051.0, + "step": 17907 + }, + { + "epoch": 4.842617631151974, + "grad_norm": 0.8092599511146545, + "learning_rate": 2.046888802698326e-06, + "loss": 1.8639, + "mean_token_accuracy": 0.5714457035064697, + "num_tokens": 75701222.0, + "step": 17908 + }, + { + "epoch": 4.842888047593294, + "grad_norm": 0.9329092502593994, + "learning_rate": 2.0467282263441486e-06, + "loss": 1.8069, + "mean_token_accuracy": 0.585755467414856, + "num_tokens": 76196471.0, + "step": 17909 + }, + { + "epoch": 4.843158464034613, + "grad_norm": 0.9001977443695068, + "learning_rate": 2.0465679247026788e-06, + "loss": 1.8398, + "mean_token_accuracy": 0.5891731381416321, + "num_tokens": 76720586.0, + "step": 17910 + }, + { + "epoch": 4.843428880475933, + "grad_norm": 0.8700833916664124, + "learning_rate": 2.0464078977788355e-06, + "loss": 1.8508, + "mean_token_accuracy": 0.585256040096283, + "num_tokens": 77244868.0, + "step": 17911 + }, + { + "epoch": 4.8436992969172525, + "grad_norm": 0.7941433787345886, + "learning_rate": 2.046248145577529e-06, + "loss": 1.8586, + "mean_token_accuracy": 0.5657486319541931, + "num_tokens": 77769087.0, + "step": 17912 + }, + { + "epoch": 4.843969713358572, + "grad_norm": 0.8300745487213135, + "learning_rate": 2.04608866810366e-06, + "loss": 1.6703, + "mean_token_accuracy": 0.6013076305389404, + "num_tokens": 78293219.0, + "step": 17913 + }, + { + "epoch": 4.844240129799892, + "grad_norm": 0.9488435387611389, + "learning_rate": 2.0459294653621233e-06, + "loss": 1.8292, + "mean_token_accuracy": 0.5694447755813599, + "num_tokens": 78817442.0, + "step": 17914 + }, + { + "epoch": 4.844510546241212, + "grad_norm": 0.9411404132843018, + "learning_rate": 2.0457705373578025e-06, + "loss": 1.7497, + "mean_token_accuracy": 0.5887404680252075, + "num_tokens": 79341420.0, + "step": 17915 + }, + { + "epoch": 4.844780962682531, + "grad_norm": 1.0254119634628296, + "learning_rate": 2.045611884095574e-06, + "loss": 1.9154, + "mean_token_accuracy": 0.5436195135116577, + "num_tokens": 79865662.0, + "step": 17916 + }, + { + "epoch": 4.84505137912385, + "grad_norm": 0.997600257396698, + "learning_rate": 2.0454535055803065e-06, + "loss": 1.7872, + "mean_token_accuracy": 0.5710060596466064, + "num_tokens": 80389938.0, + "step": 17917 + }, + { + "epoch": 4.84532179556517, + "grad_norm": 0.7754337787628174, + "learning_rate": 2.045295401816858e-06, + "loss": 1.8302, + "mean_token_accuracy": 0.561997652053833, + "num_tokens": 80914208.0, + "step": 17918 + }, + { + "epoch": 4.8455922120064905, + "grad_norm": 0.8693861961364746, + "learning_rate": 2.045137572810081e-06, + "loss": 1.7632, + "mean_token_accuracy": 0.5804405212402344, + "num_tokens": 81438329.0, + "step": 17919 + }, + { + "epoch": 4.84586262844781, + "grad_norm": 0.7832448482513428, + "learning_rate": 2.044980018564819e-06, + "loss": 1.8223, + "mean_token_accuracy": 0.577734649181366, + "num_tokens": 81962581.0, + "step": 17920 + }, + { + "epoch": 4.846133044889129, + "grad_norm": 0.3661886155605316, + "learning_rate": 2.0448227390859043e-06, + "loss": 1.084, + "mean_token_accuracy": 0.7104398012161255, + "num_tokens": 82486682.0, + "step": 17921 + }, + { + "epoch": 4.846403461330449, + "grad_norm": 0.9487103223800659, + "learning_rate": 2.0446657343781635e-06, + "loss": 1.6719, + "mean_token_accuracy": 0.6087877154350281, + "num_tokens": 83010934.0, + "step": 17922 + }, + { + "epoch": 4.846673877771768, + "grad_norm": 0.8579503297805786, + "learning_rate": 2.0445090044464136e-06, + "loss": 1.8254, + "mean_token_accuracy": 0.5776270627975464, + "num_tokens": 83535197.0, + "step": 17923 + }, + { + "epoch": 4.846944294213088, + "grad_norm": 0.8613488078117371, + "learning_rate": 2.0443525492954637e-06, + "loss": 1.7521, + "mean_token_accuracy": 0.5741509795188904, + "num_tokens": 84059435.0, + "step": 17924 + }, + { + "epoch": 4.8472147106544075, + "grad_norm": 0.7571828365325928, + "learning_rate": 2.0441963689301144e-06, + "loss": 1.5951, + "mean_token_accuracy": 0.6332684755325317, + "num_tokens": 84523229.0, + "step": 17925 + }, + { + "epoch": 4.8474851270957275, + "grad_norm": 0.9079188108444214, + "learning_rate": 2.044040463355158e-06, + "loss": 1.8559, + "mean_token_accuracy": 0.5892125964164734, + "num_tokens": 85033862.0, + "step": 17926 + }, + { + "epoch": 4.847755543537047, + "grad_norm": 1.1004455089569092, + "learning_rate": 2.043884832575379e-06, + "loss": 1.8153, + "mean_token_accuracy": 0.5795890092849731, + "num_tokens": 85546740.0, + "step": 17927 + }, + { + "epoch": 4.848025959978367, + "grad_norm": 0.7810291647911072, + "learning_rate": 2.04372947659555e-06, + "loss": 1.8136, + "mean_token_accuracy": 0.5916628837585449, + "num_tokens": 86039363.0, + "step": 17928 + }, + { + "epoch": 4.848296376419686, + "grad_norm": 0.8301800489425659, + "learning_rate": 2.0435743954204394e-06, + "loss": 1.9644, + "mean_token_accuracy": 0.5497032403945923, + "num_tokens": 86563590.0, + "step": 17929 + }, + { + "epoch": 4.848566792861006, + "grad_norm": 0.7580090761184692, + "learning_rate": 2.043419589054807e-06, + "loss": 1.9159, + "mean_token_accuracy": 0.5574823021888733, + "num_tokens": 87087792.0, + "step": 17930 + }, + { + "epoch": 4.848837209302325, + "grad_norm": 0.8919731378555298, + "learning_rate": 2.0432650575034e-06, + "loss": 1.7862, + "mean_token_accuracy": 0.5839410424232483, + "num_tokens": 87611940.0, + "step": 17931 + }, + { + "epoch": 4.849107625743645, + "grad_norm": 0.8686984777450562, + "learning_rate": 2.0431108007709603e-06, + "loss": 1.811, + "mean_token_accuracy": 0.5893657207489014, + "num_tokens": 88105933.0, + "step": 17932 + }, + { + "epoch": 4.849378042184965, + "grad_norm": 0.8654731512069702, + "learning_rate": 2.042956818862223e-06, + "loss": 1.7903, + "mean_token_accuracy": 0.5927045941352844, + "num_tokens": 88569752.0, + "step": 17933 + }, + { + "epoch": 4.849648458626285, + "grad_norm": 0.7844717502593994, + "learning_rate": 2.042803111781911e-06, + "loss": 1.7437, + "mean_token_accuracy": 0.5837873220443726, + "num_tokens": 89068620.0, + "step": 17934 + }, + { + "epoch": 4.849918875067604, + "grad_norm": 0.8513161540031433, + "learning_rate": 2.0426496795347417e-06, + "loss": 1.7655, + "mean_token_accuracy": 0.609140932559967, + "num_tokens": 89454105.0, + "step": 17935 + }, + { + "epoch": 4.850189291508924, + "grad_norm": 0.7984281778335571, + "learning_rate": 2.0424965221254217e-06, + "loss": 1.559, + "mean_token_accuracy": 0.634932816028595, + "num_tokens": 89978248.0, + "step": 17936 + }, + { + "epoch": 4.850459707950243, + "grad_norm": 0.8443214893341064, + "learning_rate": 2.0423436395586503e-06, + "loss": 1.9037, + "mean_token_accuracy": 0.5478270053863525, + "num_tokens": 90502458.0, + "step": 17937 + }, + { + "epoch": 4.850730124391563, + "grad_norm": 0.7959319949150085, + "learning_rate": 2.042191031839119e-06, + "loss": 1.7785, + "mean_token_accuracy": 0.5772024393081665, + "num_tokens": 91026727.0, + "step": 17938 + }, + { + "epoch": 4.8510005408328825, + "grad_norm": 0.7284889817237854, + "learning_rate": 2.04203869897151e-06, + "loss": 1.7641, + "mean_token_accuracy": 0.593874454498291, + "num_tokens": 91550938.0, + "step": 17939 + }, + { + "epoch": 4.8512709572742025, + "grad_norm": 0.8229451179504395, + "learning_rate": 2.0418866409604974e-06, + "loss": 1.7591, + "mean_token_accuracy": 0.590317964553833, + "num_tokens": 92043266.0, + "step": 17940 + }, + { + "epoch": 4.851541373715522, + "grad_norm": 0.32749906182289124, + "learning_rate": 2.0417348578107465e-06, + "loss": 1.1039, + "mean_token_accuracy": 0.7058568596839905, + "num_tokens": 92567486.0, + "step": 17941 + }, + { + "epoch": 4.851811790156842, + "grad_norm": 0.757477343082428, + "learning_rate": 2.0415833495269148e-06, + "loss": 1.8777, + "mean_token_accuracy": 0.5618323087692261, + "num_tokens": 93091687.0, + "step": 17942 + }, + { + "epoch": 4.852082206598161, + "grad_norm": 0.6839975714683533, + "learning_rate": 2.041432116113651e-06, + "loss": 1.8664, + "mean_token_accuracy": 0.577562153339386, + "num_tokens": 93615953.0, + "step": 17943 + }, + { + "epoch": 4.852352623039481, + "grad_norm": 0.8514991998672485, + "learning_rate": 2.0412811575755953e-06, + "loss": 1.7795, + "mean_token_accuracy": 0.5845649242401123, + "num_tokens": 94140094.0, + "step": 17944 + }, + { + "epoch": 4.8526230394808, + "grad_norm": 0.8597790002822876, + "learning_rate": 2.041130473917379e-06, + "loss": 1.8454, + "mean_token_accuracy": 0.587973952293396, + "num_tokens": 94593294.0, + "step": 17945 + }, + { + "epoch": 4.85289345592212, + "grad_norm": 0.7181755304336548, + "learning_rate": 2.040980065143627e-06, + "loss": 1.9853, + "mean_token_accuracy": 0.5410503149032593, + "num_tokens": 95117510.0, + "step": 17946 + }, + { + "epoch": 4.85316387236344, + "grad_norm": 0.8029006719589233, + "learning_rate": 2.040829931258953e-06, + "loss": 1.8323, + "mean_token_accuracy": 0.5901456475257874, + "num_tokens": 95641788.0, + "step": 17947 + }, + { + "epoch": 4.85343428880476, + "grad_norm": 0.8591635823249817, + "learning_rate": 2.0406800722679647e-06, + "loss": 1.917, + "mean_token_accuracy": 0.5496066808700562, + "num_tokens": 96165844.0, + "step": 17948 + }, + { + "epoch": 4.853704705246079, + "grad_norm": 0.7474010586738586, + "learning_rate": 2.040530488175258e-06, + "loss": 1.7894, + "mean_token_accuracy": 0.5753828883171082, + "num_tokens": 96690051.0, + "step": 17949 + }, + { + "epoch": 4.853975121687399, + "grad_norm": 0.7275130152702332, + "learning_rate": 2.040381178985425e-06, + "loss": 1.7232, + "mean_token_accuracy": 0.609782338142395, + "num_tokens": 97214285.0, + "step": 17950 + }, + { + "epoch": 4.854245538128718, + "grad_norm": 0.7455328702926636, + "learning_rate": 2.0402321447030463e-06, + "loss": 1.7982, + "mean_token_accuracy": 0.5800762176513672, + "num_tokens": 97738463.0, + "step": 17951 + }, + { + "epoch": 4.854515954570038, + "grad_norm": 0.7725681066513062, + "learning_rate": 2.0400833853326933e-06, + "loss": 1.6706, + "mean_token_accuracy": 0.5841872692108154, + "num_tokens": 98262688.0, + "step": 17952 + }, + { + "epoch": 4.8547863710113575, + "grad_norm": 0.8012265563011169, + "learning_rate": 2.039934900878932e-06, + "loss": 1.7385, + "mean_token_accuracy": 0.5811606049537659, + "num_tokens": 98786887.0, + "step": 17953 + }, + { + "epoch": 4.855056787452677, + "grad_norm": 0.963444173336029, + "learning_rate": 2.039786691346317e-06, + "loss": 1.8754, + "mean_token_accuracy": 0.5620235204696655, + "num_tokens": 99311151.0, + "step": 17954 + }, + { + "epoch": 4.855327203893997, + "grad_norm": 0.7916472554206848, + "learning_rate": 2.039638756739398e-06, + "loss": 1.7597, + "mean_token_accuracy": 0.5757301449775696, + "num_tokens": 99835389.0, + "step": 17955 + }, + { + "epoch": 4.855597620335317, + "grad_norm": 0.7559778094291687, + "learning_rate": 2.039491097062711e-06, + "loss": 1.7238, + "mean_token_accuracy": 0.5773391723632812, + "num_tokens": 100359552.0, + "step": 17956 + }, + { + "epoch": 4.855868036776636, + "grad_norm": 0.7759124040603638, + "learning_rate": 2.03934371232079e-06, + "loss": 1.7462, + "mean_token_accuracy": 0.584621787071228, + "num_tokens": 100883582.0, + "step": 17957 + }, + { + "epoch": 4.856138453217955, + "grad_norm": 0.8062729835510254, + "learning_rate": 2.0391966025181547e-06, + "loss": 1.8671, + "mean_token_accuracy": 0.5797634124755859, + "num_tokens": 101407818.0, + "step": 17958 + }, + { + "epoch": 4.856408869659275, + "grad_norm": 0.8679569363594055, + "learning_rate": 2.0390497676593187e-06, + "loss": 1.7527, + "mean_token_accuracy": 0.597278892993927, + "num_tokens": 101931787.0, + "step": 17959 + }, + { + "epoch": 4.856679286100595, + "grad_norm": 0.8287252187728882, + "learning_rate": 2.038903207748789e-06, + "loss": 1.7619, + "mean_token_accuracy": 0.5821102857589722, + "num_tokens": 102455890.0, + "step": 17960 + }, + { + "epoch": 4.856949702541915, + "grad_norm": 0.32593658566474915, + "learning_rate": 2.0387569227910615e-06, + "loss": 1.0162, + "mean_token_accuracy": 0.7273300886154175, + "num_tokens": 102980174.0, + "step": 17961 + }, + { + "epoch": 4.857220118983234, + "grad_norm": 0.9326741099357605, + "learning_rate": 2.0386109127906257e-06, + "loss": 1.8465, + "mean_token_accuracy": 0.5826146602630615, + "num_tokens": 103504313.0, + "step": 17962 + }, + { + "epoch": 4.857490535424554, + "grad_norm": 1.0156424045562744, + "learning_rate": 2.0384651777519605e-06, + "loss": 1.8425, + "mean_token_accuracy": 0.5844738483428955, + "num_tokens": 104028398.0, + "step": 17963 + }, + { + "epoch": 4.857760951865873, + "grad_norm": 0.933996319770813, + "learning_rate": 2.0383197176795376e-06, + "loss": 1.6348, + "mean_token_accuracy": 0.6123173832893372, + "num_tokens": 104515115.0, + "step": 17964 + }, + { + "epoch": 4.858031368307193, + "grad_norm": 0.7800469398498535, + "learning_rate": 2.0381745325778206e-06, + "loss": 1.8074, + "mean_token_accuracy": 0.5701608657836914, + "num_tokens": 105039315.0, + "step": 17965 + }, + { + "epoch": 4.858301784748512, + "grad_norm": 0.8510506749153137, + "learning_rate": 2.0380296224512643e-06, + "loss": 1.9119, + "mean_token_accuracy": 0.5842993259429932, + "num_tokens": 105517312.0, + "step": 17966 + }, + { + "epoch": 4.8585722011898325, + "grad_norm": 0.7599503397941589, + "learning_rate": 2.0378849873043134e-06, + "loss": 1.8902, + "mean_token_accuracy": 0.5671112537384033, + "num_tokens": 106041557.0, + "step": 17967 + }, + { + "epoch": 4.858842617631152, + "grad_norm": 0.7712327837944031, + "learning_rate": 2.0377406271414076e-06, + "loss": 1.8192, + "mean_token_accuracy": 0.5805981159210205, + "num_tokens": 106565801.0, + "step": 17968 + }, + { + "epoch": 4.859113034072472, + "grad_norm": 0.8613496422767639, + "learning_rate": 2.0375965419669756e-06, + "loss": 1.9198, + "mean_token_accuracy": 0.5730398297309875, + "num_tokens": 107084316.0, + "step": 17969 + }, + { + "epoch": 4.859383450513791, + "grad_norm": 0.79106605052948, + "learning_rate": 2.0374527317854385e-06, + "loss": 1.7941, + "mean_token_accuracy": 0.5927228927612305, + "num_tokens": 107595119.0, + "step": 17970 + }, + { + "epoch": 4.859653866955111, + "grad_norm": 1.004029631614685, + "learning_rate": 2.0373091966012094e-06, + "loss": 1.8266, + "mean_token_accuracy": 0.5791170597076416, + "num_tokens": 108119245.0, + "step": 17971 + }, + { + "epoch": 4.85992428339643, + "grad_norm": 0.726485013961792, + "learning_rate": 2.0371659364186903e-06, + "loss": 1.8246, + "mean_token_accuracy": 0.575654149055481, + "num_tokens": 108643509.0, + "step": 17972 + }, + { + "epoch": 4.86019469983775, + "grad_norm": 0.7837349772453308, + "learning_rate": 2.0370229512422795e-06, + "loss": 1.692, + "mean_token_accuracy": 0.6103934645652771, + "num_tokens": 109165892.0, + "step": 17973 + }, + { + "epoch": 4.8604651162790695, + "grad_norm": 0.8819977641105652, + "learning_rate": 2.036880241076362e-06, + "loss": 1.7729, + "mean_token_accuracy": 0.5808418393135071, + "num_tokens": 109690084.0, + "step": 17974 + }, + { + "epoch": 4.86073553272039, + "grad_norm": 0.7945743799209595, + "learning_rate": 2.036737805925318e-06, + "loss": 1.7776, + "mean_token_accuracy": 0.5734070539474487, + "num_tokens": 110214290.0, + "step": 17975 + }, + { + "epoch": 4.861005949161709, + "grad_norm": 0.8462726473808289, + "learning_rate": 2.036595645793518e-06, + "loss": 1.6799, + "mean_token_accuracy": 0.6055566668510437, + "num_tokens": 110738505.0, + "step": 17976 + }, + { + "epoch": 4.861276365603029, + "grad_norm": 1.1411834955215454, + "learning_rate": 2.0364537606853224e-06, + "loss": 1.7567, + "mean_token_accuracy": 0.5853710174560547, + "num_tokens": 111217957.0, + "step": 17977 + }, + { + "epoch": 4.861546782044348, + "grad_norm": 0.786077082157135, + "learning_rate": 2.0363121506050853e-06, + "loss": 1.7517, + "mean_token_accuracy": 0.5859274864196777, + "num_tokens": 111742228.0, + "step": 17978 + }, + { + "epoch": 4.861817198485668, + "grad_norm": 0.8422127366065979, + "learning_rate": 2.0361708155571522e-06, + "loss": 1.8822, + "mean_token_accuracy": 0.5624203085899353, + "num_tokens": 112167719.0, + "step": 17979 + }, + { + "epoch": 4.862087614926987, + "grad_norm": 0.8769515752792358, + "learning_rate": 2.0360297555458595e-06, + "loss": 1.7951, + "mean_token_accuracy": 0.5824501514434814, + "num_tokens": 112691933.0, + "step": 17980 + }, + { + "epoch": 4.8623580313683075, + "grad_norm": 0.3313803970813751, + "learning_rate": 2.035888970575535e-06, + "loss": 0.9556, + "mean_token_accuracy": 0.7347396612167358, + "num_tokens": 113207351.0, + "step": 17981 + }, + { + "epoch": 4.862628447809627, + "grad_norm": 0.8179304599761963, + "learning_rate": 2.0357484606504987e-06, + "loss": 1.8003, + "mean_token_accuracy": 0.5798355340957642, + "num_tokens": 113731542.0, + "step": 17982 + }, + { + "epoch": 4.862898864250947, + "grad_norm": 0.8073391318321228, + "learning_rate": 2.035608225775063e-06, + "loss": 1.8432, + "mean_token_accuracy": 0.5825159549713135, + "num_tokens": 114255554.0, + "step": 17983 + }, + { + "epoch": 4.863169280692266, + "grad_norm": 0.8738329410552979, + "learning_rate": 2.0354682659535272e-06, + "loss": 1.8521, + "mean_token_accuracy": 0.5731650590896606, + "num_tokens": 114779715.0, + "step": 17984 + }, + { + "epoch": 4.863439697133586, + "grad_norm": 0.839789092540741, + "learning_rate": 2.0353285811901903e-06, + "loss": 1.8044, + "mean_token_accuracy": 0.5513662099838257, + "num_tokens": 115303679.0, + "step": 17985 + }, + { + "epoch": 4.863710113574905, + "grad_norm": 0.7014415860176086, + "learning_rate": 2.035189171489334e-06, + "loss": 1.6998, + "mean_token_accuracy": 0.5950036644935608, + "num_tokens": 115827957.0, + "step": 17986 + }, + { + "epoch": 4.863980530016225, + "grad_norm": 0.733932375907898, + "learning_rate": 2.035050036855239e-06, + "loss": 1.8625, + "mean_token_accuracy": 0.5764120817184448, + "num_tokens": 116352191.0, + "step": 17987 + }, + { + "epoch": 4.8642509464575445, + "grad_norm": 0.737211287021637, + "learning_rate": 2.034911177292172e-06, + "loss": 1.8338, + "mean_token_accuracy": 0.5691547393798828, + "num_tokens": 116876444.0, + "step": 17988 + }, + { + "epoch": 4.864521362898865, + "grad_norm": 0.8126223087310791, + "learning_rate": 2.0347725928043955e-06, + "loss": 1.8055, + "mean_token_accuracy": 0.5806723833084106, + "num_tokens": 117400692.0, + "step": 17989 + }, + { + "epoch": 4.864791779340184, + "grad_norm": 0.9634086489677429, + "learning_rate": 2.03463428339616e-06, + "loss": 1.7725, + "mean_token_accuracy": 0.6013823747634888, + "num_tokens": 117889685.0, + "step": 17990 + }, + { + "epoch": 4.865062195781504, + "grad_norm": 0.803132176399231, + "learning_rate": 2.0344962490717102e-06, + "loss": 1.7758, + "mean_token_accuracy": 0.588769257068634, + "num_tokens": 118371972.0, + "step": 17991 + }, + { + "epoch": 4.865332612222823, + "grad_norm": 0.8987239003181458, + "learning_rate": 2.034358489835282e-06, + "loss": 1.8288, + "mean_token_accuracy": 0.5829288959503174, + "num_tokens": 118830511.0, + "step": 17992 + }, + { + "epoch": 4.865603028664143, + "grad_norm": 0.7912705540657043, + "learning_rate": 2.034221005691102e-06, + "loss": 1.879, + "mean_token_accuracy": 0.5780662298202515, + "num_tokens": 119354717.0, + "step": 17993 + }, + { + "epoch": 4.865873445105462, + "grad_norm": 0.7843519449234009, + "learning_rate": 2.0340837966433874e-06, + "loss": 1.8685, + "mean_token_accuracy": 0.5797083377838135, + "num_tokens": 119806245.0, + "step": 17994 + }, + { + "epoch": 4.866143861546782, + "grad_norm": 1.0025720596313477, + "learning_rate": 2.0339468626963493e-06, + "loss": 1.9401, + "mean_token_accuracy": 0.5536380410194397, + "num_tokens": 120330511.0, + "step": 17995 + }, + { + "epoch": 4.866414277988102, + "grad_norm": 0.9007235169410706, + "learning_rate": 2.0338102038541884e-06, + "loss": 1.8699, + "mean_token_accuracy": 0.5708020925521851, + "num_tokens": 120854796.0, + "step": 17996 + }, + { + "epoch": 4.866684694429422, + "grad_norm": 0.9317750930786133, + "learning_rate": 2.0336738201210986e-06, + "loss": 1.7937, + "mean_token_accuracy": 0.5818523168563843, + "num_tokens": 121321315.0, + "step": 17997 + }, + { + "epoch": 4.866955110870741, + "grad_norm": 0.8254031538963318, + "learning_rate": 2.0335377115012646e-06, + "loss": 1.7915, + "mean_token_accuracy": 0.550983190536499, + "num_tokens": 121845369.0, + "step": 17998 + }, + { + "epoch": 4.86722552731206, + "grad_norm": 0.8845143914222717, + "learning_rate": 2.0334018779988617e-06, + "loss": 1.6537, + "mean_token_accuracy": 0.6148647665977478, + "num_tokens": 122276369.0, + "step": 17999 + }, + { + "epoch": 4.86749594375338, + "grad_norm": 0.8022642135620117, + "learning_rate": 2.0332663196180583e-06, + "loss": 1.8332, + "mean_token_accuracy": 0.5730745792388916, + "num_tokens": 122800418.0, + "step": 18000 + }, + { + "epoch": 4.8677663601947, + "grad_norm": 0.34093207120895386, + "learning_rate": 2.0331310363630138e-06, + "loss": 1.011, + "mean_token_accuracy": 0.7281209826469421, + "num_tokens": 123324584.0, + "step": 18001 + }, + { + "epoch": 4.8680367766360195, + "grad_norm": 0.8519608974456787, + "learning_rate": 2.0329960282378785e-06, + "loss": 1.8606, + "mean_token_accuracy": 0.5703239440917969, + "num_tokens": 123789862.0, + "step": 18002 + }, + { + "epoch": 4.868307193077339, + "grad_norm": 0.7686076760292053, + "learning_rate": 2.0328612952467965e-06, + "loss": 1.6741, + "mean_token_accuracy": 0.5972103476524353, + "num_tokens": 124314031.0, + "step": 18003 + }, + { + "epoch": 4.868577609518659, + "grad_norm": 0.8128756880760193, + "learning_rate": 2.032726837393899e-06, + "loss": 1.6249, + "mean_token_accuracy": 0.6312422752380371, + "num_tokens": 124838254.0, + "step": 18004 + }, + { + "epoch": 4.868848025959978, + "grad_norm": 0.7685651183128357, + "learning_rate": 2.0325926546833142e-06, + "loss": 1.6819, + "mean_token_accuracy": 0.5962247848510742, + "num_tokens": 125362461.0, + "step": 18005 + }, + { + "epoch": 4.869118442401298, + "grad_norm": 0.9304079413414001, + "learning_rate": 2.032458747119158e-06, + "loss": 1.8161, + "mean_token_accuracy": 0.5727922916412354, + "num_tokens": 125885762.0, + "step": 18006 + }, + { + "epoch": 4.869388858842617, + "grad_norm": 0.8156638741493225, + "learning_rate": 2.0323251147055393e-06, + "loss": 1.7641, + "mean_token_accuracy": 0.5854362845420837, + "num_tokens": 126398480.0, + "step": 18007 + }, + { + "epoch": 4.869659275283937, + "grad_norm": 0.9126051068305969, + "learning_rate": 2.0321917574465586e-06, + "loss": 1.7778, + "mean_token_accuracy": 0.56545490026474, + "num_tokens": 126922605.0, + "step": 18008 + }, + { + "epoch": 4.869929691725257, + "grad_norm": 0.8182246088981628, + "learning_rate": 2.0320586753463066e-06, + "loss": 1.7914, + "mean_token_accuracy": 0.5501883029937744, + "num_tokens": 127446777.0, + "step": 18009 + }, + { + "epoch": 4.870200108166577, + "grad_norm": 0.7843688130378723, + "learning_rate": 2.031925868408868e-06, + "loss": 1.7656, + "mean_token_accuracy": 0.5839092135429382, + "num_tokens": 127970934.0, + "step": 18010 + }, + { + "epoch": 4.870470524607896, + "grad_norm": 0.8125665187835693, + "learning_rate": 2.0317933366383174e-06, + "loss": 1.79, + "mean_token_accuracy": 0.5698060393333435, + "num_tokens": 128495158.0, + "step": 18011 + }, + { + "epoch": 4.870740941049216, + "grad_norm": 0.6857720017433167, + "learning_rate": 2.0316610800387202e-06, + "loss": 1.6713, + "mean_token_accuracy": 0.6076172590255737, + "num_tokens": 129012659.0, + "step": 18012 + }, + { + "epoch": 4.871011357490535, + "grad_norm": 1.0248417854309082, + "learning_rate": 2.031529098614136e-06, + "loss": 1.8401, + "mean_token_accuracy": 0.5879781246185303, + "num_tokens": 129468778.0, + "step": 18013 + }, + { + "epoch": 4.871281773931855, + "grad_norm": 0.853934645652771, + "learning_rate": 2.0313973923686135e-06, + "loss": 1.8199, + "mean_token_accuracy": 0.5774973034858704, + "num_tokens": 129992855.0, + "step": 18014 + }, + { + "epoch": 4.871552190373174, + "grad_norm": 0.8928382992744446, + "learning_rate": 2.031265961306193e-06, + "loss": 1.7307, + "mean_token_accuracy": 0.5872266888618469, + "num_tokens": 130517055.0, + "step": 18015 + }, + { + "epoch": 4.8718226068144945, + "grad_norm": 0.7600048780441284, + "learning_rate": 2.0311348054309084e-06, + "loss": 1.7563, + "mean_token_accuracy": 0.5860975384712219, + "num_tokens": 131041262.0, + "step": 18016 + }, + { + "epoch": 4.872093023255814, + "grad_norm": 0.9820850491523743, + "learning_rate": 2.031003924746784e-06, + "loss": 1.858, + "mean_token_accuracy": 0.5755751132965088, + "num_tokens": 131565517.0, + "step": 18017 + }, + { + "epoch": 4.872363439697134, + "grad_norm": 0.9209731817245483, + "learning_rate": 2.0308733192578353e-06, + "loss": 1.7519, + "mean_token_accuracy": 0.6015522480010986, + "num_tokens": 132089663.0, + "step": 18018 + }, + { + "epoch": 4.872633856138453, + "grad_norm": 0.8539766669273376, + "learning_rate": 2.03074298896807e-06, + "loss": 1.8197, + "mean_token_accuracy": 0.5711874961853027, + "num_tokens": 132613860.0, + "step": 18019 + }, + { + "epoch": 4.872904272579773, + "grad_norm": 0.7245509028434753, + "learning_rate": 2.0306129338814867e-06, + "loss": 1.7768, + "mean_token_accuracy": 0.5831977128982544, + "num_tokens": 133138043.0, + "step": 18020 + }, + { + "epoch": 4.873174689021092, + "grad_norm": 0.3170018494129181, + "learning_rate": 2.0304831540020756e-06, + "loss": 1.0776, + "mean_token_accuracy": 0.7057192325592041, + "num_tokens": 133662163.0, + "step": 18021 + }, + { + "epoch": 4.873445105462412, + "grad_norm": 0.8699237108230591, + "learning_rate": 2.030353649333818e-06, + "loss": 1.8016, + "mean_token_accuracy": 0.5870220065116882, + "num_tokens": 134161362.0, + "step": 18022 + }, + { + "epoch": 4.873715521903732, + "grad_norm": 0.8027691841125488, + "learning_rate": 2.0302244198806896e-06, + "loss": 1.8742, + "mean_token_accuracy": 0.5677652359008789, + "num_tokens": 134685537.0, + "step": 18023 + }, + { + "epoch": 4.873985938345052, + "grad_norm": 0.794016420841217, + "learning_rate": 2.030095465646655e-06, + "loss": 1.7994, + "mean_token_accuracy": 0.5890286564826965, + "num_tokens": 135209697.0, + "step": 18024 + }, + { + "epoch": 4.874256354786371, + "grad_norm": 0.8414022922515869, + "learning_rate": 2.029966786635669e-06, + "loss": 1.9212, + "mean_token_accuracy": 0.5592486262321472, + "num_tokens": 135709582.0, + "step": 18025 + }, + { + "epoch": 4.874526771227691, + "grad_norm": 0.7654980421066284, + "learning_rate": 2.0298383828516816e-06, + "loss": 1.7653, + "mean_token_accuracy": 0.590674638748169, + "num_tokens": 136233746.0, + "step": 18026 + }, + { + "epoch": 4.87479718766901, + "grad_norm": 0.737246572971344, + "learning_rate": 2.029710254298632e-06, + "loss": 1.8349, + "mean_token_accuracy": 0.5712077617645264, + "num_tokens": 136742137.0, + "step": 18027 + }, + { + "epoch": 4.87506760411033, + "grad_norm": 0.759693443775177, + "learning_rate": 2.0295824009804518e-06, + "loss": 1.7888, + "mean_token_accuracy": 0.5841536521911621, + "num_tokens": 137266191.0, + "step": 18028 + }, + { + "epoch": 4.875338020551649, + "grad_norm": 0.8098699450492859, + "learning_rate": 2.029454822901064e-06, + "loss": 1.7999, + "mean_token_accuracy": 0.5826330184936523, + "num_tokens": 137790466.0, + "step": 18029 + }, + { + "epoch": 4.8756084369929695, + "grad_norm": 0.8720329403877258, + "learning_rate": 2.0293275200643826e-06, + "loss": 1.7857, + "mean_token_accuracy": 0.5939971208572388, + "num_tokens": 138314659.0, + "step": 18030 + }, + { + "epoch": 4.875878853434289, + "grad_norm": 0.8685548901557922, + "learning_rate": 2.029200492474314e-06, + "loss": 1.8425, + "mean_token_accuracy": 0.5698104500770569, + "num_tokens": 138838915.0, + "step": 18031 + }, + { + "epoch": 4.876149269875609, + "grad_norm": 0.8886162638664246, + "learning_rate": 2.0290737401347555e-06, + "loss": 1.868, + "mean_token_accuracy": 0.5690780282020569, + "num_tokens": 139363095.0, + "step": 18032 + }, + { + "epoch": 4.876419686316928, + "grad_norm": 0.9986214637756348, + "learning_rate": 2.028947263049597e-06, + "loss": 1.8574, + "mean_token_accuracy": 0.5791459083557129, + "num_tokens": 139850002.0, + "step": 18033 + }, + { + "epoch": 4.876690102758248, + "grad_norm": 0.7547705769538879, + "learning_rate": 2.028821061222718e-06, + "loss": 1.8056, + "mean_token_accuracy": 0.5787213444709778, + "num_tokens": 140374230.0, + "step": 18034 + }, + { + "epoch": 4.876960519199567, + "grad_norm": 0.941991925239563, + "learning_rate": 2.0286951346579922e-06, + "loss": 1.8555, + "mean_token_accuracy": 0.5656051635742188, + "num_tokens": 140898448.0, + "step": 18035 + }, + { + "epoch": 4.8772309356408865, + "grad_norm": 0.8076122999191284, + "learning_rate": 2.0285694833592816e-06, + "loss": 1.903, + "mean_token_accuracy": 0.5752556324005127, + "num_tokens": 141422723.0, + "step": 18036 + }, + { + "epoch": 4.877501352082207, + "grad_norm": 1.0815250873565674, + "learning_rate": 2.0284441073304427e-06, + "loss": 1.9415, + "mean_token_accuracy": 0.5521395206451416, + "num_tokens": 141914222.0, + "step": 18037 + }, + { + "epoch": 4.877771768523527, + "grad_norm": 0.8551380634307861, + "learning_rate": 2.0283190065753224e-06, + "loss": 1.779, + "mean_token_accuracy": 0.5994383096694946, + "num_tokens": 142432929.0, + "step": 18038 + }, + { + "epoch": 4.878042184964846, + "grad_norm": 0.9911007881164551, + "learning_rate": 2.028194181097759e-06, + "loss": 1.7718, + "mean_token_accuracy": 0.6163816452026367, + "num_tokens": 142902409.0, + "step": 18039 + }, + { + "epoch": 4.878312601406165, + "grad_norm": 0.8619473576545715, + "learning_rate": 2.0280696309015826e-06, + "loss": 1.6956, + "mean_token_accuracy": 0.6082712411880493, + "num_tokens": 143412178.0, + "step": 18040 + }, + { + "epoch": 4.878583017847485, + "grad_norm": 0.3515984117984772, + "learning_rate": 2.0279453559906143e-06, + "loss": 1.0856, + "mean_token_accuracy": 0.7063872814178467, + "num_tokens": 143936326.0, + "step": 18041 + }, + { + "epoch": 4.878853434288805, + "grad_norm": 1.1237157583236694, + "learning_rate": 2.027821356368668e-06, + "loss": 1.8474, + "mean_token_accuracy": 0.577297568321228, + "num_tokens": 144460587.0, + "step": 18042 + }, + { + "epoch": 4.8791238507301244, + "grad_norm": 1.020279884338379, + "learning_rate": 2.027697632039548e-06, + "loss": 1.8795, + "mean_token_accuracy": 0.5801868438720703, + "num_tokens": 144954144.0, + "step": 18043 + }, + { + "epoch": 4.879394267171444, + "grad_norm": 0.8762710690498352, + "learning_rate": 2.0275741830070498e-06, + "loss": 1.7976, + "mean_token_accuracy": 0.5760393738746643, + "num_tokens": 145478411.0, + "step": 18044 + }, + { + "epoch": 4.879664683612764, + "grad_norm": 0.7985502481460571, + "learning_rate": 2.027451009274962e-06, + "loss": 1.7774, + "mean_token_accuracy": 0.5883092880249023, + "num_tokens": 145968142.0, + "step": 18045 + }, + { + "epoch": 4.879935100054083, + "grad_norm": 0.8121235966682434, + "learning_rate": 2.0273281108470645e-06, + "loss": 1.8422, + "mean_token_accuracy": 0.5575683116912842, + "num_tokens": 146492307.0, + "step": 18046 + }, + { + "epoch": 4.880205516495403, + "grad_norm": 1.0698573589324951, + "learning_rate": 2.0272054877271265e-06, + "loss": 1.8725, + "mean_token_accuracy": 0.574177622795105, + "num_tokens": 147013586.0, + "step": 18047 + }, + { + "epoch": 4.880475932936722, + "grad_norm": 1.0922081470489502, + "learning_rate": 2.0270831399189117e-06, + "loss": 1.8454, + "mean_token_accuracy": 0.5904761552810669, + "num_tokens": 147498818.0, + "step": 18048 + }, + { + "epoch": 4.880746349378042, + "grad_norm": 0.9495946764945984, + "learning_rate": 2.0269610674261746e-06, + "loss": 1.8275, + "mean_token_accuracy": 0.5800237655639648, + "num_tokens": 148023014.0, + "step": 18049 + }, + { + "epoch": 4.8810167658193615, + "grad_norm": 0.8782907724380493, + "learning_rate": 2.0268392702526587e-06, + "loss": 1.9152, + "mean_token_accuracy": 0.5649883151054382, + "num_tokens": 148547186.0, + "step": 18050 + }, + { + "epoch": 4.881287182260682, + "grad_norm": 0.9533151388168335, + "learning_rate": 2.0267177484021035e-06, + "loss": 1.8575, + "mean_token_accuracy": 0.5743605494499207, + "num_tokens": 149071424.0, + "step": 18051 + }, + { + "epoch": 4.881557598702001, + "grad_norm": 0.744583249092102, + "learning_rate": 2.0265965018782365e-06, + "loss": 1.65, + "mean_token_accuracy": 0.6206200122833252, + "num_tokens": 149595703.0, + "step": 18052 + }, + { + "epoch": 4.881828015143321, + "grad_norm": 0.7854401469230652, + "learning_rate": 2.026475530684777e-06, + "loss": 1.7533, + "mean_token_accuracy": 0.5869663953781128, + "num_tokens": 150119911.0, + "step": 18053 + }, + { + "epoch": 4.88209843158464, + "grad_norm": 0.8598699569702148, + "learning_rate": 2.026354834825438e-06, + "loss": 1.7143, + "mean_token_accuracy": 0.5883742570877075, + "num_tokens": 150592536.0, + "step": 18054 + }, + { + "epoch": 4.88236884802596, + "grad_norm": 0.8812234997749329, + "learning_rate": 2.0262344143039225e-06, + "loss": 1.8513, + "mean_token_accuracy": 0.5834607481956482, + "num_tokens": 151106735.0, + "step": 18055 + }, + { + "epoch": 4.882639264467279, + "grad_norm": 0.9186065196990967, + "learning_rate": 2.026114269123925e-06, + "loss": 1.8365, + "mean_token_accuracy": 0.5833166837692261, + "num_tokens": 151631012.0, + "step": 18056 + }, + { + "epoch": 4.8829096809085994, + "grad_norm": 0.8613530397415161, + "learning_rate": 2.0259943992891324e-06, + "loss": 1.9156, + "mean_token_accuracy": 0.5487148761749268, + "num_tokens": 152155231.0, + "step": 18057 + }, + { + "epoch": 4.883180097349919, + "grad_norm": 0.9619997143745422, + "learning_rate": 2.0258748048032223e-06, + "loss": 1.9066, + "mean_token_accuracy": 0.5891070365905762, + "num_tokens": 152639887.0, + "step": 18058 + }, + { + "epoch": 4.883450513791239, + "grad_norm": 0.9097834825515747, + "learning_rate": 2.0257554856698645e-06, + "loss": 1.8383, + "mean_token_accuracy": 0.6011489629745483, + "num_tokens": 153058282.0, + "step": 18059 + }, + { + "epoch": 4.883720930232558, + "grad_norm": 0.8611766695976257, + "learning_rate": 2.0256364418927203e-06, + "loss": 1.8608, + "mean_token_accuracy": 0.5781204700469971, + "num_tokens": 153582544.0, + "step": 18060 + }, + { + "epoch": 4.883991346673878, + "grad_norm": 0.33419790863990784, + "learning_rate": 2.0255176734754417e-06, + "loss": 1.0977, + "mean_token_accuracy": 0.7109571099281311, + "num_tokens": 154106819.0, + "step": 18061 + }, + { + "epoch": 4.884261763115197, + "grad_norm": 0.8157393932342529, + "learning_rate": 2.0253991804216726e-06, + "loss": 1.7227, + "mean_token_accuracy": 0.5954738855361938, + "num_tokens": 154630989.0, + "step": 18062 + }, + { + "epoch": 4.884532179556517, + "grad_norm": 0.7714493870735168, + "learning_rate": 2.0252809627350498e-06, + "loss": 1.8778, + "mean_token_accuracy": 0.5674623250961304, + "num_tokens": 155155168.0, + "step": 18063 + }, + { + "epoch": 4.8848025959978365, + "grad_norm": 0.8416491150856018, + "learning_rate": 2.0251630204192e-06, + "loss": 1.7881, + "mean_token_accuracy": 0.5875750780105591, + "num_tokens": 155679439.0, + "step": 18064 + }, + { + "epoch": 4.885073012439157, + "grad_norm": 0.8300194144248962, + "learning_rate": 2.0250453534777416e-06, + "loss": 1.7606, + "mean_token_accuracy": 0.5831462144851685, + "num_tokens": 156153112.0, + "step": 18065 + }, + { + "epoch": 4.885343428880476, + "grad_norm": 0.740193247795105, + "learning_rate": 2.0249279619142856e-06, + "loss": 1.7988, + "mean_token_accuracy": 0.5814003944396973, + "num_tokens": 156677378.0, + "step": 18066 + }, + { + "epoch": 4.885613845321796, + "grad_norm": 0.8299737572669983, + "learning_rate": 2.024810845732434e-06, + "loss": 1.8035, + "mean_token_accuracy": 0.5956903696060181, + "num_tokens": 157141447.0, + "step": 18067 + }, + { + "epoch": 4.885884261763115, + "grad_norm": 0.8183402419090271, + "learning_rate": 2.0246940049357793e-06, + "loss": 1.8411, + "mean_token_accuracy": 0.5757548809051514, + "num_tokens": 157665625.0, + "step": 18068 + }, + { + "epoch": 4.886154678204435, + "grad_norm": 0.865786612033844, + "learning_rate": 2.024577439527908e-06, + "loss": 1.8155, + "mean_token_accuracy": 0.600303053855896, + "num_tokens": 158189894.0, + "step": 18069 + }, + { + "epoch": 4.886425094645754, + "grad_norm": 0.8744480013847351, + "learning_rate": 2.024461149512395e-06, + "loss": 1.8354, + "mean_token_accuracy": 0.5924167633056641, + "num_tokens": 158658278.0, + "step": 18070 + }, + { + "epoch": 4.8866955110870745, + "grad_norm": 0.7800681591033936, + "learning_rate": 2.02434513489281e-06, + "loss": 1.5773, + "mean_token_accuracy": 0.6457562446594238, + "num_tokens": 159121615.0, + "step": 18071 + }, + { + "epoch": 4.886965927528394, + "grad_norm": 0.87807697057724, + "learning_rate": 2.024229395672712e-06, + "loss": 1.8145, + "mean_token_accuracy": 0.5852935314178467, + "num_tokens": 159645888.0, + "step": 18072 + }, + { + "epoch": 4.887236343969714, + "grad_norm": 0.7651499509811401, + "learning_rate": 2.024113931855651e-06, + "loss": 1.8887, + "mean_token_accuracy": 0.573081374168396, + "num_tokens": 160170117.0, + "step": 18073 + }, + { + "epoch": 4.887506760411033, + "grad_norm": 0.7864689826965332, + "learning_rate": 2.0239987434451724e-06, + "loss": 1.7948, + "mean_token_accuracy": 0.5908865928649902, + "num_tokens": 160694212.0, + "step": 18074 + }, + { + "epoch": 4.887777176852353, + "grad_norm": 0.8508669137954712, + "learning_rate": 2.023883830444808e-06, + "loss": 1.8279, + "mean_token_accuracy": 0.5764008164405823, + "num_tokens": 161218358.0, + "step": 18075 + }, + { + "epoch": 4.888047593293672, + "grad_norm": 0.7983810305595398, + "learning_rate": 2.023769192858085e-06, + "loss": 1.8099, + "mean_token_accuracy": 0.5716357231140137, + "num_tokens": 161742587.0, + "step": 18076 + }, + { + "epoch": 4.888318009734991, + "grad_norm": 1.0174610614776611, + "learning_rate": 2.0236548306885205e-06, + "loss": 1.7936, + "mean_token_accuracy": 0.5967375636100769, + "num_tokens": 162245395.0, + "step": 18077 + }, + { + "epoch": 4.8885884261763115, + "grad_norm": 0.936555027961731, + "learning_rate": 2.0235407439396245e-06, + "loss": 1.8189, + "mean_token_accuracy": 0.5857948660850525, + "num_tokens": 162725692.0, + "step": 18078 + }, + { + "epoch": 4.888858842617632, + "grad_norm": 0.8040414452552795, + "learning_rate": 2.023426932614895e-06, + "loss": 1.7944, + "mean_token_accuracy": 0.5717307329177856, + "num_tokens": 163249846.0, + "step": 18079 + }, + { + "epoch": 4.889129259058951, + "grad_norm": 0.8903074860572815, + "learning_rate": 2.023313396717827e-06, + "loss": 1.7801, + "mean_token_accuracy": 0.5693254470825195, + "num_tokens": 163774115.0, + "step": 18080 + }, + { + "epoch": 4.88939967550027, + "grad_norm": 0.40832266211509705, + "learning_rate": 2.023200136251902e-06, + "loss": 1.0161, + "mean_token_accuracy": 0.7153040170669556, + "num_tokens": 164298323.0, + "step": 18081 + }, + { + "epoch": 4.88967009194159, + "grad_norm": 1.0355991125106812, + "learning_rate": 2.023087151220595e-06, + "loss": 1.7994, + "mean_token_accuracy": 0.5703091025352478, + "num_tokens": 164822560.0, + "step": 18082 + }, + { + "epoch": 4.88994050838291, + "grad_norm": 0.8983299732208252, + "learning_rate": 2.0229744416273747e-06, + "loss": 1.9057, + "mean_token_accuracy": 0.5696300268173218, + "num_tokens": 165346651.0, + "step": 18083 + }, + { + "epoch": 4.890210924824229, + "grad_norm": 0.9277617931365967, + "learning_rate": 2.022862007475699e-06, + "loss": 1.8036, + "mean_token_accuracy": 0.5882396697998047, + "num_tokens": 165870717.0, + "step": 18084 + }, + { + "epoch": 4.890481341265549, + "grad_norm": 0.9020925164222717, + "learning_rate": 2.0227498487690146e-06, + "loss": 1.7549, + "mean_token_accuracy": 0.5880953669548035, + "num_tokens": 166394927.0, + "step": 18085 + }, + { + "epoch": 4.890751757706869, + "grad_norm": 0.8581941723823547, + "learning_rate": 2.0226379655107667e-06, + "loss": 1.8647, + "mean_token_accuracy": 0.5618687272071838, + "num_tokens": 166902145.0, + "step": 18086 + }, + { + "epoch": 4.891022174148188, + "grad_norm": 0.8114163875579834, + "learning_rate": 2.022526357704387e-06, + "loss": 1.8621, + "mean_token_accuracy": 0.5669156312942505, + "num_tokens": 167426410.0, + "step": 18087 + }, + { + "epoch": 4.891292590589508, + "grad_norm": 0.8688433766365051, + "learning_rate": 2.022415025353299e-06, + "loss": 1.8207, + "mean_token_accuracy": 0.5601406097412109, + "num_tokens": 167950606.0, + "step": 18088 + }, + { + "epoch": 4.891563007030827, + "grad_norm": 0.8135632872581482, + "learning_rate": 2.0223039684609194e-06, + "loss": 1.7563, + "mean_token_accuracy": 0.580295205116272, + "num_tokens": 168474854.0, + "step": 18089 + }, + { + "epoch": 4.891833423472147, + "grad_norm": 0.7695328593254089, + "learning_rate": 2.022193187030656e-06, + "loss": 1.8069, + "mean_token_accuracy": 0.594739556312561, + "num_tokens": 168971975.0, + "step": 18090 + }, + { + "epoch": 4.892103839913466, + "grad_norm": 0.7934525012969971, + "learning_rate": 2.022082681065907e-06, + "loss": 1.7723, + "mean_token_accuracy": 0.5931512117385864, + "num_tokens": 169496093.0, + "step": 18091 + }, + { + "epoch": 4.8923742563547865, + "grad_norm": 1.004501461982727, + "learning_rate": 2.021972450570065e-06, + "loss": 1.8657, + "mean_token_accuracy": 0.5813671350479126, + "num_tokens": 170003969.0, + "step": 18092 + }, + { + "epoch": 4.892644672796106, + "grad_norm": 0.9977633953094482, + "learning_rate": 2.02186249554651e-06, + "loss": 1.9024, + "mean_token_accuracy": 0.5529690384864807, + "num_tokens": 170528163.0, + "step": 18093 + }, + { + "epoch": 4.892915089237426, + "grad_norm": 0.8457004427909851, + "learning_rate": 2.021752815998617e-06, + "loss": 1.7982, + "mean_token_accuracy": 0.5901768803596497, + "num_tokens": 171008030.0, + "step": 18094 + }, + { + "epoch": 4.893185505678745, + "grad_norm": 0.8581104278564453, + "learning_rate": 2.021643411929751e-06, + "loss": 1.7816, + "mean_token_accuracy": 0.5772466659545898, + "num_tokens": 171486672.0, + "step": 18095 + }, + { + "epoch": 4.893455922120065, + "grad_norm": 0.9051412343978882, + "learning_rate": 2.0215342833432673e-06, + "loss": 1.8477, + "mean_token_accuracy": 0.5823311805725098, + "num_tokens": 171970484.0, + "step": 18096 + }, + { + "epoch": 4.893726338561384, + "grad_norm": 0.7852486968040466, + "learning_rate": 2.0214254302425173e-06, + "loss": 1.7664, + "mean_token_accuracy": 0.584785521030426, + "num_tokens": 172470003.0, + "step": 18097 + }, + { + "epoch": 4.893996755002704, + "grad_norm": 0.9877113699913025, + "learning_rate": 2.021316852630839e-06, + "loss": 1.6805, + "mean_token_accuracy": 0.6052300333976746, + "num_tokens": 172953955.0, + "step": 18098 + }, + { + "epoch": 4.894267171444024, + "grad_norm": 0.9476572871208191, + "learning_rate": 2.021208550511564e-06, + "loss": 1.7713, + "mean_token_accuracy": 0.5894556045532227, + "num_tokens": 173431004.0, + "step": 18099 + }, + { + "epoch": 4.894537587885344, + "grad_norm": 0.8292467594146729, + "learning_rate": 2.0211005238880153e-06, + "loss": 1.7917, + "mean_token_accuracy": 0.5917295813560486, + "num_tokens": 173933474.0, + "step": 18100 + }, + { + "epoch": 4.894808004326663, + "grad_norm": 0.3464226722717285, + "learning_rate": 2.0209927727635086e-06, + "loss": 1.0833, + "mean_token_accuracy": 0.7114534378051758, + "num_tokens": 174400456.0, + "step": 18101 + }, + { + "epoch": 4.895078420767983, + "grad_norm": 0.9258591532707214, + "learning_rate": 2.0208852971413485e-06, + "loss": 1.7576, + "mean_token_accuracy": 0.5877249240875244, + "num_tokens": 174924575.0, + "step": 18102 + }, + { + "epoch": 4.895348837209302, + "grad_norm": 1.0116705894470215, + "learning_rate": 2.020778097024834e-06, + "loss": 1.8432, + "mean_token_accuracy": 0.583084762096405, + "num_tokens": 175402010.0, + "step": 18103 + }, + { + "epoch": 4.895619253650622, + "grad_norm": 0.996040940284729, + "learning_rate": 2.020671172417253e-06, + "loss": 1.862, + "mean_token_accuracy": 0.5789048075675964, + "num_tokens": 175891806.0, + "step": 18104 + }, + { + "epoch": 4.895889670091941, + "grad_norm": 0.8989819884300232, + "learning_rate": 2.0205645233218873e-06, + "loss": 1.8291, + "mean_token_accuracy": 0.5717458724975586, + "num_tokens": 176415958.0, + "step": 18105 + }, + { + "epoch": 4.8961600865332615, + "grad_norm": 0.8875397443771362, + "learning_rate": 2.0204581497420086e-06, + "loss": 1.8119, + "mean_token_accuracy": 0.590817928314209, + "num_tokens": 176908945.0, + "step": 18106 + }, + { + "epoch": 4.896430502974581, + "grad_norm": 0.7156994938850403, + "learning_rate": 2.020352051680882e-06, + "loss": 1.8161, + "mean_token_accuracy": 0.5637103915214539, + "num_tokens": 177433130.0, + "step": 18107 + }, + { + "epoch": 4.896700919415901, + "grad_norm": 0.9137025475502014, + "learning_rate": 2.0202462291417603e-06, + "loss": 1.8372, + "mean_token_accuracy": 0.5664272308349609, + "num_tokens": 177957360.0, + "step": 18108 + }, + { + "epoch": 4.89697133585722, + "grad_norm": 1.1547167301177979, + "learning_rate": 2.0201406821278926e-06, + "loss": 1.8304, + "mean_token_accuracy": 0.5830127596855164, + "num_tokens": 178481507.0, + "step": 18109 + }, + { + "epoch": 4.89724175229854, + "grad_norm": 0.9029582738876343, + "learning_rate": 2.0200354106425165e-06, + "loss": 1.7667, + "mean_token_accuracy": 0.5658566951751709, + "num_tokens": 179005716.0, + "step": 18110 + }, + { + "epoch": 4.897512168739859, + "grad_norm": 0.8390859961509705, + "learning_rate": 2.019930414688863e-06, + "loss": 1.804, + "mean_token_accuracy": 0.5955677032470703, + "num_tokens": 179489038.0, + "step": 18111 + }, + { + "epoch": 4.897782585181179, + "grad_norm": 0.843386173248291, + "learning_rate": 2.0198256942701526e-06, + "loss": 1.8218, + "mean_token_accuracy": 0.5915295481681824, + "num_tokens": 179974759.0, + "step": 18112 + }, + { + "epoch": 4.898053001622499, + "grad_norm": 0.8094131350517273, + "learning_rate": 2.0197212493895994e-06, + "loss": 1.8082, + "mean_token_accuracy": 0.5786253213882446, + "num_tokens": 180498965.0, + "step": 18113 + }, + { + "epoch": 4.898323418063819, + "grad_norm": 0.9083893895149231, + "learning_rate": 2.0196170800504067e-06, + "loss": 1.8795, + "mean_token_accuracy": 0.566565990447998, + "num_tokens": 181018483.0, + "step": 18114 + }, + { + "epoch": 4.898593834505138, + "grad_norm": 0.7235360741615295, + "learning_rate": 2.0195131862557723e-06, + "loss": 1.7913, + "mean_token_accuracy": 0.5703175663948059, + "num_tokens": 181542568.0, + "step": 18115 + }, + { + "epoch": 4.898864250946458, + "grad_norm": 0.9112717509269714, + "learning_rate": 2.0194095680088825e-06, + "loss": 1.7188, + "mean_token_accuracy": 0.5966533422470093, + "num_tokens": 182006379.0, + "step": 18116 + }, + { + "epoch": 4.899134667387777, + "grad_norm": 0.7398706078529358, + "learning_rate": 2.019306225312919e-06, + "loss": 1.7795, + "mean_token_accuracy": 0.5820324420928955, + "num_tokens": 182504202.0, + "step": 18117 + }, + { + "epoch": 4.899405083829096, + "grad_norm": 0.7272691130638123, + "learning_rate": 2.0192031581710497e-06, + "loss": 1.6329, + "mean_token_accuracy": 0.5879433155059814, + "num_tokens": 183028418.0, + "step": 18118 + }, + { + "epoch": 4.899675500270416, + "grad_norm": 0.8831026554107666, + "learning_rate": 2.019100366586438e-06, + "loss": 1.799, + "mean_token_accuracy": 0.5752525925636292, + "num_tokens": 183531133.0, + "step": 18119 + }, + { + "epoch": 4.8999459167117365, + "grad_norm": 0.8229399919509888, + "learning_rate": 2.018997850562239e-06, + "loss": 1.7609, + "mean_token_accuracy": 0.5897096991539001, + "num_tokens": 184055366.0, + "step": 18120 + }, + { + "epoch": 4.900216333153056, + "grad_norm": 0.34203606843948364, + "learning_rate": 2.0188956101015963e-06, + "loss": 1.0419, + "mean_token_accuracy": 0.7166236639022827, + "num_tokens": 184579563.0, + "step": 18121 + }, + { + "epoch": 4.900486749594375, + "grad_norm": 0.7222595810890198, + "learning_rate": 2.0187936452076495e-06, + "loss": 1.7342, + "mean_token_accuracy": 0.5981708765029907, + "num_tokens": 185063022.0, + "step": 18122 + }, + { + "epoch": 4.900757166035695, + "grad_norm": 0.8822020292282104, + "learning_rate": 2.018691955883525e-06, + "loss": 1.7895, + "mean_token_accuracy": 0.5882372856140137, + "num_tokens": 185587261.0, + "step": 18123 + }, + { + "epoch": 4.901027582477015, + "grad_norm": 0.8792064189910889, + "learning_rate": 2.0185905421323432e-06, + "loss": 1.8296, + "mean_token_accuracy": 0.5626815557479858, + "num_tokens": 186079409.0, + "step": 18124 + }, + { + "epoch": 4.901297998918334, + "grad_norm": 0.9046633839607239, + "learning_rate": 2.0184894039572163e-06, + "loss": 1.8315, + "mean_token_accuracy": 0.5729349255561829, + "num_tokens": 186603665.0, + "step": 18125 + }, + { + "epoch": 4.9015684153596535, + "grad_norm": 0.8490185141563416, + "learning_rate": 2.018388541361247e-06, + "loss": 1.8432, + "mean_token_accuracy": 0.5720847845077515, + "num_tokens": 187127807.0, + "step": 18126 + }, + { + "epoch": 4.901838831800974, + "grad_norm": 0.8726322650909424, + "learning_rate": 2.0182879543475316e-06, + "loss": 1.7951, + "mean_token_accuracy": 0.5799682140350342, + "num_tokens": 187630273.0, + "step": 18127 + }, + { + "epoch": 4.902109248242293, + "grad_norm": 0.9431951642036438, + "learning_rate": 2.0181876429191543e-06, + "loss": 1.8463, + "mean_token_accuracy": 0.567396879196167, + "num_tokens": 188154415.0, + "step": 18128 + }, + { + "epoch": 4.902379664683613, + "grad_norm": 0.8679593205451965, + "learning_rate": 2.0180876070791944e-06, + "loss": 1.734, + "mean_token_accuracy": 0.5904794931411743, + "num_tokens": 188632818.0, + "step": 18129 + }, + { + "epoch": 4.902650081124932, + "grad_norm": 0.8564162254333496, + "learning_rate": 2.0179878468307205e-06, + "loss": 1.9604, + "mean_token_accuracy": 0.5918171405792236, + "num_tokens": 189093624.0, + "step": 18130 + }, + { + "epoch": 4.902920497566252, + "grad_norm": 0.7322147488594055, + "learning_rate": 2.0178883621767937e-06, + "loss": 1.7682, + "mean_token_accuracy": 0.5834531784057617, + "num_tokens": 189617782.0, + "step": 18131 + }, + { + "epoch": 4.903190914007571, + "grad_norm": 0.7693238258361816, + "learning_rate": 2.0177891531204675e-06, + "loss": 1.8143, + "mean_token_accuracy": 0.591094434261322, + "num_tokens": 190141887.0, + "step": 18132 + }, + { + "epoch": 4.903461330448891, + "grad_norm": 1.020795464515686, + "learning_rate": 2.0176902196647836e-06, + "loss": 1.8401, + "mean_token_accuracy": 0.59322589635849, + "num_tokens": 190544718.0, + "step": 18133 + }, + { + "epoch": 4.903731746890211, + "grad_norm": 0.8466023206710815, + "learning_rate": 2.01759156181278e-06, + "loss": 1.853, + "mean_token_accuracy": 0.5776190757751465, + "num_tokens": 191047777.0, + "step": 18134 + }, + { + "epoch": 4.904002163331531, + "grad_norm": 0.8002407550811768, + "learning_rate": 2.0174931795674834e-06, + "loss": 1.7476, + "mean_token_accuracy": 0.5903853178024292, + "num_tokens": 191571974.0, + "step": 18135 + }, + { + "epoch": 4.90427257977285, + "grad_norm": 0.857626736164093, + "learning_rate": 2.017395072931911e-06, + "loss": 1.8706, + "mean_token_accuracy": 0.5560564994812012, + "num_tokens": 192096028.0, + "step": 18136 + }, + { + "epoch": 4.90454299621417, + "grad_norm": 0.8407262563705444, + "learning_rate": 2.017297241909074e-06, + "loss": 1.7556, + "mean_token_accuracy": 0.5950353741645813, + "num_tokens": 192571745.0, + "step": 18137 + }, + { + "epoch": 4.904813412655489, + "grad_norm": 0.8134681582450867, + "learning_rate": 2.0171996865019736e-06, + "loss": 1.7827, + "mean_token_accuracy": 0.5923097729682922, + "num_tokens": 193056372.0, + "step": 18138 + }, + { + "epoch": 4.905083829096809, + "grad_norm": 0.712973952293396, + "learning_rate": 2.017102406713604e-06, + "loss": 1.7049, + "mean_token_accuracy": 0.5995086431503296, + "num_tokens": 193580608.0, + "step": 18139 + }, + { + "epoch": 4.9053542455381285, + "grad_norm": 0.8989500403404236, + "learning_rate": 2.0170054025469497e-06, + "loss": 1.9236, + "mean_token_accuracy": 0.5544354915618896, + "num_tokens": 194048699.0, + "step": 18140 + }, + { + "epoch": 4.905624661979449, + "grad_norm": 0.31270432472229004, + "learning_rate": 2.0169086740049865e-06, + "loss": 1.0455, + "mean_token_accuracy": 0.7156348824501038, + "num_tokens": 194572948.0, + "step": 18141 + }, + { + "epoch": 4.905895078420768, + "grad_norm": 0.826630711555481, + "learning_rate": 2.016812221090683e-06, + "loss": 1.7752, + "mean_token_accuracy": 0.5853099822998047, + "num_tokens": 195097227.0, + "step": 18142 + }, + { + "epoch": 4.906165494862088, + "grad_norm": 0.7825099229812622, + "learning_rate": 2.0167160438069987e-06, + "loss": 1.8544, + "mean_token_accuracy": 0.577899158000946, + "num_tokens": 195621420.0, + "step": 18143 + }, + { + "epoch": 4.906435911303407, + "grad_norm": 0.8746238946914673, + "learning_rate": 2.0166201421568835e-06, + "loss": 1.7998, + "mean_token_accuracy": 0.5688154697418213, + "num_tokens": 196145479.0, + "step": 18144 + }, + { + "epoch": 4.906706327744727, + "grad_norm": 0.7880906462669373, + "learning_rate": 2.016524516143281e-06, + "loss": 1.8844, + "mean_token_accuracy": 0.5706113576889038, + "num_tokens": 196669631.0, + "step": 18145 + }, + { + "epoch": 4.906976744186046, + "grad_norm": 0.7907081842422485, + "learning_rate": 2.0164291657691255e-06, + "loss": 1.8261, + "mean_token_accuracy": 0.584894061088562, + "num_tokens": 197193725.0, + "step": 18146 + }, + { + "epoch": 4.907247160627366, + "grad_norm": 0.7775841951370239, + "learning_rate": 2.016334091037341e-06, + "loss": 1.9186, + "mean_token_accuracy": 0.5604729056358337, + "num_tokens": 197718005.0, + "step": 18147 + }, + { + "epoch": 4.907517577068686, + "grad_norm": 0.8395069241523743, + "learning_rate": 2.0162392919508476e-06, + "loss": 1.9269, + "mean_token_accuracy": 0.5685103535652161, + "num_tokens": 198242100.0, + "step": 18148 + }, + { + "epoch": 4.907787993510006, + "grad_norm": 0.9978395700454712, + "learning_rate": 2.016144768512551e-06, + "loss": 1.8415, + "mean_token_accuracy": 0.5676519870758057, + "num_tokens": 198766365.0, + "step": 18149 + }, + { + "epoch": 4.908058409951325, + "grad_norm": 0.8868494629859924, + "learning_rate": 2.0160505207253534e-06, + "loss": 1.8569, + "mean_token_accuracy": 0.5736315250396729, + "num_tokens": 199290476.0, + "step": 18150 + }, + { + "epoch": 4.908328826392645, + "grad_norm": 0.7461291551589966, + "learning_rate": 2.0159565485921453e-06, + "loss": 1.8121, + "mean_token_accuracy": 0.5711377263069153, + "num_tokens": 199814543.0, + "step": 18151 + }, + { + "epoch": 4.908599242833964, + "grad_norm": 0.7794379591941833, + "learning_rate": 2.015862852115811e-06, + "loss": 1.8427, + "mean_token_accuracy": 0.5684705972671509, + "num_tokens": 200338819.0, + "step": 18152 + }, + { + "epoch": 4.908869659275284, + "grad_norm": 0.8522952198982239, + "learning_rate": 2.015769431299225e-06, + "loss": 1.7823, + "mean_token_accuracy": 0.5578891038894653, + "num_tokens": 200862919.0, + "step": 18153 + }, + { + "epoch": 4.9091400757166035, + "grad_norm": 0.8364975452423096, + "learning_rate": 2.0156762861452532e-06, + "loss": 1.8366, + "mean_token_accuracy": 0.5883191227912903, + "num_tokens": 201387160.0, + "step": 18154 + }, + { + "epoch": 4.909410492157924, + "grad_norm": 1.0222671031951904, + "learning_rate": 2.0155834166567553e-06, + "loss": 1.8299, + "mean_token_accuracy": 0.5732055902481079, + "num_tokens": 201911445.0, + "step": 18155 + }, + { + "epoch": 4.909680908599243, + "grad_norm": 0.8429107666015625, + "learning_rate": 2.0154908228365776e-06, + "loss": 1.8316, + "mean_token_accuracy": 0.5596439838409424, + "num_tokens": 202435704.0, + "step": 18156 + }, + { + "epoch": 4.909951325040563, + "grad_norm": 0.7684603333473206, + "learning_rate": 2.0153985046875653e-06, + "loss": 1.763, + "mean_token_accuracy": 0.5991894006729126, + "num_tokens": 202931413.0, + "step": 18157 + }, + { + "epoch": 4.910221741481882, + "grad_norm": 0.7587348818778992, + "learning_rate": 2.0153064622125476e-06, + "loss": 1.7978, + "mean_token_accuracy": 0.5983380079269409, + "num_tokens": 203455611.0, + "step": 18158 + }, + { + "epoch": 4.910492157923201, + "grad_norm": 0.8113987445831299, + "learning_rate": 2.01521469541435e-06, + "loss": 1.8951, + "mean_token_accuracy": 0.5595365166664124, + "num_tokens": 203944484.0, + "step": 18159 + }, + { + "epoch": 4.910762574364521, + "grad_norm": 0.8939711451530457, + "learning_rate": 2.015123204295788e-06, + "loss": 1.7866, + "mean_token_accuracy": 0.5897061824798584, + "num_tokens": 204468661.0, + "step": 18160 + }, + { + "epoch": 4.911032990805841, + "grad_norm": 0.3437865376472473, + "learning_rate": 2.0150319888596696e-06, + "loss": 0.9936, + "mean_token_accuracy": 0.7370471954345703, + "num_tokens": 204992940.0, + "step": 18161 + }, + { + "epoch": 4.911303407247161, + "grad_norm": 0.8992679119110107, + "learning_rate": 2.014941049108793e-06, + "loss": 1.6123, + "mean_token_accuracy": 0.6041104197502136, + "num_tokens": 205517162.0, + "step": 18162 + }, + { + "epoch": 4.91157382368848, + "grad_norm": 1.0434802770614624, + "learning_rate": 2.0148503850459476e-06, + "loss": 1.7348, + "mean_token_accuracy": 0.5914303064346313, + "num_tokens": 206011498.0, + "step": 18163 + }, + { + "epoch": 4.9118442401298, + "grad_norm": 0.818862795829773, + "learning_rate": 2.0147599966739167e-06, + "loss": 1.7716, + "mean_token_accuracy": 0.5814186930656433, + "num_tokens": 206535774.0, + "step": 18164 + }, + { + "epoch": 4.91211465657112, + "grad_norm": 0.9155552983283997, + "learning_rate": 2.0146698839954726e-06, + "loss": 1.927, + "mean_token_accuracy": 0.5553755760192871, + "num_tokens": 207060013.0, + "step": 18165 + }, + { + "epoch": 4.912385073012439, + "grad_norm": 0.9189044237136841, + "learning_rate": 2.01458004701338e-06, + "loss": 1.7972, + "mean_token_accuracy": 0.5899521112442017, + "num_tokens": 207584196.0, + "step": 18166 + }, + { + "epoch": 4.912655489453758, + "grad_norm": 0.825090229511261, + "learning_rate": 2.014490485730397e-06, + "loss": 1.7325, + "mean_token_accuracy": 0.5821899771690369, + "num_tokens": 208108327.0, + "step": 18167 + }, + { + "epoch": 4.9129259058950785, + "grad_norm": 0.8721466064453125, + "learning_rate": 2.0144012001492695e-06, + "loss": 1.86, + "mean_token_accuracy": 0.5499621629714966, + "num_tokens": 208632561.0, + "step": 18168 + }, + { + "epoch": 4.913196322336398, + "grad_norm": 0.9004215598106384, + "learning_rate": 2.014312190272739e-06, + "loss": 1.8161, + "mean_token_accuracy": 0.5778932571411133, + "num_tokens": 209140313.0, + "step": 18169 + }, + { + "epoch": 4.913466738777718, + "grad_norm": 0.7210052013397217, + "learning_rate": 2.0142234561035346e-06, + "loss": 1.82, + "mean_token_accuracy": 0.5632534623146057, + "num_tokens": 209664588.0, + "step": 18170 + }, + { + "epoch": 4.913737155219037, + "grad_norm": 0.842990517616272, + "learning_rate": 2.0141349976443813e-06, + "loss": 1.7798, + "mean_token_accuracy": 0.5780806541442871, + "num_tokens": 210188711.0, + "step": 18171 + }, + { + "epoch": 4.914007571660357, + "grad_norm": 1.090739130973816, + "learning_rate": 2.014046814897991e-06, + "loss": 1.8562, + "mean_token_accuracy": 0.5771925449371338, + "num_tokens": 210707700.0, + "step": 18172 + }, + { + "epoch": 4.914277988101676, + "grad_norm": 1.0061887502670288, + "learning_rate": 2.013958907867072e-06, + "loss": 1.566, + "mean_token_accuracy": 0.6541075706481934, + "num_tokens": 211173690.0, + "step": 18173 + }, + { + "epoch": 4.914548404542996, + "grad_norm": 0.8048569560050964, + "learning_rate": 2.0138712765543177e-06, + "loss": 1.8414, + "mean_token_accuracy": 0.578890323638916, + "num_tokens": 211697836.0, + "step": 18174 + }, + { + "epoch": 4.9148188209843156, + "grad_norm": 1.1147289276123047, + "learning_rate": 2.0137839209624206e-06, + "loss": 1.8299, + "mean_token_accuracy": 0.5878345966339111, + "num_tokens": 212222027.0, + "step": 18175 + }, + { + "epoch": 4.915089237425636, + "grad_norm": 1.1999330520629883, + "learning_rate": 2.0136968410940594e-06, + "loss": 1.8766, + "mean_token_accuracy": 0.5770243406295776, + "num_tokens": 212709993.0, + "step": 18176 + }, + { + "epoch": 4.915359653866955, + "grad_norm": 1.0145344734191895, + "learning_rate": 2.013610036951905e-06, + "loss": 1.8651, + "mean_token_accuracy": 0.5629885196685791, + "num_tokens": 213234254.0, + "step": 18177 + }, + { + "epoch": 4.915630070308275, + "grad_norm": 0.8073009848594666, + "learning_rate": 2.0135235085386236e-06, + "loss": 1.8319, + "mean_token_accuracy": 0.5815336108207703, + "num_tokens": 213758493.0, + "step": 18178 + }, + { + "epoch": 4.915900486749594, + "grad_norm": 0.8317893743515015, + "learning_rate": 2.0134372558568664e-06, + "loss": 1.8176, + "mean_token_accuracy": 0.5611922740936279, + "num_tokens": 214282570.0, + "step": 18179 + }, + { + "epoch": 4.916170903190914, + "grad_norm": 0.8618084788322449, + "learning_rate": 2.0133512789092845e-06, + "loss": 1.6812, + "mean_token_accuracy": 0.6070999503135681, + "num_tokens": 214778521.0, + "step": 18180 + }, + { + "epoch": 4.916441319632233, + "grad_norm": 0.36877715587615967, + "learning_rate": 2.0132655776985115e-06, + "loss": 1.1242, + "mean_token_accuracy": 0.7089745998382568, + "num_tokens": 215252130.0, + "step": 18181 + }, + { + "epoch": 4.9167117360735535, + "grad_norm": 1.1495519876480103, + "learning_rate": 2.01318015222718e-06, + "loss": 1.869, + "mean_token_accuracy": 0.5556582808494568, + "num_tokens": 215776374.0, + "step": 18182 + }, + { + "epoch": 4.916982152514873, + "grad_norm": 0.8924867510795593, + "learning_rate": 2.013095002497909e-06, + "loss": 1.7751, + "mean_token_accuracy": 0.5732933282852173, + "num_tokens": 216300569.0, + "step": 18183 + }, + { + "epoch": 4.917252568956193, + "grad_norm": 1.1258231401443481, + "learning_rate": 2.0130101285133125e-06, + "loss": 1.8194, + "mean_token_accuracy": 0.58744215965271, + "num_tokens": 216725034.0, + "step": 18184 + }, + { + "epoch": 4.917522985397512, + "grad_norm": 0.8896790146827698, + "learning_rate": 2.012925530275994e-06, + "loss": 1.8665, + "mean_token_accuracy": 0.5675511360168457, + "num_tokens": 217249303.0, + "step": 18185 + }, + { + "epoch": 4.917793401838832, + "grad_norm": 0.7123094201087952, + "learning_rate": 2.0128412077885497e-06, + "loss": 1.7979, + "mean_token_accuracy": 0.5815013647079468, + "num_tokens": 217773456.0, + "step": 18186 + }, + { + "epoch": 4.918063818280151, + "grad_norm": 0.8193820118904114, + "learning_rate": 2.012757161053566e-06, + "loss": 1.7588, + "mean_token_accuracy": 0.5942606329917908, + "num_tokens": 218297550.0, + "step": 18187 + }, + { + "epoch": 4.918334234721471, + "grad_norm": 1.0001486539840698, + "learning_rate": 2.0126733900736233e-06, + "loss": 1.8406, + "mean_token_accuracy": 0.5684971213340759, + "num_tokens": 218821806.0, + "step": 18188 + }, + { + "epoch": 4.9186046511627906, + "grad_norm": 0.9323586821556091, + "learning_rate": 2.01258989485129e-06, + "loss": 1.9593, + "mean_token_accuracy": 0.5702405571937561, + "num_tokens": 219284600.0, + "step": 18189 + }, + { + "epoch": 4.918875067604111, + "grad_norm": 0.858432948589325, + "learning_rate": 2.0125066753891307e-06, + "loss": 1.7869, + "mean_token_accuracy": 0.5774502754211426, + "num_tokens": 219808843.0, + "step": 18190 + }, + { + "epoch": 4.91914548404543, + "grad_norm": 0.8766955137252808, + "learning_rate": 2.012423731689696e-06, + "loss": 1.8248, + "mean_token_accuracy": 0.5709025263786316, + "num_tokens": 220333012.0, + "step": 18191 + }, + { + "epoch": 4.91941590048675, + "grad_norm": 0.955971360206604, + "learning_rate": 2.012341063755532e-06, + "loss": 1.8411, + "mean_token_accuracy": 0.5894397497177124, + "num_tokens": 220857275.0, + "step": 18192 + }, + { + "epoch": 4.919686316928069, + "grad_norm": 0.9589058756828308, + "learning_rate": 2.0122586715891753e-06, + "loss": 1.8379, + "mean_token_accuracy": 0.5778842568397522, + "num_tokens": 221356612.0, + "step": 18193 + }, + { + "epoch": 4.919956733369389, + "grad_norm": 1.0044316053390503, + "learning_rate": 2.0121765551931533e-06, + "loss": 1.9197, + "mean_token_accuracy": 0.5485483407974243, + "num_tokens": 221880694.0, + "step": 18194 + }, + { + "epoch": 4.920227149810708, + "grad_norm": 1.097244381904602, + "learning_rate": 2.012094714569986e-06, + "loss": 2.0081, + "mean_token_accuracy": 0.5767050981521606, + "num_tokens": 222340438.0, + "step": 18195 + }, + { + "epoch": 4.9204975662520285, + "grad_norm": 0.8019852638244629, + "learning_rate": 2.012013149722185e-06, + "loss": 1.8085, + "mean_token_accuracy": 0.5803554058074951, + "num_tokens": 222864332.0, + "step": 18196 + }, + { + "epoch": 4.920767982693348, + "grad_norm": 0.6527277827262878, + "learning_rate": 2.0119318606522528e-06, + "loss": 1.7177, + "mean_token_accuracy": 0.5870339274406433, + "num_tokens": 223388472.0, + "step": 18197 + }, + { + "epoch": 4.921038399134668, + "grad_norm": 7.008025169372559, + "learning_rate": 2.011850847362683e-06, + "loss": 1.7358, + "mean_token_accuracy": 0.5941534042358398, + "num_tokens": 223912737.0, + "step": 18198 + }, + { + "epoch": 4.921308815575987, + "grad_norm": 0.7510403990745544, + "learning_rate": 2.0117701098559618e-06, + "loss": 1.7872, + "mean_token_accuracy": 0.5880924463272095, + "num_tokens": 224436910.0, + "step": 18199 + }, + { + "epoch": 4.921579232017306, + "grad_norm": 0.8283142447471619, + "learning_rate": 2.011689648134566e-06, + "loss": 1.8006, + "mean_token_accuracy": 0.5847684741020203, + "num_tokens": 224951198.0, + "step": 18200 + }, + { + "epoch": 4.921849648458626, + "grad_norm": 0.34151363372802734, + "learning_rate": 2.0116094622009642e-06, + "loss": 1.055, + "mean_token_accuracy": 0.7170854806900024, + "num_tokens": 225473253.0, + "step": 18201 + }, + { + "epoch": 4.922120064899946, + "grad_norm": 0.8911046385765076, + "learning_rate": 2.011529552057618e-06, + "loss": 1.7792, + "mean_token_accuracy": 0.6028041243553162, + "num_tokens": 225997511.0, + "step": 18202 + }, + { + "epoch": 4.9223904813412656, + "grad_norm": 0.9698275327682495, + "learning_rate": 2.011449917706978e-06, + "loss": 1.7737, + "mean_token_accuracy": 0.5744516849517822, + "num_tokens": 226521680.0, + "step": 18203 + }, + { + "epoch": 4.922660897782585, + "grad_norm": 0.7613029479980469, + "learning_rate": 2.011370559151489e-06, + "loss": 1.864, + "mean_token_accuracy": 0.5699653029441833, + "num_tokens": 227045901.0, + "step": 18204 + }, + { + "epoch": 4.922931314223905, + "grad_norm": 0.8242015242576599, + "learning_rate": 2.011291476393585e-06, + "loss": 1.7717, + "mean_token_accuracy": 0.5840463638305664, + "num_tokens": 227570084.0, + "step": 18205 + }, + { + "epoch": 4.923201730665225, + "grad_norm": 0.8499463200569153, + "learning_rate": 2.0112126694356913e-06, + "loss": 1.8731, + "mean_token_accuracy": 0.5600795149803162, + "num_tokens": 228094117.0, + "step": 18206 + }, + { + "epoch": 4.923472147106544, + "grad_norm": 0.8737640380859375, + "learning_rate": 2.0111341382802278e-06, + "loss": 1.7944, + "mean_token_accuracy": 0.5805236101150513, + "num_tokens": 228618363.0, + "step": 18207 + }, + { + "epoch": 4.923742563547863, + "grad_norm": 0.8141157031059265, + "learning_rate": 2.011055882929604e-06, + "loss": 1.9123, + "mean_token_accuracy": 0.5519925355911255, + "num_tokens": 229142640.0, + "step": 18208 + }, + { + "epoch": 4.924012979989183, + "grad_norm": 0.7902467250823975, + "learning_rate": 2.0109779033862206e-06, + "loss": 1.8624, + "mean_token_accuracy": 0.5686579346656799, + "num_tokens": 229584609.0, + "step": 18209 + }, + { + "epoch": 4.924283396430503, + "grad_norm": 0.8954144716262817, + "learning_rate": 2.0109001996524696e-06, + "loss": 1.8379, + "mean_token_accuracy": 0.5893483757972717, + "num_tokens": 230051051.0, + "step": 18210 + }, + { + "epoch": 4.924553812871823, + "grad_norm": 0.8295857906341553, + "learning_rate": 2.0108227717307353e-06, + "loss": 1.8334, + "mean_token_accuracy": 0.5563850402832031, + "num_tokens": 230575324.0, + "step": 18211 + }, + { + "epoch": 4.924824229313142, + "grad_norm": 4.550954341888428, + "learning_rate": 2.0107456196233943e-06, + "loss": 1.6349, + "mean_token_accuracy": 0.6247031092643738, + "num_tokens": 231099608.0, + "step": 18212 + }, + { + "epoch": 4.925094645754462, + "grad_norm": 0.8570019006729126, + "learning_rate": 2.010668743332814e-06, + "loss": 1.7717, + "mean_token_accuracy": 0.5668023228645325, + "num_tokens": 231623804.0, + "step": 18213 + }, + { + "epoch": 4.925365062195781, + "grad_norm": 0.8753171563148499, + "learning_rate": 2.010592142861351e-06, + "loss": 1.7404, + "mean_token_accuracy": 0.6135057806968689, + "num_tokens": 232141927.0, + "step": 18214 + }, + { + "epoch": 4.925635478637101, + "grad_norm": 0.8239645957946777, + "learning_rate": 2.010515818211358e-06, + "loss": 1.8751, + "mean_token_accuracy": 0.5750772953033447, + "num_tokens": 232666078.0, + "step": 18215 + }, + { + "epoch": 4.9259058950784205, + "grad_norm": 0.7834244966506958, + "learning_rate": 2.0104397693851764e-06, + "loss": 1.9199, + "mean_token_accuracy": 0.5477933287620544, + "num_tokens": 233190189.0, + "step": 18216 + }, + { + "epoch": 4.926176311519741, + "grad_norm": 0.9692816734313965, + "learning_rate": 2.010363996385139e-06, + "loss": 1.8099, + "mean_token_accuracy": 0.5675657391548157, + "num_tokens": 233714336.0, + "step": 18217 + }, + { + "epoch": 4.92644672796106, + "grad_norm": 0.9023290276527405, + "learning_rate": 2.010288499213571e-06, + "loss": 1.8677, + "mean_token_accuracy": 0.5733615159988403, + "num_tokens": 234177959.0, + "step": 18218 + }, + { + "epoch": 4.92671714440238, + "grad_norm": 0.7067496180534363, + "learning_rate": 2.0102132778727896e-06, + "loss": 1.8759, + "mean_token_accuracy": 0.567142903804779, + "num_tokens": 234702162.0, + "step": 18219 + }, + { + "epoch": 4.926987560843699, + "grad_norm": 0.9565960168838501, + "learning_rate": 2.0101383323651006e-06, + "loss": 1.5818, + "mean_token_accuracy": 0.611484944820404, + "num_tokens": 235226334.0, + "step": 18220 + }, + { + "epoch": 4.927257977285019, + "grad_norm": 0.3462861478328705, + "learning_rate": 2.010063662692806e-06, + "loss": 1.0879, + "mean_token_accuracy": 0.7135391235351562, + "num_tokens": 235711128.0, + "step": 18221 + }, + { + "epoch": 4.927528393726338, + "grad_norm": 0.8759725689888, + "learning_rate": 2.009989268858196e-06, + "loss": 1.7447, + "mean_token_accuracy": 0.5774729251861572, + "num_tokens": 236235308.0, + "step": 18222 + }, + { + "epoch": 4.927798810167658, + "grad_norm": 0.9306034445762634, + "learning_rate": 2.009915150863552e-06, + "loss": 1.6076, + "mean_token_accuracy": 0.6010432243347168, + "num_tokens": 236728305.0, + "step": 18223 + }, + { + "epoch": 4.928069226608978, + "grad_norm": 0.9740625023841858, + "learning_rate": 2.00984130871115e-06, + "loss": 1.802, + "mean_token_accuracy": 0.5850160121917725, + "num_tokens": 237252402.0, + "step": 18224 + }, + { + "epoch": 4.928339643050298, + "grad_norm": 0.8124048709869385, + "learning_rate": 2.0097677424032558e-06, + "loss": 1.864, + "mean_token_accuracy": 0.5623757839202881, + "num_tokens": 237776680.0, + "step": 18225 + }, + { + "epoch": 4.928610059491617, + "grad_norm": 0.8623993396759033, + "learning_rate": 2.0096944519421246e-06, + "loss": 1.7179, + "mean_token_accuracy": 0.603705644607544, + "num_tokens": 238300915.0, + "step": 18226 + }, + { + "epoch": 4.928880475932937, + "grad_norm": 0.7745200395584106, + "learning_rate": 2.0096214373300074e-06, + "loss": 1.7848, + "mean_token_accuracy": 0.5814625024795532, + "num_tokens": 238825055.0, + "step": 18227 + }, + { + "epoch": 4.929150892374256, + "grad_norm": 0.8175833821296692, + "learning_rate": 2.0095486985691416e-06, + "loss": 1.909, + "mean_token_accuracy": 0.5589269995689392, + "num_tokens": 239349156.0, + "step": 18228 + }, + { + "epoch": 4.929421308815576, + "grad_norm": 0.8445907831192017, + "learning_rate": 2.0094762356617634e-06, + "loss": 1.8495, + "mean_token_accuracy": 0.5696963667869568, + "num_tokens": 239825104.0, + "step": 18229 + }, + { + "epoch": 4.9296917252568955, + "grad_norm": 0.8767874836921692, + "learning_rate": 2.009404048610092e-06, + "loss": 1.9001, + "mean_token_accuracy": 0.5711520910263062, + "num_tokens": 240255428.0, + "step": 18230 + }, + { + "epoch": 4.929962141698216, + "grad_norm": 0.8286240100860596, + "learning_rate": 2.009332137416344e-06, + "loss": 1.8359, + "mean_token_accuracy": 0.5713165998458862, + "num_tokens": 240740417.0, + "step": 18231 + }, + { + "epoch": 4.930232558139535, + "grad_norm": 0.8936863541603088, + "learning_rate": 2.0092605020827276e-06, + "loss": 1.8465, + "mean_token_accuracy": 0.5825810432434082, + "num_tokens": 241254302.0, + "step": 18232 + }, + { + "epoch": 4.930502974580855, + "grad_norm": 1.0116699934005737, + "learning_rate": 2.0091891426114375e-06, + "loss": 1.7677, + "mean_token_accuracy": 0.5874196887016296, + "num_tokens": 241778540.0, + "step": 18233 + }, + { + "epoch": 4.930773391022174, + "grad_norm": 0.9250150918960571, + "learning_rate": 2.0091180590046645e-06, + "loss": 1.8025, + "mean_token_accuracy": 0.5798600316047668, + "num_tokens": 242248136.0, + "step": 18234 + }, + { + "epoch": 4.931043807463494, + "grad_norm": 0.8662959933280945, + "learning_rate": 2.0090472512645916e-06, + "loss": 1.8142, + "mean_token_accuracy": 0.5871187448501587, + "num_tokens": 242772377.0, + "step": 18235 + }, + { + "epoch": 4.931314223904813, + "grad_norm": 0.8889704942703247, + "learning_rate": 2.008976719393388e-06, + "loss": 1.8694, + "mean_token_accuracy": 0.5691215991973877, + "num_tokens": 243296583.0, + "step": 18236 + }, + { + "epoch": 4.931584640346133, + "grad_norm": 0.7970700860023499, + "learning_rate": 2.0089064633932204e-06, + "loss": 1.7306, + "mean_token_accuracy": 0.5945670008659363, + "num_tokens": 243761212.0, + "step": 18237 + }, + { + "epoch": 4.931855056787453, + "grad_norm": 0.8644306659698486, + "learning_rate": 2.008836483266243e-06, + "loss": 1.8129, + "mean_token_accuracy": 0.5575532913208008, + "num_tokens": 244285393.0, + "step": 18238 + }, + { + "epoch": 4.932125473228773, + "grad_norm": 0.7442276477813721, + "learning_rate": 2.008766779014604e-06, + "loss": 1.8721, + "mean_token_accuracy": 0.5718086361885071, + "num_tokens": 244809581.0, + "step": 18239 + }, + { + "epoch": 4.932395889670092, + "grad_norm": 0.7522858381271362, + "learning_rate": 2.008697350640442e-06, + "loss": 1.797, + "mean_token_accuracy": 0.5872831344604492, + "num_tokens": 245333818.0, + "step": 18240 + }, + { + "epoch": 4.932666306111411, + "grad_norm": 0.32641494274139404, + "learning_rate": 2.008628198145886e-06, + "loss": 1.0218, + "mean_token_accuracy": 0.7217596769332886, + "num_tokens": 245858051.0, + "step": 18241 + }, + { + "epoch": 4.932936722552731, + "grad_norm": 0.9919810891151428, + "learning_rate": 2.0085593215330597e-06, + "loss": 1.8475, + "mean_token_accuracy": 0.5870686769485474, + "num_tokens": 246327653.0, + "step": 18242 + }, + { + "epoch": 4.933207138994051, + "grad_norm": 0.8500140905380249, + "learning_rate": 2.0084907208040754e-06, + "loss": 1.9305, + "mean_token_accuracy": 0.5851141214370728, + "num_tokens": 246851817.0, + "step": 18243 + }, + { + "epoch": 4.9334775554353705, + "grad_norm": 0.8267431259155273, + "learning_rate": 2.0084223959610374e-06, + "loss": 1.8499, + "mean_token_accuracy": 0.5706936717033386, + "num_tokens": 247354697.0, + "step": 18244 + }, + { + "epoch": 4.93374797187669, + "grad_norm": 0.7950462698936462, + "learning_rate": 2.0083543470060434e-06, + "loss": 1.8591, + "mean_token_accuracy": 0.5526212453842163, + "num_tokens": 247878784.0, + "step": 18245 + }, + { + "epoch": 4.93401838831801, + "grad_norm": 0.9412038922309875, + "learning_rate": 2.00828657394118e-06, + "loss": 1.8159, + "mean_token_accuracy": 0.5960444211959839, + "num_tokens": 248402947.0, + "step": 18246 + }, + { + "epoch": 4.93428880475933, + "grad_norm": 0.7941096425056458, + "learning_rate": 2.008219076768529e-06, + "loss": 1.7722, + "mean_token_accuracy": 0.5972472429275513, + "num_tokens": 248925942.0, + "step": 18247 + }, + { + "epoch": 4.934559221200649, + "grad_norm": 0.8498246669769287, + "learning_rate": 2.008151855490158e-06, + "loss": 1.8594, + "mean_token_accuracy": 0.5665258169174194, + "num_tokens": 249429995.0, + "step": 18248 + }, + { + "epoch": 4.934829637641968, + "grad_norm": 0.8230748772621155, + "learning_rate": 2.0080849101081322e-06, + "loss": 1.806, + "mean_token_accuracy": 0.5823975801467896, + "num_tokens": 249931216.0, + "step": 18249 + }, + { + "epoch": 4.935100054083288, + "grad_norm": 0.952021598815918, + "learning_rate": 2.0080182406245057e-06, + "loss": 1.8595, + "mean_token_accuracy": 0.5645686388015747, + "num_tokens": 250455412.0, + "step": 18250 + }, + { + "epoch": 4.9353704705246075, + "grad_norm": 0.8128529191017151, + "learning_rate": 2.007951847041322e-06, + "loss": 1.7495, + "mean_token_accuracy": 0.5913483500480652, + "num_tokens": 250942952.0, + "step": 18251 + }, + { + "epoch": 4.935640886965928, + "grad_norm": 0.7818528413772583, + "learning_rate": 2.00788572936062e-06, + "loss": 1.6991, + "mean_token_accuracy": 0.5963652729988098, + "num_tokens": 251431851.0, + "step": 18252 + }, + { + "epoch": 4.935911303407247, + "grad_norm": 0.7728400826454163, + "learning_rate": 2.0078198875844276e-06, + "loss": 1.806, + "mean_token_accuracy": 0.5748482942581177, + "num_tokens": 251955978.0, + "step": 18253 + }, + { + "epoch": 4.936181719848567, + "grad_norm": 0.8015575408935547, + "learning_rate": 2.007754321714766e-06, + "loss": 1.6968, + "mean_token_accuracy": 0.6221132278442383, + "num_tokens": 252469477.0, + "step": 18254 + }, + { + "epoch": 4.936452136289886, + "grad_norm": 0.7712442278862, + "learning_rate": 2.0076890317536467e-06, + "loss": 1.7904, + "mean_token_accuracy": 0.5913265943527222, + "num_tokens": 252993728.0, + "step": 18255 + }, + { + "epoch": 4.936722552731206, + "grad_norm": 0.8387512564659119, + "learning_rate": 2.0076240177030715e-06, + "loss": 1.8352, + "mean_token_accuracy": 0.5935355424880981, + "num_tokens": 253458715.0, + "step": 18256 + }, + { + "epoch": 4.936992969172525, + "grad_norm": 0.8636565208435059, + "learning_rate": 2.007559279565036e-06, + "loss": 1.8224, + "mean_token_accuracy": 0.5714256763458252, + "num_tokens": 253982980.0, + "step": 18257 + }, + { + "epoch": 4.9372633856138455, + "grad_norm": 0.8528897166252136, + "learning_rate": 2.0074948173415283e-06, + "loss": 1.9207, + "mean_token_accuracy": 0.5644949674606323, + "num_tokens": 254507227.0, + "step": 18258 + }, + { + "epoch": 4.937533802055165, + "grad_norm": 0.8524997234344482, + "learning_rate": 2.0074306310345237e-06, + "loss": 1.8194, + "mean_token_accuracy": 0.5730584263801575, + "num_tokens": 255031382.0, + "step": 18259 + }, + { + "epoch": 4.937804218496485, + "grad_norm": 0.8105649948120117, + "learning_rate": 2.0073667206459922e-06, + "loss": 1.7856, + "mean_token_accuracy": 0.5991092920303345, + "num_tokens": 255555616.0, + "step": 18260 + }, + { + "epoch": 4.938074634937804, + "grad_norm": 0.3902931213378906, + "learning_rate": 2.007303086177896e-06, + "loss": 1.1671, + "mean_token_accuracy": 0.6885641813278198, + "num_tokens": 256034397.0, + "step": 18261 + }, + { + "epoch": 4.938345051379124, + "grad_norm": 0.8726271986961365, + "learning_rate": 2.0072397276321875e-06, + "loss": 1.7638, + "mean_token_accuracy": 0.5910435318946838, + "num_tokens": 256558650.0, + "step": 18262 + }, + { + "epoch": 4.938615467820443, + "grad_norm": 0.8258180618286133, + "learning_rate": 2.0071766450108097e-06, + "loss": 1.8993, + "mean_token_accuracy": 0.5655498504638672, + "num_tokens": 257082619.0, + "step": 18263 + }, + { + "epoch": 4.938885884261763, + "grad_norm": 0.7110355496406555, + "learning_rate": 2.007113838315699e-06, + "loss": 1.8693, + "mean_token_accuracy": 0.5711386799812317, + "num_tokens": 257606850.0, + "step": 18264 + }, + { + "epoch": 4.9391563007030825, + "grad_norm": 0.7006120681762695, + "learning_rate": 2.007051307548782e-06, + "loss": 1.7877, + "mean_token_accuracy": 0.5784636735916138, + "num_tokens": 258131078.0, + "step": 18265 + }, + { + "epoch": 4.939426717144403, + "grad_norm": 0.757413387298584, + "learning_rate": 2.006989052711977e-06, + "loss": 1.7345, + "mean_token_accuracy": 0.5798842906951904, + "num_tokens": 258655323.0, + "step": 18266 + }, + { + "epoch": 4.939697133585722, + "grad_norm": 0.8038651943206787, + "learning_rate": 2.0069270738071945e-06, + "loss": 1.8685, + "mean_token_accuracy": 0.5780147314071655, + "num_tokens": 259179529.0, + "step": 18267 + }, + { + "epoch": 4.939967550027042, + "grad_norm": 1.015692114830017, + "learning_rate": 2.0068653708363368e-06, + "loss": 1.8369, + "mean_token_accuracy": 0.582972526550293, + "num_tokens": 259611006.0, + "step": 18268 + }, + { + "epoch": 4.940237966468361, + "grad_norm": 0.9626439809799194, + "learning_rate": 2.0068039438012966e-06, + "loss": 1.8212, + "mean_token_accuracy": 0.575721263885498, + "num_tokens": 260135275.0, + "step": 18269 + }, + { + "epoch": 4.940508382909681, + "grad_norm": 0.8205080628395081, + "learning_rate": 2.0067427927039576e-06, + "loss": 1.8107, + "mean_token_accuracy": 0.5771666169166565, + "num_tokens": 260643484.0, + "step": 18270 + }, + { + "epoch": 4.940778799351, + "grad_norm": 0.7957165241241455, + "learning_rate": 2.006681917546198e-06, + "loss": 1.8445, + "mean_token_accuracy": 0.5803461074829102, + "num_tokens": 261167676.0, + "step": 18271 + }, + { + "epoch": 4.9410492157923205, + "grad_norm": 0.813490092754364, + "learning_rate": 2.006621318329885e-06, + "loss": 1.8443, + "mean_token_accuracy": 0.5571301579475403, + "num_tokens": 261691894.0, + "step": 18272 + }, + { + "epoch": 4.94131963223364, + "grad_norm": 0.8889367580413818, + "learning_rate": 2.006560995056877e-06, + "loss": 1.8368, + "mean_token_accuracy": 0.5843141078948975, + "num_tokens": 262216122.0, + "step": 18273 + }, + { + "epoch": 4.94159004867496, + "grad_norm": 0.9736059308052063, + "learning_rate": 2.0065009477290264e-06, + "loss": 1.9391, + "mean_token_accuracy": 0.5571796894073486, + "num_tokens": 262692916.0, + "step": 18274 + }, + { + "epoch": 4.941860465116279, + "grad_norm": 0.7039529085159302, + "learning_rate": 2.0064411763481744e-06, + "loss": 1.8565, + "mean_token_accuracy": 0.5851403474807739, + "num_tokens": 263217199.0, + "step": 18275 + }, + { + "epoch": 4.942130881557599, + "grad_norm": 0.8847854137420654, + "learning_rate": 2.0063816809161563e-06, + "loss": 1.8953, + "mean_token_accuracy": 0.5584304332733154, + "num_tokens": 263741406.0, + "step": 18276 + }, + { + "epoch": 4.942401297998918, + "grad_norm": 0.8359160423278809, + "learning_rate": 2.006322461434796e-06, + "loss": 1.7402, + "mean_token_accuracy": 0.5964263677597046, + "num_tokens": 264240393.0, + "step": 18277 + }, + { + "epoch": 4.942671714440238, + "grad_norm": 0.8803504109382629, + "learning_rate": 2.006263517905911e-06, + "loss": 1.6903, + "mean_token_accuracy": 0.597977876663208, + "num_tokens": 264764459.0, + "step": 18278 + }, + { + "epoch": 4.9429421308815575, + "grad_norm": 0.8866788744926453, + "learning_rate": 2.0062048503313106e-06, + "loss": 1.8324, + "mean_token_accuracy": 0.5524961948394775, + "num_tokens": 265288689.0, + "step": 18279 + }, + { + "epoch": 4.943212547322878, + "grad_norm": 0.9304558634757996, + "learning_rate": 2.0061464587127942e-06, + "loss": 1.8003, + "mean_token_accuracy": 0.579750657081604, + "num_tokens": 265812792.0, + "step": 18280 + }, + { + "epoch": 4.943482963764197, + "grad_norm": 0.337007611989975, + "learning_rate": 2.0060883430521543e-06, + "loss": 1.0433, + "mean_token_accuracy": 0.7143275737762451, + "num_tokens": 266336943.0, + "step": 18281 + }, + { + "epoch": 4.943753380205516, + "grad_norm": 0.9389451146125793, + "learning_rate": 2.006030503351173e-06, + "loss": 1.955, + "mean_token_accuracy": 0.5510344505310059, + "num_tokens": 266861214.0, + "step": 18282 + }, + { + "epoch": 4.944023796646836, + "grad_norm": 0.8567320704460144, + "learning_rate": 2.0059729396116244e-06, + "loss": 1.727, + "mean_token_accuracy": 0.57271409034729, + "num_tokens": 267385284.0, + "step": 18283 + }, + { + "epoch": 4.944294213088156, + "grad_norm": 1.0048874616622925, + "learning_rate": 2.005915651835277e-06, + "loss": 1.782, + "mean_token_accuracy": 0.6161700487136841, + "num_tokens": 267811763.0, + "step": 18284 + }, + { + "epoch": 4.944564629529475, + "grad_norm": 0.8875541687011719, + "learning_rate": 2.0058586400238868e-06, + "loss": 1.8267, + "mean_token_accuracy": 0.5770688056945801, + "num_tokens": 268336028.0, + "step": 18285 + }, + { + "epoch": 4.944835045970795, + "grad_norm": 0.764124870300293, + "learning_rate": 2.005801904179204e-06, + "loss": 1.7358, + "mean_token_accuracy": 0.5816665887832642, + "num_tokens": 268860111.0, + "step": 18286 + }, + { + "epoch": 4.945105462412115, + "grad_norm": 0.9255045652389526, + "learning_rate": 2.0057454443029682e-06, + "loss": 1.8661, + "mean_token_accuracy": 0.5735963582992554, + "num_tokens": 269347660.0, + "step": 18287 + }, + { + "epoch": 4.945375878853435, + "grad_norm": 0.7763049602508545, + "learning_rate": 2.0056892603969126e-06, + "loss": 1.7446, + "mean_token_accuracy": 0.5968576073646545, + "num_tokens": 269871939.0, + "step": 18288 + }, + { + "epoch": 4.945646295294754, + "grad_norm": 0.7493183612823486, + "learning_rate": 2.005633352462762e-06, + "loss": 1.7767, + "mean_token_accuracy": 0.5787438154220581, + "num_tokens": 270396215.0, + "step": 18289 + }, + { + "epoch": 4.945916711736073, + "grad_norm": 0.89997398853302, + "learning_rate": 2.0055777205022304e-06, + "loss": 1.7549, + "mean_token_accuracy": 0.600865364074707, + "num_tokens": 270920383.0, + "step": 18290 + }, + { + "epoch": 4.946187128177393, + "grad_norm": 0.7161917090415955, + "learning_rate": 2.0055223645170246e-06, + "loss": 1.8329, + "mean_token_accuracy": 0.5715436935424805, + "num_tokens": 271444564.0, + "step": 18291 + }, + { + "epoch": 4.9464575446187125, + "grad_norm": 0.8574862480163574, + "learning_rate": 2.005467284508844e-06, + "loss": 1.7253, + "mean_token_accuracy": 0.5786612033843994, + "num_tokens": 271960768.0, + "step": 18292 + }, + { + "epoch": 4.9467279610600325, + "grad_norm": 0.7780795097351074, + "learning_rate": 2.0054124804793786e-06, + "loss": 1.8901, + "mean_token_accuracy": 0.5599294900894165, + "num_tokens": 272484880.0, + "step": 18293 + }, + { + "epoch": 4.946998377501352, + "grad_norm": 0.9419575333595276, + "learning_rate": 2.0053579524303095e-06, + "loss": 1.7927, + "mean_token_accuracy": 0.5738560557365417, + "num_tokens": 272981488.0, + "step": 18294 + }, + { + "epoch": 4.947268793942672, + "grad_norm": 0.8445490598678589, + "learning_rate": 2.0053037003633106e-06, + "loss": 1.7789, + "mean_token_accuracy": 0.5823911428451538, + "num_tokens": 273487259.0, + "step": 18295 + }, + { + "epoch": 4.947539210383991, + "grad_norm": 0.7811682820320129, + "learning_rate": 2.0052497242800446e-06, + "loss": 1.8975, + "mean_token_accuracy": 0.5671210289001465, + "num_tokens": 274011360.0, + "step": 18296 + }, + { + "epoch": 4.947809626825311, + "grad_norm": 0.8133079409599304, + "learning_rate": 2.0051960241821704e-06, + "loss": 1.7589, + "mean_token_accuracy": 0.5936417579650879, + "num_tokens": 274535489.0, + "step": 18297 + }, + { + "epoch": 4.94808004326663, + "grad_norm": 0.8100019097328186, + "learning_rate": 2.005142600071333e-06, + "loss": 1.7483, + "mean_token_accuracy": 0.6023896932601929, + "num_tokens": 275059693.0, + "step": 18298 + }, + { + "epoch": 4.94835045970795, + "grad_norm": 0.8636226654052734, + "learning_rate": 2.005089451949173e-06, + "loss": 1.7619, + "mean_token_accuracy": 0.6060841679573059, + "num_tokens": 275533987.0, + "step": 18299 + }, + { + "epoch": 4.94862087614927, + "grad_norm": 0.8217051029205322, + "learning_rate": 2.0050365798173212e-06, + "loss": 1.8826, + "mean_token_accuracy": 0.5527961850166321, + "num_tokens": 276058144.0, + "step": 18300 + }, + { + "epoch": 4.94889129259059, + "grad_norm": 1.6763086318969727, + "learning_rate": 2.004983983677399e-06, + "loss": 1.1051, + "mean_token_accuracy": 0.7095702886581421, + "num_tokens": 276546404.0, + "step": 18301 + }, + { + "epoch": 4.949161709031909, + "grad_norm": 0.7667071223258972, + "learning_rate": 2.0049316635310213e-06, + "loss": 1.8018, + "mean_token_accuracy": 0.581449031829834, + "num_tokens": 277070671.0, + "step": 18302 + }, + { + "epoch": 4.949432125473229, + "grad_norm": 0.9555622339248657, + "learning_rate": 2.004879619379793e-06, + "loss": 1.875, + "mean_token_accuracy": 0.5607794523239136, + "num_tokens": 277533628.0, + "step": 18303 + }, + { + "epoch": 4.949702541914548, + "grad_norm": 1.004356861114502, + "learning_rate": 2.004827851225311e-06, + "loss": 1.553, + "mean_token_accuracy": 0.6216748952865601, + "num_tokens": 278057825.0, + "step": 18304 + }, + { + "epoch": 4.949972958355868, + "grad_norm": 0.8441066741943359, + "learning_rate": 2.0047763590691625e-06, + "loss": 1.7813, + "mean_token_accuracy": 0.6009631752967834, + "num_tokens": 278519517.0, + "step": 18305 + }, + { + "epoch": 4.9502433747971875, + "grad_norm": 0.927914559841156, + "learning_rate": 2.0047251429129296e-06, + "loss": 1.8863, + "mean_token_accuracy": 0.5818274021148682, + "num_tokens": 278945788.0, + "step": 18306 + }, + { + "epoch": 4.9505137912385075, + "grad_norm": 0.8344043493270874, + "learning_rate": 2.0046742027581824e-06, + "loss": 1.7715, + "mean_token_accuracy": 0.5766341686248779, + "num_tokens": 279469968.0, + "step": 18307 + }, + { + "epoch": 4.950784207679827, + "grad_norm": 1.0115348100662231, + "learning_rate": 2.0046235386064844e-06, + "loss": 1.8617, + "mean_token_accuracy": 0.5845593214035034, + "num_tokens": 279930559.0, + "step": 18308 + }, + { + "epoch": 4.951054624121147, + "grad_norm": 0.941496729850769, + "learning_rate": 2.0045731504593896e-06, + "loss": 1.985, + "mean_token_accuracy": 0.5546045303344727, + "num_tokens": 280407386.0, + "step": 18309 + }, + { + "epoch": 4.951325040562466, + "grad_norm": 0.8257378935813904, + "learning_rate": 2.0045230383184445e-06, + "loss": 1.9263, + "mean_token_accuracy": 0.5583258271217346, + "num_tokens": 280931628.0, + "step": 18310 + }, + { + "epoch": 4.951595457003786, + "grad_norm": 0.8500110507011414, + "learning_rate": 2.0044732021851877e-06, + "loss": 1.8306, + "mean_token_accuracy": 0.5770924687385559, + "num_tokens": 281408822.0, + "step": 18311 + }, + { + "epoch": 4.951865873445105, + "grad_norm": 0.7822251915931702, + "learning_rate": 2.004423642061145e-06, + "loss": 1.6918, + "mean_token_accuracy": 0.5978110432624817, + "num_tokens": 281933107.0, + "step": 18312 + }, + { + "epoch": 4.952136289886425, + "grad_norm": 0.9718325138092041, + "learning_rate": 2.0043743579478407e-06, + "loss": 1.7996, + "mean_token_accuracy": 0.5959556102752686, + "num_tokens": 282457279.0, + "step": 18313 + }, + { + "epoch": 4.952406706327745, + "grad_norm": 0.8170108199119568, + "learning_rate": 2.004325349846785e-06, + "loss": 1.7453, + "mean_token_accuracy": 0.5870019197463989, + "num_tokens": 282981561.0, + "step": 18314 + }, + { + "epoch": 4.952677122769065, + "grad_norm": 0.7393548488616943, + "learning_rate": 2.0042766177594826e-06, + "loss": 1.8325, + "mean_token_accuracy": 0.5693999528884888, + "num_tokens": 283505786.0, + "step": 18315 + }, + { + "epoch": 4.952947539210384, + "grad_norm": 0.7722622752189636, + "learning_rate": 2.004228161687428e-06, + "loss": 1.7382, + "mean_token_accuracy": 0.6174198389053345, + "num_tokens": 284029858.0, + "step": 18316 + }, + { + "epoch": 4.953217955651704, + "grad_norm": 0.7092530131340027, + "learning_rate": 2.0041799816321076e-06, + "loss": 1.765, + "mean_token_accuracy": 0.5727978348731995, + "num_tokens": 284554055.0, + "step": 18317 + }, + { + "epoch": 4.953488372093023, + "grad_norm": 0.8063308000564575, + "learning_rate": 2.0041320775950015e-06, + "loss": 1.7459, + "mean_token_accuracy": 0.5961163640022278, + "num_tokens": 285056292.0, + "step": 18318 + }, + { + "epoch": 4.953758788534343, + "grad_norm": 0.7553987503051758, + "learning_rate": 2.0040844495775776e-06, + "loss": 1.7687, + "mean_token_accuracy": 0.5834898948669434, + "num_tokens": 285580554.0, + "step": 18319 + }, + { + "epoch": 4.9540292049756625, + "grad_norm": 0.8525358438491821, + "learning_rate": 2.0040370975812984e-06, + "loss": 1.7815, + "mean_token_accuracy": 0.5698703527450562, + "num_tokens": 286104784.0, + "step": 18320 + }, + { + "epoch": 4.9542996214169825, + "grad_norm": 0.377495139837265, + "learning_rate": 2.003990021607616e-06, + "loss": 1.0716, + "mean_token_accuracy": 0.7152419090270996, + "num_tokens": 286628852.0, + "step": 18321 + }, + { + "epoch": 4.954570037858302, + "grad_norm": 0.8349190950393677, + "learning_rate": 2.0039432216579756e-06, + "loss": 1.7269, + "mean_token_accuracy": 0.5906639695167542, + "num_tokens": 287153101.0, + "step": 18322 + }, + { + "epoch": 4.954840454299621, + "grad_norm": 0.8570844531059265, + "learning_rate": 2.0038966977338124e-06, + "loss": 1.7461, + "mean_token_accuracy": 0.5732377767562866, + "num_tokens": 287677318.0, + "step": 18323 + }, + { + "epoch": 4.955110870740941, + "grad_norm": 0.9113246202468872, + "learning_rate": 2.0038504498365544e-06, + "loss": 1.8658, + "mean_token_accuracy": 0.5588590502738953, + "num_tokens": 288201590.0, + "step": 18324 + }, + { + "epoch": 4.955381287182261, + "grad_norm": 0.8521092534065247, + "learning_rate": 2.003804477967622e-06, + "loss": 1.4783, + "mean_token_accuracy": 0.6335194110870361, + "num_tokens": 288725870.0, + "step": 18325 + }, + { + "epoch": 4.95565170362358, + "grad_norm": 0.7368624210357666, + "learning_rate": 2.0037587821284224e-06, + "loss": 1.7762, + "mean_token_accuracy": 0.5957278609275818, + "num_tokens": 289242623.0, + "step": 18326 + }, + { + "epoch": 4.9559221200648995, + "grad_norm": 0.7595860958099365, + "learning_rate": 2.003713362320361e-06, + "loss": 1.8351, + "mean_token_accuracy": 0.5843027830123901, + "num_tokens": 289766857.0, + "step": 18327 + }, + { + "epoch": 4.95619253650622, + "grad_norm": 0.8360291719436646, + "learning_rate": 2.003668218544829e-06, + "loss": 1.9597, + "mean_token_accuracy": 0.5533219575881958, + "num_tokens": 290256404.0, + "step": 18328 + }, + { + "epoch": 4.95646295294754, + "grad_norm": 0.8806552886962891, + "learning_rate": 2.003623350803213e-06, + "loss": 1.7627, + "mean_token_accuracy": 0.5746445655822754, + "num_tokens": 290726383.0, + "step": 18329 + }, + { + "epoch": 4.956733369388859, + "grad_norm": 0.9163816571235657, + "learning_rate": 2.0035787590968894e-06, + "loss": 1.9572, + "mean_token_accuracy": 0.5542494058609009, + "num_tokens": 291195062.0, + "step": 18330 + }, + { + "epoch": 4.957003785830178, + "grad_norm": 0.7328323125839233, + "learning_rate": 2.0035344434272257e-06, + "loss": 1.8738, + "mean_token_accuracy": 0.5585352778434753, + "num_tokens": 291719260.0, + "step": 18331 + }, + { + "epoch": 4.957274202271498, + "grad_norm": 0.8253713846206665, + "learning_rate": 2.003490403795582e-06, + "loss": 1.8588, + "mean_token_accuracy": 0.5757308006286621, + "num_tokens": 292243529.0, + "step": 18332 + }, + { + "epoch": 4.957544618712817, + "grad_norm": 0.8425630927085876, + "learning_rate": 2.0034466402033103e-06, + "loss": 1.8489, + "mean_token_accuracy": 0.5725975036621094, + "num_tokens": 292767756.0, + "step": 18333 + }, + { + "epoch": 4.9578150351541375, + "grad_norm": 0.6700744032859802, + "learning_rate": 2.0034031526517535e-06, + "loss": 1.8442, + "mean_token_accuracy": 0.5765211582183838, + "num_tokens": 293291968.0, + "step": 18334 + }, + { + "epoch": 4.958085451595457, + "grad_norm": 0.7416312098503113, + "learning_rate": 2.0033599411422435e-06, + "loss": 1.7093, + "mean_token_accuracy": 0.5882793068885803, + "num_tokens": 293816193.0, + "step": 18335 + }, + { + "epoch": 4.958355868036777, + "grad_norm": 0.8059266805648804, + "learning_rate": 2.003317005676109e-06, + "loss": 1.8206, + "mean_token_accuracy": 0.5731819868087769, + "num_tokens": 294340427.0, + "step": 18336 + }, + { + "epoch": 4.958626284478096, + "grad_norm": 0.7754524946212769, + "learning_rate": 2.003274346254666e-06, + "loss": 1.8033, + "mean_token_accuracy": 0.5944486260414124, + "num_tokens": 294864713.0, + "step": 18337 + }, + { + "epoch": 4.958896700919416, + "grad_norm": 0.8676798939704895, + "learning_rate": 2.0032319628792234e-06, + "loss": 1.8492, + "mean_token_accuracy": 0.5884758830070496, + "num_tokens": 295360323.0, + "step": 18338 + }, + { + "epoch": 4.959167117360735, + "grad_norm": 0.911012589931488, + "learning_rate": 2.0031898555510825e-06, + "loss": 1.8422, + "mean_token_accuracy": 0.567508339881897, + "num_tokens": 295884522.0, + "step": 18339 + }, + { + "epoch": 4.959437533802055, + "grad_norm": 0.8421658277511597, + "learning_rate": 2.003148024271535e-06, + "loss": 1.7333, + "mean_token_accuracy": 0.6141002178192139, + "num_tokens": 296408703.0, + "step": 18340 + }, + { + "epoch": 4.9597079502433745, + "grad_norm": 0.33409836888313293, + "learning_rate": 2.0031064690418638e-06, + "loss": 1.0743, + "mean_token_accuracy": 0.7085379362106323, + "num_tokens": 296932914.0, + "step": 18341 + }, + { + "epoch": 4.959978366684695, + "grad_norm": 1.0092719793319702, + "learning_rate": 2.0030651898633436e-06, + "loss": 1.8344, + "mean_token_accuracy": 0.5657063722610474, + "num_tokens": 297457025.0, + "step": 18342 + }, + { + "epoch": 4.960248783126014, + "grad_norm": 0.9200308322906494, + "learning_rate": 2.003024186737242e-06, + "loss": 1.7824, + "mean_token_accuracy": 0.5952522158622742, + "num_tokens": 297950959.0, + "step": 18343 + }, + { + "epoch": 4.960519199567334, + "grad_norm": 0.9551336169242859, + "learning_rate": 2.0029834596648175e-06, + "loss": 1.8559, + "mean_token_accuracy": 0.5693817734718323, + "num_tokens": 298459039.0, + "step": 18344 + }, + { + "epoch": 4.960789616008653, + "grad_norm": 0.8761352896690369, + "learning_rate": 2.002943008647318e-06, + "loss": 1.7955, + "mean_token_accuracy": 0.5828441381454468, + "num_tokens": 298983261.0, + "step": 18345 + }, + { + "epoch": 4.961060032449973, + "grad_norm": 0.8285080790519714, + "learning_rate": 2.002902833685986e-06, + "loss": 1.8306, + "mean_token_accuracy": 0.5695840120315552, + "num_tokens": 299507397.0, + "step": 18346 + }, + { + "epoch": 4.961330448891292, + "grad_norm": 0.7976149916648865, + "learning_rate": 2.0028629347820544e-06, + "loss": 1.8263, + "mean_token_accuracy": 0.5878543853759766, + "num_tokens": 300031673.0, + "step": 18347 + }, + { + "epoch": 4.9616008653326125, + "grad_norm": 0.986901581287384, + "learning_rate": 2.0028233119367457e-06, + "loss": 1.8342, + "mean_token_accuracy": 0.5605082511901855, + "num_tokens": 300518226.0, + "step": 18348 + }, + { + "epoch": 4.961871281773932, + "grad_norm": 0.9100663065910339, + "learning_rate": 2.002783965151278e-06, + "loss": 1.8049, + "mean_token_accuracy": 0.5781903266906738, + "num_tokens": 301042306.0, + "step": 18349 + }, + { + "epoch": 4.962141698215252, + "grad_norm": 0.7240347862243652, + "learning_rate": 2.002744894426856e-06, + "loss": 1.726, + "mean_token_accuracy": 0.5954951643943787, + "num_tokens": 301566564.0, + "step": 18350 + }, + { + "epoch": 4.962412114656571, + "grad_norm": 0.7875854969024658, + "learning_rate": 2.0027060997646803e-06, + "loss": 1.8073, + "mean_token_accuracy": 0.5572686195373535, + "num_tokens": 302090793.0, + "step": 18351 + }, + { + "epoch": 4.962682531097891, + "grad_norm": 0.7812188863754272, + "learning_rate": 2.002667581165941e-06, + "loss": 1.7511, + "mean_token_accuracy": 0.5950479507446289, + "num_tokens": 302614867.0, + "step": 18352 + }, + { + "epoch": 4.96295294753921, + "grad_norm": 0.9528694748878479, + "learning_rate": 2.00262933863182e-06, + "loss": 1.6477, + "mean_token_accuracy": 0.6211587190628052, + "num_tokens": 303117759.0, + "step": 18353 + }, + { + "epoch": 4.96322336398053, + "grad_norm": 0.8035416007041931, + "learning_rate": 2.00259137216349e-06, + "loss": 1.8761, + "mean_token_accuracy": 0.5652918219566345, + "num_tokens": 303641994.0, + "step": 18354 + }, + { + "epoch": 4.9634937804218495, + "grad_norm": 0.8263120055198669, + "learning_rate": 2.002553681762116e-06, + "loss": 1.7565, + "mean_token_accuracy": 0.5848255753517151, + "num_tokens": 304166222.0, + "step": 18355 + }, + { + "epoch": 4.96376419686317, + "grad_norm": 0.82542884349823, + "learning_rate": 2.0025162674288555e-06, + "loss": 1.6563, + "mean_token_accuracy": 0.5998701453208923, + "num_tokens": 304690445.0, + "step": 18356 + }, + { + "epoch": 4.964034613304489, + "grad_norm": 0.8532304167747498, + "learning_rate": 2.0024791291648554e-06, + "loss": 1.6324, + "mean_token_accuracy": 0.6197637319564819, + "num_tokens": 305179718.0, + "step": 18357 + }, + { + "epoch": 4.964305029745809, + "grad_norm": 0.8407093286514282, + "learning_rate": 2.002442266971256e-06, + "loss": 1.7976, + "mean_token_accuracy": 0.5871216654777527, + "num_tokens": 305703783.0, + "step": 18358 + }, + { + "epoch": 4.964575446187128, + "grad_norm": 0.7586722373962402, + "learning_rate": 2.002405680849188e-06, + "loss": 1.6943, + "mean_token_accuracy": 0.6018483638763428, + "num_tokens": 306227973.0, + "step": 18359 + }, + { + "epoch": 4.964845862628448, + "grad_norm": 0.9364888072013855, + "learning_rate": 2.0023693707997737e-06, + "loss": 1.7183, + "mean_token_accuracy": 0.604736864566803, + "num_tokens": 306752252.0, + "step": 18360 + }, + { + "epoch": 4.965116279069767, + "grad_norm": 0.3630383312702179, + "learning_rate": 2.0023333368241276e-06, + "loss": 1.0804, + "mean_token_accuracy": 0.7063599824905396, + "num_tokens": 307276502.0, + "step": 18361 + }, + { + "epoch": 4.9653866955110875, + "grad_norm": 0.9890848994255066, + "learning_rate": 2.002297578923354e-06, + "loss": 1.8854, + "mean_token_accuracy": 0.5668492317199707, + "num_tokens": 307800777.0, + "step": 18362 + }, + { + "epoch": 4.965657111952407, + "grad_norm": 0.9879478812217712, + "learning_rate": 2.002262097098553e-06, + "loss": 1.741, + "mean_token_accuracy": 0.5911914110183716, + "num_tokens": 308308039.0, + "step": 18363 + }, + { + "epoch": 4.965927528393726, + "grad_norm": 0.8564903140068054, + "learning_rate": 2.0022268913508096e-06, + "loss": 1.8427, + "mean_token_accuracy": 0.5645694732666016, + "num_tokens": 308832179.0, + "step": 18364 + }, + { + "epoch": 4.966197944835046, + "grad_norm": 0.8390790224075317, + "learning_rate": 2.0021919616812072e-06, + "loss": 1.8795, + "mean_token_accuracy": 0.5730495452880859, + "num_tokens": 309356442.0, + "step": 18365 + }, + { + "epoch": 4.966468361276366, + "grad_norm": 0.9891909956932068, + "learning_rate": 2.0021573080908157e-06, + "loss": 1.8457, + "mean_token_accuracy": 0.5933536291122437, + "num_tokens": 309778269.0, + "step": 18366 + }, + { + "epoch": 4.966738777717685, + "grad_norm": 0.9052384495735168, + "learning_rate": 2.0021229305806993e-06, + "loss": 1.8026, + "mean_token_accuracy": 0.5876568555831909, + "num_tokens": 310242319.0, + "step": 18367 + }, + { + "epoch": 4.967009194159004, + "grad_norm": 0.8727076649665833, + "learning_rate": 2.0020888291519127e-06, + "loss": 1.7848, + "mean_token_accuracy": 0.5855199098587036, + "num_tokens": 310766579.0, + "step": 18368 + }, + { + "epoch": 4.9672796106003245, + "grad_norm": 0.968283474445343, + "learning_rate": 2.002055003805502e-06, + "loss": 1.7972, + "mean_token_accuracy": 0.5702043771743774, + "num_tokens": 311290854.0, + "step": 18369 + }, + { + "epoch": 4.967550027041645, + "grad_norm": 0.8982271552085876, + "learning_rate": 2.002021454542504e-06, + "loss": 1.8042, + "mean_token_accuracy": 0.5923849940299988, + "num_tokens": 311792195.0, + "step": 18370 + }, + { + "epoch": 4.967820443482964, + "grad_norm": 0.898444414138794, + "learning_rate": 2.0019881813639496e-06, + "loss": 1.8945, + "mean_token_accuracy": 0.5616974234580994, + "num_tokens": 312267793.0, + "step": 18371 + }, + { + "epoch": 4.968090859924283, + "grad_norm": 0.8174431324005127, + "learning_rate": 2.001955184270859e-06, + "loss": 1.7661, + "mean_token_accuracy": 0.5840471386909485, + "num_tokens": 312780384.0, + "step": 18372 + }, + { + "epoch": 4.968361276365603, + "grad_norm": 0.7589060068130493, + "learning_rate": 2.001922463264245e-06, + "loss": 1.792, + "mean_token_accuracy": 0.5788033604621887, + "num_tokens": 313304433.0, + "step": 18373 + }, + { + "epoch": 4.968631692806922, + "grad_norm": 0.9790955185890198, + "learning_rate": 2.001890018345112e-06, + "loss": 1.7747, + "mean_token_accuracy": 0.5737177729606628, + "num_tokens": 313828638.0, + "step": 18374 + }, + { + "epoch": 4.968902109248242, + "grad_norm": 1.011460542678833, + "learning_rate": 2.001857849514454e-06, + "loss": 1.6659, + "mean_token_accuracy": 0.6166062951087952, + "num_tokens": 314352858.0, + "step": 18375 + }, + { + "epoch": 4.969172525689562, + "grad_norm": 1.013391137123108, + "learning_rate": 2.00182595677326e-06, + "loss": 1.8541, + "mean_token_accuracy": 0.5589640140533447, + "num_tokens": 314877133.0, + "step": 18376 + }, + { + "epoch": 4.969442942130882, + "grad_norm": 0.8228922486305237, + "learning_rate": 2.0017943401225075e-06, + "loss": 1.7346, + "mean_token_accuracy": 0.5984978675842285, + "num_tokens": 315401387.0, + "step": 18377 + }, + { + "epoch": 4.969713358572201, + "grad_norm": 0.7994210124015808, + "learning_rate": 2.0017629995631657e-06, + "loss": 1.8292, + "mean_token_accuracy": 0.5724659562110901, + "num_tokens": 315892761.0, + "step": 18378 + }, + { + "epoch": 4.969983775013521, + "grad_norm": 0.7355107069015503, + "learning_rate": 2.0017319350961977e-06, + "loss": 1.6351, + "mean_token_accuracy": 0.6192596554756165, + "num_tokens": 316374699.0, + "step": 18379 + }, + { + "epoch": 4.97025419145484, + "grad_norm": 0.8165284991264343, + "learning_rate": 2.0017011467225563e-06, + "loss": 1.9096, + "mean_token_accuracy": 0.56577467918396, + "num_tokens": 316898858.0, + "step": 18380 + }, + { + "epoch": 4.97052460789616, + "grad_norm": 0.34580737352371216, + "learning_rate": 2.0016706344431852e-06, + "loss": 1.1733, + "mean_token_accuracy": 0.6820721626281738, + "num_tokens": 317423040.0, + "step": 18381 + }, + { + "epoch": 4.970795024337479, + "grad_norm": 0.9197473526000977, + "learning_rate": 2.0016403982590217e-06, + "loss": 1.8196, + "mean_token_accuracy": 0.5644863843917847, + "num_tokens": 317947051.0, + "step": 18382 + }, + { + "epoch": 4.9710654407787995, + "grad_norm": 0.9437544941902161, + "learning_rate": 2.0016104381709943e-06, + "loss": 1.7648, + "mean_token_accuracy": 0.5819193124771118, + "num_tokens": 318442092.0, + "step": 18383 + }, + { + "epoch": 4.971335857220119, + "grad_norm": 0.9942412972450256, + "learning_rate": 2.0015807541800202e-06, + "loss": 1.9545, + "mean_token_accuracy": 0.5387973785400391, + "num_tokens": 318910165.0, + "step": 18384 + }, + { + "epoch": 4.971606273661439, + "grad_norm": 0.76014244556427, + "learning_rate": 2.001551346287011e-06, + "loss": 1.7774, + "mean_token_accuracy": 0.5899007320404053, + "num_tokens": 319434355.0, + "step": 18385 + }, + { + "epoch": 4.971876690102758, + "grad_norm": 0.8383069634437561, + "learning_rate": 2.0015222144928704e-06, + "loss": 1.8857, + "mean_token_accuracy": 0.5546612739562988, + "num_tokens": 319958519.0, + "step": 18386 + }, + { + "epoch": 4.972147106544078, + "grad_norm": 0.8584714531898499, + "learning_rate": 2.0014933587984896e-06, + "loss": 1.6818, + "mean_token_accuracy": 0.594254732131958, + "num_tokens": 320482700.0, + "step": 18387 + }, + { + "epoch": 4.972417522985397, + "grad_norm": 0.8695445656776428, + "learning_rate": 2.001464779204756e-06, + "loss": 1.7752, + "mean_token_accuracy": 0.5813106894493103, + "num_tokens": 321006845.0, + "step": 18388 + }, + { + "epoch": 4.972687939426717, + "grad_norm": 1.0851292610168457, + "learning_rate": 2.001436475712546e-06, + "loss": 1.7952, + "mean_token_accuracy": 0.5922647714614868, + "num_tokens": 321476864.0, + "step": 18389 + }, + { + "epoch": 4.972958355868037, + "grad_norm": 1.0275009870529175, + "learning_rate": 2.0014084483227278e-06, + "loss": 1.7566, + "mean_token_accuracy": 0.5971297025680542, + "num_tokens": 322001073.0, + "step": 18390 + }, + { + "epoch": 4.973228772309357, + "grad_norm": 0.8073035478591919, + "learning_rate": 2.001380697036162e-06, + "loss": 1.8091, + "mean_token_accuracy": 0.5903282761573792, + "num_tokens": 322525318.0, + "step": 18391 + }, + { + "epoch": 4.973499188750676, + "grad_norm": 0.8582891225814819, + "learning_rate": 2.0013532218537e-06, + "loss": 1.9006, + "mean_token_accuracy": 0.5505489110946655, + "num_tokens": 323049596.0, + "step": 18392 + }, + { + "epoch": 4.973769605191996, + "grad_norm": 0.9066325426101685, + "learning_rate": 2.0013260227761837e-06, + "loss": 1.8646, + "mean_token_accuracy": 0.5462079048156738, + "num_tokens": 323573830.0, + "step": 18393 + }, + { + "epoch": 4.974040021633315, + "grad_norm": 0.9115956425666809, + "learning_rate": 2.001299099804449e-06, + "loss": 1.8401, + "mean_token_accuracy": 0.5701371431350708, + "num_tokens": 324098055.0, + "step": 18394 + }, + { + "epoch": 4.974310438074635, + "grad_norm": 0.7421880960464478, + "learning_rate": 2.00127245293932e-06, + "loss": 1.7863, + "mean_token_accuracy": 0.5768823623657227, + "num_tokens": 324622251.0, + "step": 18395 + }, + { + "epoch": 4.974580854515954, + "grad_norm": 0.8462137579917908, + "learning_rate": 2.0012460821816167e-06, + "loss": 1.7967, + "mean_token_accuracy": 0.5891181230545044, + "num_tokens": 325121545.0, + "step": 18396 + }, + { + "epoch": 4.9748512709572745, + "grad_norm": 0.9111301302909851, + "learning_rate": 2.0012199875321474e-06, + "loss": 1.7501, + "mean_token_accuracy": 0.5909631252288818, + "num_tokens": 325645741.0, + "step": 18397 + }, + { + "epoch": 4.975121687398594, + "grad_norm": 0.9158846735954285, + "learning_rate": 2.001194168991712e-06, + "loss": 1.9091, + "mean_token_accuracy": 0.5504060387611389, + "num_tokens": 326169984.0, + "step": 18398 + }, + { + "epoch": 4.975392103839914, + "grad_norm": 0.9058983325958252, + "learning_rate": 2.0011686265611033e-06, + "loss": 1.8136, + "mean_token_accuracy": 0.5661685466766357, + "num_tokens": 326694138.0, + "step": 18399 + }, + { + "epoch": 4.975662520281233, + "grad_norm": 0.8529409170150757, + "learning_rate": 2.001143360241105e-06, + "loss": 1.7992, + "mean_token_accuracy": 0.573999285697937, + "num_tokens": 327218396.0, + "step": 18400 + }, + { + "epoch": 4.975932936722553, + "grad_norm": 0.3679799437522888, + "learning_rate": 2.0011183700324927e-06, + "loss": 1.1086, + "mean_token_accuracy": 0.7067388296127319, + "num_tokens": 327688715.0, + "step": 18401 + }, + { + "epoch": 4.976203353163872, + "grad_norm": 1.001232624053955, + "learning_rate": 2.0010936559360334e-06, + "loss": 1.7186, + "mean_token_accuracy": 0.5765022039413452, + "num_tokens": 328212978.0, + "step": 18402 + }, + { + "epoch": 4.976473769605192, + "grad_norm": 1.019139051437378, + "learning_rate": 2.0010692179524833e-06, + "loss": 1.9676, + "mean_token_accuracy": 0.5427075028419495, + "num_tokens": 328737048.0, + "step": 18403 + }, + { + "epoch": 4.976744186046512, + "grad_norm": 0.8020778894424438, + "learning_rate": 2.0010450560825946e-06, + "loss": 1.8189, + "mean_token_accuracy": 0.5726993083953857, + "num_tokens": 329261316.0, + "step": 18404 + }, + { + "epoch": 4.977014602487831, + "grad_norm": 0.793729841709137, + "learning_rate": 2.0010211703271084e-06, + "loss": 1.8452, + "mean_token_accuracy": 0.5814208984375, + "num_tokens": 329760468.0, + "step": 18405 + }, + { + "epoch": 4.977285018929151, + "grad_norm": 0.8821200132369995, + "learning_rate": 2.0009975606867553e-06, + "loss": 1.7874, + "mean_token_accuracy": 0.5973687171936035, + "num_tokens": 330284639.0, + "step": 18406 + }, + { + "epoch": 4.977555435370471, + "grad_norm": 0.8987224102020264, + "learning_rate": 2.000974227162263e-06, + "loss": 1.7347, + "mean_token_accuracy": 0.5871440172195435, + "num_tokens": 330808833.0, + "step": 18407 + }, + { + "epoch": 4.97782585181179, + "grad_norm": 0.7249507308006287, + "learning_rate": 2.000951169754345e-06, + "loss": 1.8658, + "mean_token_accuracy": 0.5785822868347168, + "num_tokens": 331333109.0, + "step": 18408 + }, + { + "epoch": 4.978096268253109, + "grad_norm": 0.7572739124298096, + "learning_rate": 2.000928388463709e-06, + "loss": 1.86, + "mean_token_accuracy": 0.5647509694099426, + "num_tokens": 331857371.0, + "step": 18409 + }, + { + "epoch": 4.9783666846944294, + "grad_norm": 0.8482463359832764, + "learning_rate": 2.0009058832910554e-06, + "loss": 1.8913, + "mean_token_accuracy": 0.5554089546203613, + "num_tokens": 332378740.0, + "step": 18410 + }, + { + "epoch": 4.9786371011357495, + "grad_norm": 0.762136697769165, + "learning_rate": 2.000883654237074e-06, + "loss": 1.8574, + "mean_token_accuracy": 0.5655848979949951, + "num_tokens": 332903020.0, + "step": 18411 + }, + { + "epoch": 4.978907517577069, + "grad_norm": 0.8699022531509399, + "learning_rate": 2.0008617013024467e-06, + "loss": 1.7617, + "mean_token_accuracy": 0.5913696885108948, + "num_tokens": 333427292.0, + "step": 18412 + }, + { + "epoch": 4.979177934018388, + "grad_norm": 0.9258713126182556, + "learning_rate": 2.0008400244878464e-06, + "loss": 1.7339, + "mean_token_accuracy": 0.6058884263038635, + "num_tokens": 333951331.0, + "step": 18413 + }, + { + "epoch": 4.979448350459708, + "grad_norm": 0.9842916131019592, + "learning_rate": 2.0008186237939395e-06, + "loss": 1.8714, + "mean_token_accuracy": 0.5888753533363342, + "num_tokens": 334412305.0, + "step": 18414 + }, + { + "epoch": 4.979718766901027, + "grad_norm": 0.7763984799385071, + "learning_rate": 2.0007974992213825e-06, + "loss": 1.7563, + "mean_token_accuracy": 0.5901970863342285, + "num_tokens": 334936502.0, + "step": 18415 + }, + { + "epoch": 4.979989183342347, + "grad_norm": 0.8330879211425781, + "learning_rate": 2.000776650770822e-06, + "loss": 1.8666, + "mean_token_accuracy": 0.5670347213745117, + "num_tokens": 335419222.0, + "step": 18416 + }, + { + "epoch": 4.9802595997836665, + "grad_norm": 0.8632883429527283, + "learning_rate": 2.0007560784429005e-06, + "loss": 1.8264, + "mean_token_accuracy": 0.5709579586982727, + "num_tokens": 335943489.0, + "step": 18417 + }, + { + "epoch": 4.980530016224987, + "grad_norm": 0.7465636730194092, + "learning_rate": 2.000735782238246e-06, + "loss": 1.7628, + "mean_token_accuracy": 0.5929538011550903, + "num_tokens": 336467640.0, + "step": 18418 + }, + { + "epoch": 4.980800432666306, + "grad_norm": 0.7983992695808411, + "learning_rate": 2.000715762157484e-06, + "loss": 1.732, + "mean_token_accuracy": 0.5904808640480042, + "num_tokens": 336991904.0, + "step": 18419 + }, + { + "epoch": 4.981070849107626, + "grad_norm": 0.8662100434303284, + "learning_rate": 2.0006960182012274e-06, + "loss": 1.7015, + "mean_token_accuracy": 0.6011518239974976, + "num_tokens": 337480295.0, + "step": 18420 + }, + { + "epoch": 4.981341265548945, + "grad_norm": 0.385216623544693, + "learning_rate": 2.000676550370082e-06, + "loss": 1.0154, + "mean_token_accuracy": 0.7409121990203857, + "num_tokens": 338004449.0, + "step": 18421 + }, + { + "epoch": 4.981611681990265, + "grad_norm": 0.8190751075744629, + "learning_rate": 2.000657358664645e-06, + "loss": 1.9149, + "mean_token_accuracy": 0.5595393180847168, + "num_tokens": 338528700.0, + "step": 18422 + }, + { + "epoch": 4.981882098431584, + "grad_norm": 0.8665584921836853, + "learning_rate": 2.000638443085506e-06, + "loss": 1.7549, + "mean_token_accuracy": 0.5907066464424133, + "num_tokens": 339004429.0, + "step": 18423 + }, + { + "epoch": 4.9821525148729044, + "grad_norm": 0.8520934581756592, + "learning_rate": 2.0006198036332443e-06, + "loss": 1.7922, + "mean_token_accuracy": 0.5815640687942505, + "num_tokens": 339528635.0, + "step": 18424 + }, + { + "epoch": 4.982422931314224, + "grad_norm": 0.7609646916389465, + "learning_rate": 2.0006014403084327e-06, + "loss": 1.6723, + "mean_token_accuracy": 0.6288962364196777, + "num_tokens": 340052895.0, + "step": 18425 + }, + { + "epoch": 4.982693347755544, + "grad_norm": 0.8538106083869934, + "learning_rate": 2.000583353111635e-06, + "loss": 1.7272, + "mean_token_accuracy": 0.6068123579025269, + "num_tokens": 340545750.0, + "step": 18426 + }, + { + "epoch": 4.982963764196863, + "grad_norm": 0.7792177200317383, + "learning_rate": 2.000565542043405e-06, + "loss": 1.8783, + "mean_token_accuracy": 0.5786378979682922, + "num_tokens": 340996151.0, + "step": 18427 + }, + { + "epoch": 4.983234180638183, + "grad_norm": 0.7099344730377197, + "learning_rate": 2.0005480071042904e-06, + "loss": 1.7265, + "mean_token_accuracy": 0.5937710404396057, + "num_tokens": 341520418.0, + "step": 18428 + }, + { + "epoch": 4.983504597079502, + "grad_norm": 0.7260000705718994, + "learning_rate": 2.000530748294828e-06, + "loss": 1.6949, + "mean_token_accuracy": 0.6023504137992859, + "num_tokens": 342044697.0, + "step": 18429 + }, + { + "epoch": 4.983775013520822, + "grad_norm": 0.8320748209953308, + "learning_rate": 2.0005137656155477e-06, + "loss": 1.8557, + "mean_token_accuracy": 0.5730913281440735, + "num_tokens": 342568913.0, + "step": 18430 + }, + { + "epoch": 4.9840454299621415, + "grad_norm": 0.7898685932159424, + "learning_rate": 2.0004970590669713e-06, + "loss": 1.8123, + "mean_token_accuracy": 0.5934020280838013, + "num_tokens": 343012022.0, + "step": 18431 + }, + { + "epoch": 4.984315846403462, + "grad_norm": 0.8091280460357666, + "learning_rate": 2.0004806286496118e-06, + "loss": 1.8948, + "mean_token_accuracy": 0.5488324761390686, + "num_tokens": 343536269.0, + "step": 18432 + }, + { + "epoch": 4.984586262844781, + "grad_norm": 0.8217262625694275, + "learning_rate": 2.0004644743639716e-06, + "loss": 1.8416, + "mean_token_accuracy": 0.5847750902175903, + "num_tokens": 344020561.0, + "step": 18433 + }, + { + "epoch": 4.984856679286101, + "grad_norm": 0.8347830176353455, + "learning_rate": 2.0004485962105466e-06, + "loss": 1.8122, + "mean_token_accuracy": 0.5797526836395264, + "num_tokens": 344544768.0, + "step": 18434 + }, + { + "epoch": 4.98512709572742, + "grad_norm": 0.883288562297821, + "learning_rate": 2.000432994189825e-06, + "loss": 1.7968, + "mean_token_accuracy": 0.5773335099220276, + "num_tokens": 345068976.0, + "step": 18435 + }, + { + "epoch": 4.98539751216874, + "grad_norm": 0.7796065211296082, + "learning_rate": 2.0004176683022862e-06, + "loss": 1.8718, + "mean_token_accuracy": 0.5726267099380493, + "num_tokens": 345593173.0, + "step": 18436 + }, + { + "epoch": 4.985667928610059, + "grad_norm": 0.9339675307273865, + "learning_rate": 2.000402618548398e-06, + "loss": 1.6712, + "mean_token_accuracy": 0.6246685981750488, + "num_tokens": 346052499.0, + "step": 18437 + }, + { + "epoch": 4.9859383450513795, + "grad_norm": 0.76832515001297, + "learning_rate": 2.0003878449286246e-06, + "loss": 1.807, + "mean_token_accuracy": 0.5719771385192871, + "num_tokens": 346543477.0, + "step": 18438 + }, + { + "epoch": 4.986208761492699, + "grad_norm": 1.0296629667282104, + "learning_rate": 2.0003733474434183e-06, + "loss": 1.7721, + "mean_token_accuracy": 0.5775745511054993, + "num_tokens": 347015604.0, + "step": 18439 + }, + { + "epoch": 4.986479177934019, + "grad_norm": 0.8930104374885559, + "learning_rate": 2.0003591260932224e-06, + "loss": 1.6584, + "mean_token_accuracy": 0.6021103858947754, + "num_tokens": 347532117.0, + "step": 18440 + }, + { + "epoch": 4.986749594375338, + "grad_norm": 0.32219091057777405, + "learning_rate": 2.0003451808784766e-06, + "loss": 1.1174, + "mean_token_accuracy": 0.697942852973938, + "num_tokens": 348056394.0, + "step": 18441 + }, + { + "epoch": 4.987020010816658, + "grad_norm": 0.859491229057312, + "learning_rate": 2.0003315117996057e-06, + "loss": 1.7942, + "mean_token_accuracy": 0.5978081226348877, + "num_tokens": 348580513.0, + "step": 18442 + }, + { + "epoch": 4.987290427257977, + "grad_norm": 0.7851154208183289, + "learning_rate": 2.000318118857031e-06, + "loss": 1.7111, + "mean_token_accuracy": 0.5834558010101318, + "num_tokens": 349073610.0, + "step": 18443 + }, + { + "epoch": 4.987560843699297, + "grad_norm": 0.8492110371589661, + "learning_rate": 2.0003050020511623e-06, + "loss": 1.7005, + "mean_token_accuracy": 0.590195894241333, + "num_tokens": 349575393.0, + "step": 18444 + }, + { + "epoch": 4.9878312601406165, + "grad_norm": 0.9902327060699463, + "learning_rate": 2.0002921613824023e-06, + "loss": 1.8164, + "mean_token_accuracy": 0.5721412897109985, + "num_tokens": 350099495.0, + "step": 18445 + }, + { + "epoch": 4.988101676581936, + "grad_norm": 0.9560039639472961, + "learning_rate": 2.0002795968511455e-06, + "loss": 1.7924, + "mean_token_accuracy": 0.5806638598442078, + "num_tokens": 350623707.0, + "step": 18446 + }, + { + "epoch": 4.988372093023256, + "grad_norm": 0.8248210549354553, + "learning_rate": 2.000267308457777e-06, + "loss": 1.7308, + "mean_token_accuracy": 0.5786388516426086, + "num_tokens": 351147741.0, + "step": 18447 + }, + { + "epoch": 4.988642509464576, + "grad_norm": 1.069968342781067, + "learning_rate": 2.0002552962026744e-06, + "loss": 1.9815, + "mean_token_accuracy": 0.5553053021430969, + "num_tokens": 351615386.0, + "step": 18448 + }, + { + "epoch": 4.988912925905895, + "grad_norm": 0.808010995388031, + "learning_rate": 2.000243560086206e-06, + "loss": 1.89, + "mean_token_accuracy": 0.556024968624115, + "num_tokens": 352139640.0, + "step": 18449 + }, + { + "epoch": 4.989183342347214, + "grad_norm": 0.8239672780036926, + "learning_rate": 2.0002321001087306e-06, + "loss": 1.8476, + "mean_token_accuracy": 0.5691277980804443, + "num_tokens": 352663857.0, + "step": 18450 + }, + { + "epoch": 4.989453758788534, + "grad_norm": 0.8050810098648071, + "learning_rate": 2.000220916270602e-06, + "loss": 1.896, + "mean_token_accuracy": 0.5599582195281982, + "num_tokens": 353152381.0, + "step": 18451 + }, + { + "epoch": 4.9897241752298545, + "grad_norm": 0.8197283744812012, + "learning_rate": 2.0002100085721617e-06, + "loss": 1.7378, + "mean_token_accuracy": 0.5723843574523926, + "num_tokens": 353676480.0, + "step": 18452 + }, + { + "epoch": 4.989994591671174, + "grad_norm": 0.8819917440414429, + "learning_rate": 2.000199377013745e-06, + "loss": 1.8307, + "mean_token_accuracy": 0.5743515491485596, + "num_tokens": 354200736.0, + "step": 18453 + }, + { + "epoch": 4.990265008112493, + "grad_norm": 0.8154612183570862, + "learning_rate": 2.000189021595679e-06, + "loss": 1.7879, + "mean_token_accuracy": 0.5902925133705139, + "num_tokens": 354724935.0, + "step": 18454 + }, + { + "epoch": 4.990535424553813, + "grad_norm": 0.8301698565483093, + "learning_rate": 2.0001789423182794e-06, + "loss": 1.8215, + "mean_token_accuracy": 0.5873166918754578, + "num_tokens": 355202490.0, + "step": 18455 + }, + { + "epoch": 4.990805840995132, + "grad_norm": 0.9565763473510742, + "learning_rate": 2.0001691391818574e-06, + "loss": 1.7598, + "mean_token_accuracy": 0.6185064911842346, + "num_tokens": 355661672.0, + "step": 18456 + }, + { + "epoch": 4.991076257436452, + "grad_norm": 0.8015350699424744, + "learning_rate": 2.0001596121867127e-06, + "loss": 1.7852, + "mean_token_accuracy": 0.5911074876785278, + "num_tokens": 356155558.0, + "step": 18457 + }, + { + "epoch": 4.991346673877771, + "grad_norm": 0.8136795163154602, + "learning_rate": 2.000150361333138e-06, + "loss": 1.8617, + "mean_token_accuracy": 0.5775332450866699, + "num_tokens": 356679805.0, + "step": 18458 + }, + { + "epoch": 4.9916170903190915, + "grad_norm": 0.9371965527534485, + "learning_rate": 2.000141386621417e-06, + "loss": 1.7865, + "mean_token_accuracy": 0.5803830623626709, + "num_tokens": 357204008.0, + "step": 18459 + }, + { + "epoch": 4.991887506760411, + "grad_norm": 0.8196001648902893, + "learning_rate": 2.000132688051825e-06, + "loss": 1.7379, + "mean_token_accuracy": 0.5983835458755493, + "num_tokens": 357703278.0, + "step": 18460 + }, + { + "epoch": 4.992157923201731, + "grad_norm": 0.33268362283706665, + "learning_rate": 2.0001242656246287e-06, + "loss": 1.0046, + "mean_token_accuracy": 0.7257228493690491, + "num_tokens": 358214638.0, + "step": 18461 + }, + { + "epoch": 4.99242833964305, + "grad_norm": 0.9469935297966003, + "learning_rate": 2.0001161193400874e-06, + "loss": 1.8715, + "mean_token_accuracy": 0.5679113864898682, + "num_tokens": 358698127.0, + "step": 18462 + }, + { + "epoch": 4.99269875608437, + "grad_norm": 0.8701324462890625, + "learning_rate": 2.0001082491984504e-06, + "loss": 1.841, + "mean_token_accuracy": 0.583458662033081, + "num_tokens": 359222296.0, + "step": 18463 + }, + { + "epoch": 4.992969172525689, + "grad_norm": 0.8283768892288208, + "learning_rate": 2.000100655199959e-06, + "loss": 1.8452, + "mean_token_accuracy": 0.5886076092720032, + "num_tokens": 359746472.0, + "step": 18464 + }, + { + "epoch": 4.993239588967009, + "grad_norm": 0.8630677461624146, + "learning_rate": 2.0000933373448474e-06, + "loss": 1.7896, + "mean_token_accuracy": 0.5913106203079224, + "num_tokens": 360270747.0, + "step": 18465 + }, + { + "epoch": 4.993510005408329, + "grad_norm": 0.7116967439651489, + "learning_rate": 2.0000862956333385e-06, + "loss": 1.8556, + "mean_token_accuracy": 0.5607490539550781, + "num_tokens": 360795007.0, + "step": 18466 + }, + { + "epoch": 4.993780421849649, + "grad_norm": 0.7957606911659241, + "learning_rate": 2.0000795300656482e-06, + "loss": 1.9056, + "mean_token_accuracy": 0.5611439347267151, + "num_tokens": 361318994.0, + "step": 18467 + }, + { + "epoch": 4.994050838290968, + "grad_norm": 0.8128014802932739, + "learning_rate": 2.000073040641986e-06, + "loss": 1.8122, + "mean_token_accuracy": 0.5753891468048096, + "num_tokens": 361843175.0, + "step": 18468 + }, + { + "epoch": 4.994321254732288, + "grad_norm": 0.7490288615226746, + "learning_rate": 2.00006682736255e-06, + "loss": 1.726, + "mean_token_accuracy": 0.5959319472312927, + "num_tokens": 362329822.0, + "step": 18469 + }, + { + "epoch": 4.994591671173607, + "grad_norm": 0.7576892971992493, + "learning_rate": 2.0000608902275303e-06, + "loss": 1.8438, + "mean_token_accuracy": 0.5658890008926392, + "num_tokens": 362803103.0, + "step": 18470 + }, + { + "epoch": 4.994862087614927, + "grad_norm": 0.8643245100975037, + "learning_rate": 2.00005522923711e-06, + "loss": 1.8866, + "mean_token_accuracy": 0.5825836658477783, + "num_tokens": 363317008.0, + "step": 18471 + }, + { + "epoch": 4.995132504056246, + "grad_norm": 0.868388295173645, + "learning_rate": 2.0000498443914625e-06, + "loss": 1.8509, + "mean_token_accuracy": 0.5739116668701172, + "num_tokens": 363841196.0, + "step": 18472 + }, + { + "epoch": 4.9954029204975665, + "grad_norm": 0.8486046195030212, + "learning_rate": 2.000044735690752e-06, + "loss": 1.8066, + "mean_token_accuracy": 0.5678800344467163, + "num_tokens": 364365371.0, + "step": 18473 + }, + { + "epoch": 4.995673336938886, + "grad_norm": 0.8329992294311523, + "learning_rate": 2.0000399031351377e-06, + "loss": 1.7813, + "mean_token_accuracy": 0.5623676776885986, + "num_tokens": 364889588.0, + "step": 18474 + }, + { + "epoch": 4.995943753380206, + "grad_norm": 0.7275218367576599, + "learning_rate": 2.0000353467247654e-06, + "loss": 1.7523, + "mean_token_accuracy": 0.5951923131942749, + "num_tokens": 365413843.0, + "step": 18475 + }, + { + "epoch": 4.996214169821525, + "grad_norm": 0.7341027855873108, + "learning_rate": 2.000031066459776e-06, + "loss": 1.8773, + "mean_token_accuracy": 0.5600748658180237, + "num_tokens": 365938055.0, + "step": 18476 + }, + { + "epoch": 4.996484586262845, + "grad_norm": 0.8151838779449463, + "learning_rate": 2.0000270623403e-06, + "loss": 1.761, + "mean_token_accuracy": 0.5556716918945312, + "num_tokens": 366462168.0, + "step": 18477 + }, + { + "epoch": 4.996755002704164, + "grad_norm": 0.7759853601455688, + "learning_rate": 2.0000233343664612e-06, + "loss": 1.8399, + "mean_token_accuracy": 0.575865626335144, + "num_tokens": 366986327.0, + "step": 18478 + }, + { + "epoch": 4.997025419145484, + "grad_norm": 0.8543022871017456, + "learning_rate": 2.000019882538374e-06, + "loss": 1.7944, + "mean_token_accuracy": 0.5826181173324585, + "num_tokens": 367510597.0, + "step": 18479 + }, + { + "epoch": 4.997295835586804, + "grad_norm": 0.7575192451477051, + "learning_rate": 2.000016706856144e-06, + "loss": 1.8074, + "mean_token_accuracy": 0.5893454551696777, + "num_tokens": 368034864.0, + "step": 18480 + }, + { + "epoch": 4.997566252028124, + "grad_norm": 0.374727338552475, + "learning_rate": 2.000013807319869e-06, + "loss": 1.1222, + "mean_token_accuracy": 0.703759491443634, + "num_tokens": 368542023.0, + "step": 18481 + }, + { + "epoch": 4.997836668469443, + "grad_norm": 0.8081258535385132, + "learning_rate": 2.0000111839296376e-06, + "loss": 1.7324, + "mean_token_accuracy": 0.5743421912193298, + "num_tokens": 369066286.0, + "step": 18482 + }, + { + "epoch": 4.998107084910763, + "grad_norm": 0.8412291407585144, + "learning_rate": 2.00000883668553e-06, + "loss": 1.6778, + "mean_token_accuracy": 0.6030992269515991, + "num_tokens": 369590537.0, + "step": 18483 + }, + { + "epoch": 4.998377501352082, + "grad_norm": 0.7636772990226746, + "learning_rate": 2.0000067655876183e-06, + "loss": 1.7168, + "mean_token_accuracy": 0.5836695432662964, + "num_tokens": 370114798.0, + "step": 18484 + }, + { + "epoch": 4.998647917793402, + "grad_norm": 0.7530269026756287, + "learning_rate": 2.0000049706359663e-06, + "loss": 1.8933, + "mean_token_accuracy": 0.5660099387168884, + "num_tokens": 370639080.0, + "step": 18485 + }, + { + "epoch": 4.998918334234721, + "grad_norm": 0.7676099538803101, + "learning_rate": 2.0000034518306296e-06, + "loss": 1.7926, + "mean_token_accuracy": 0.5802400708198547, + "num_tokens": 371163355.0, + "step": 18486 + }, + { + "epoch": 4.999188750676041, + "grad_norm": 0.8395646810531616, + "learning_rate": 2.000002209171654e-06, + "loss": 1.8995, + "mean_token_accuracy": 0.5548835396766663, + "num_tokens": 371687591.0, + "step": 18487 + }, + { + "epoch": 4.999459167117361, + "grad_norm": 0.8710453510284424, + "learning_rate": 2.000001242659078e-06, + "loss": 1.9166, + "mean_token_accuracy": 0.5887430310249329, + "num_tokens": 372076295.0, + "step": 18488 + }, + { + "epoch": 4.999729583558681, + "grad_norm": 0.8219760060310364, + "learning_rate": 2.0000005522929305e-06, + "loss": 1.6771, + "mean_token_accuracy": 0.6052401661872864, + "num_tokens": 372600427.0, + "step": 18489 + }, + { + "epoch": 5.0, + "grad_norm": 0.9643396735191345, + "learning_rate": 2.0000001380732342e-06, + "loss": 1.8835, + "mean_token_accuracy": 0.577031135559082, + "num_tokens": 373109098.0, + "step": 18490 + }, + { + "epoch": 5.0, + "step": 18490, + "total_flos": 5.519232355835209e+19, + "train_loss": 0.0698729722998863, + "train_runtime": 8612.5861, + "train_samples_per_second": 17.173, + "train_steps_per_second": 2.147 + } + ], + "logging_steps": 1, + "max_steps": 18490, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 185, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 5.519232355835209e+19, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/training_args.bin b/training_args.bin new file mode 100644 index 0000000..42347d8 --- /dev/null +++ b/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c060d886990465743f771b10c5648d127c5c669d045fe040777e28e77c1066bd +size 11960